From f115c3f85467d5d9619119d1dbeb9c03c3d73864 Mon Sep 17 00:00:00 2001 From: Kihiro Bando <39278362+bandokihiro@users.noreply.github.com> Date: Tue, 13 May 2025 15:55:29 -0400 Subject: [PATCH] Release v4.0.0 (#2294) --- CHANGELOG.md | 488 +- CMakeLists.txt | 23 +- CONTRIBUTORS.md | 51 +- EULA.txt | 188 + LICENSE.txt | 7 + README.md | 184 +- customConfigs.cmake | 12 +- .../56_hopper_ptr_array_batched_gemm.cu | 2 +- .../57_hopper_grouped_gemm.cu | 73 +- examples/58_ada_fp8_gemm/ada_fp8_gemm.cu | 4 - .../65_distributed_gemm.cu | 14 +- ...specialized_gemm_with_blockwise_scaling.cu | 8 - ...specialized_gemm_with_groupwise_scaling.cu | 8 - .../75_blackwell_grouped_gemm.cu | 16 +- .../75_blackwell_grouped_gemm_block_scaled.cu | 17 +- ..._fmha_fwd_mainloop_tma_warpspecialized.hpp | 4 +- ...m100_fmha_gen_mainloop_warpspecialized.hpp | 4 +- ...9d_blackwell_geforce_nvfp4_grouped_gemm.cu | 2 +- .../82_blackwell_distributed_gemm.cu | 14 +- examples/cute/tutorial/hopper/wgmma_sm90.cu | 8 +- .../cute/tutorial/hopper/wgmma_tma_sm90.cu | 4 +- .../python/CuTeDSL/ampere/elementwise_add.py | 392 + .../CuTeDSL/ampere/elementwise_apply.py | 395 + .../CuTeDSL/ampere/flash_attention_v2.py | 1353 ++++ examples/python/CuTeDSL/ampere/sgemm.py | 780 ++ .../python/CuTeDSL/ampere/tensorop_gemm.py | 968 +++ .../python/CuTeDSL/blackwell/dense_gemm.py | 1922 +++++ .../blackwell/dense_gemm_persistent.py | 2144 ++++++ examples/python/CuTeDSL/blackwell/fmha.py | 2984 ++++++++ .../python/CuTeDSL/blackwell/grouped_gemm.py | 2287 ++++++ examples/python/CuTeDSL/notebooks/README.md | 31 + .../CuTeDSL/notebooks/cuda_graphs.ipynb | 648 ++ .../notebooks/cute_layout_algebra.ipynb | 1001 +++ .../python/CuTeDSL/notebooks/data_types.ipynb | 310 + .../CuTeDSL/notebooks/elementwise_add.ipynb | 838 +++ .../CuTeDSL/notebooks/hello_world.ipynb | 173 + .../notebooks/images/cuda_graphs_image.png | Bin 0 -> 8586 bytes examples/python/CuTeDSL/notebooks/print.ipynb | 425 ++ .../python/CuTeDSL/notebooks/tensor.ipynb | 390 
+ .../python/CuTeDSL/notebooks/tensorssa.ipynb | 558 ++ .../{ => deprecated}/00_basic_gemm.ipynb | 0 .../python/{ => deprecated}/01_epilogue.ipynb | 0 .../02_pytorch_extension_grouped_gemm.ipynb | 0 .../{ => deprecated}/03_basic_conv2d.ipynb | 0 .../04_epilogue_visitor.ipynb | 0 examples/python/{ => deprecated}/README.md | 0 include/cute/arch/config.hpp | 1 + include/cute/arch/mma_sm100_umma.hpp | 32 + include/cute/atom/copy_traits_sm100_tma.hpp | 4 +- include/cute/atom/copy_traits_sm90_tma.hpp | 8 +- include/cute/tensor_impl.hpp | 6 +- include/cutlass/arch/config.h | 8 +- .../collective/builders/sm120_builder.inl | 40 + .../collective/builders/sm90_builder.inl | 49 +- .../collective/builders/sm90_common.inl | 26 +- .../epilogue/collective/default_epilogue.hpp | 4 +- ...100_epilogue_array_tma_warpspecialized.hpp | 20 +- .../collective/sm100_epilogue_nosmem.hpp | 4 +- .../sm100_epilogue_tma_warpspecialized.hpp | 8 +- ...m90_epilogue_array_tma_warpspecialized.hpp | 18 +- .../sm90_epilogue_tma_warpspecialized.hpp | 4 +- include/cutlass/epilogue/thread/activation.h | 34 +- include/cutlass/functional.h | 2 +- .../builders/sm100_blockwise_umma_builder.inl | 16 +- .../collective/builders/sm90_gmma_builder.inl | 17 +- ..._blockscaled_mma_array_warpspecialized.hpp | 7 +- ...rray_warpspecialized_blockwise_scaling.hpp | 14 +- ..._mma_warpspecialized_blockwise_scaling.hpp | 14 +- .../sm100_mma_warpspecialized_mixed_input.hpp | 824 --- .../sm120_blockscaled_mma_array_tma.hpp | 21 +- ...ma_gmma_rs_warpspecialized_mixed_input.hpp | 4 + ..._mma_array_tma_gmma_ss_warpspecialized.hpp | 7 +- ..._array_tma_gmma_ss_warpspecialized_fp8.hpp | 4 + ..._warpspecialized_fp8_blockwise_scaling.hpp | 277 +- ..._warpspecialized_fp8_blockwise_scaling.hpp | 17 +- include/cutlass/gemm/dispatch_policy.hpp | 2 + .../sm100_gemm_array_tma_warpspecialized.hpp | 8 +- ...ay_tma_warpspecialized_input_transform.hpp | 7 +- ...rray_tma_warpspecialized_mma_transform.hpp | 3 + 
.../kernel/sm100_gemm_tma_warpspecialized.hpp | 6 +- ...mm_tma_warpspecialized_input_transform.hpp | 7 +- ...gemm_tma_warpspecialized_mma_transform.hpp | 6 +- .../sm100_sparse_gemm_tma_warpspecialized.hpp | 5 +- ...specialized_cooperative_asymmetric_dma.hpp | 7 +- ..._array_tma_warpspecialized_cooperative.hpp | 34 +- ...emm_array_tma_warpspecialized_pingpong.hpp | 19 +- ...0_gemm_tma_warpspecialized_cooperative.hpp | 2 + include/cutlass/pipeline/sm100_pipeline.hpp | 6 +- include/cutlass/version.h | 6 +- media/docs/cpp/blackwell.rst | 10 + .../cpp/blackwell_cluster_launch_control.md | 8 +- media/docs/cpp/blackwell_functionality.md | 2 +- .../building_in_windows_with_visual_studio.md | 12 +- .../building_with_clang_as_host_compiler.md | 8 +- media/docs/cpp/build/index.rst | 10 + media/docs/cpp/code_organization.md | 2 +- media/docs/cpp/cute/02_layout_algebra.md | 44 +- media/docs/cpp/cute/03_tensor.md | 8 +- media/docs/cpp/cute/0t_mma_atom.md | 36 +- media/docs/cpp/cute/0x_gemm_tutorial.md | 12 +- media/docs/cpp/cute/0z_tma_tensors.md | 20 +- media/docs/cpp/cute/index.rst | 2 +- media/docs/cpp/cutlass_2x.rst | 12 + media/docs/cpp/cutlass_3x.rst | 11 + .../cpp/cutlass_3x_backwards_compatibility.md | 2 +- media/docs/cpp/cutlass_3x_design.md | 2 +- media/docs/cpp/functionality.md | 2 +- media/docs/cpp/fundamental_types.md | 2 +- media/docs/cpp/gemm_api.md | 18 +- media/docs/cpp/gemm_api_3x.md | 8 +- media/docs/cpp/getting_started.rst | 16 + media/docs/cpp/grouped_scheduler.md | 2 +- media/docs/cpp/ide_setup.md | 2 +- media/docs/cpp/layout.md | 6 +- media/docs/cpp/overview.md | 619 -- media/docs/cpp/profiler.md | 8 +- media/docs/cpp/programming_guidelines.md | 2 +- media/docs/cpp/quickstart.md | 6 +- media/docs/cpp/terminology.md | 5 +- media/docs/cpp/tile_iterator_concept.md | 2 +- media/docs/cpp/utilities.md | 2 +- media/docs/pythonDSL/cute_dsl.rst | 18 + media/docs/pythonDSL/cute_dsl_api.rst | 12 + media/docs/pythonDSL/cute_dsl_api/cute.rst | 11 + 
.../docs/pythonDSL/cute_dsl_api/cute_arch.rst | 24 + .../pythonDSL/cute_dsl_api/cute_nvgpu.rst | 18 + .../cute_dsl_api/cute_nvgpu_common.rst | 9 + .../cute_dsl_api/cute_nvgpu_cpasync.rst | 10 + .../cute_dsl_api/cute_nvgpu_tcgen05.rst | 10 + .../cute_dsl_api/cute_nvgpu_warp.rst | 10 + .../cute_dsl_api/cute_nvgpu_warpgroup.rst | 10 + media/docs/pythonDSL/cute_dsl_api/utils.rst | 9 + .../cute_dsl_general/autotuning_gemm.rst | 154 + .../pythonDSL/cute_dsl_general/debugging.rst | 133 + .../cute_dsl_general/dsl_code_generation.rst | 90 + .../cute_dsl_general/dsl_control_flow.rst | 140 + .../cute_dsl_general/dsl_dynamic_layout.rst | 198 + .../cute_dsl_general/dsl_introduction.rst | 128 + .../dsl_jit_arg_generation.rst | 196 + .../cute_dsl_general/dsl_jit_caching.rst | 152 + .../pythonDSL/cute_dsl_general/dsl_modes.png | Bin 0 -> 1134058 bytes .../framework_integration.rst | 412 ++ .../pythonDSL/cute_dsl_general/notebooks.rst | 16 + media/docs/pythonDSL/faqs.rst | 137 + media/docs/pythonDSL/functionality.rst | 34 + media/docs/pythonDSL/limitations.rst | 279 + media/docs/pythonDSL/overview.rst | 108 + media/docs/pythonDSL/quick_start.rst | 31 + python/CuTeDSL/EULA.txt | 188 + python/CuTeDSL/base_dsl/__init__.py | 17 + .../base_dsl/_mlir_helpers/__init__.py | 27 + .../CuTeDSL/base_dsl/_mlir_helpers/arith.py | 691 ++ python/CuTeDSL/base_dsl/_mlir_helpers/gpu.py | 64 + .../base_dsl/_mlir_helpers/lru_cache_ir.py | 76 + python/CuTeDSL/base_dsl/_mlir_helpers/op.py | 34 + python/CuTeDSL/base_dsl/ast_helpers.py | 584 ++ python/CuTeDSL/base_dsl/ast_preprocessor.py | 1459 ++++ python/CuTeDSL/base_dsl/cache_helpers.py | 154 + python/CuTeDSL/base_dsl/common.py | 268 + python/CuTeDSL/base_dsl/compiler.py | 221 + python/CuTeDSL/base_dsl/dsl.py | 1637 +++++ python/CuTeDSL/base_dsl/env_manager.py | 303 + python/CuTeDSL/base_dsl/jit_executor.py | 301 + python/CuTeDSL/base_dsl/runtime/__init__.py | 29 + python/CuTeDSL/base_dsl/runtime/cuda.py | 470 ++ 
.../CuTeDSL/base_dsl/runtime/device_tensor.py | 121 + .../CuTeDSL/base_dsl/runtime/dlpack_types.py | 76 + .../base_dsl/runtime/jit_arg_adapters.py | 188 + .../base_dsl/runtime/tensor_descriptor.py | 201 + python/CuTeDSL/base_dsl/typing.py | 1897 +++++ python/CuTeDSL/base_dsl/utils/__init__.py | 19 + python/CuTeDSL/base_dsl/utils/logger.py | 80 + python/CuTeDSL/base_dsl/utils/stacktrace.py | 165 + python/CuTeDSL/base_dsl/utils/timer.py | 56 + python/CuTeDSL/cutlass/__init__.py | 57 + python/CuTeDSL/cutlass/cute/__init__.py | 310 + python/CuTeDSL/cutlass/cute/arch/__init__.py | 98 + python/CuTeDSL/cutlass/cute/arch/elect.py | 75 + python/CuTeDSL/cutlass/cute/arch/mbar.py | 208 + .../cutlass/cute/arch/nvvm_wrappers.py | 547 ++ python/CuTeDSL/cutlass/cute/arch/smem.py | 96 + python/CuTeDSL/cutlass/cute/arch/tmem.py | 142 + python/CuTeDSL/cutlass/cute/core.py | 6417 +++++++++++++++++ python/CuTeDSL/cutlass/cute/math.py | 354 + python/CuTeDSL/cutlass/cute/nvgpu/__init__.py | 26 + python/CuTeDSL/cutlass/cute/nvgpu/common.py | 143 + .../cutlass/cute/nvgpu/cpasync/__init__.py | 38 + .../cutlass/cute/nvgpu/cpasync/copy.py | 366 + .../cutlass/cute/nvgpu/cpasync/helpers.py | 327 + python/CuTeDSL/cutlass/cute/nvgpu/helpers.py | 159 + .../cutlass/cute/nvgpu/tcgen05/__init__.py | 57 + .../cutlass/cute/nvgpu/tcgen05/copy.py | 465 ++ .../cutlass/cute/nvgpu/tcgen05/helpers.py | 301 + .../CuTeDSL/cutlass/cute/nvgpu/tcgen05/mma.py | 603 ++ .../cutlass/cute/nvgpu/warp/__init__.py | 25 + .../CuTeDSL/cutlass/cute/nvgpu/warp/copy.py | 189 + python/CuTeDSL/cutlass/cute/nvgpu/warp/mma.py | 78 + .../cutlass/cute/nvgpu/warpgroup/__init__.py | 29 + .../cutlass/cute/nvgpu/warpgroup/helpers.py | 109 + .../cutlass/cute/nvgpu/warpgroup/mma.py | 380 + python/CuTeDSL/cutlass/cute/runtime.py | 515 ++ python/CuTeDSL/cutlass/cute/testing.py | 285 + python/CuTeDSL/cutlass/cute/typing.py | 193 + python/CuTeDSL/cutlass/impl_utils.py | 32 + python/CuTeDSL/cutlass/torch.py | 169 + 
python/CuTeDSL/cutlass/utils/README.md | 9 + python/CuTeDSL/cutlass/utils/__init__.py | 78 + .../CuTeDSL/cutlass/utils/ampere_helpers.py | 26 + .../cutlass/utils/blackwell_helpers.py | 910 +++ .../grouped_gemm_tile_scheduler_helper.py | 466 ++ python/CuTeDSL/cutlass/utils/hardware_info.py | 174 + .../CuTeDSL/cutlass/utils/hopper_helpers.py | 195 + python/CuTeDSL/cutlass/utils/layout.py | 68 + python/CuTeDSL/cutlass/utils/pipeline.py | 984 +++ .../CuTeDSL/cutlass/utils/smem_allocator.py | 217 + .../utils/static_persistent_tile_scheduler.py | 384 + .../cutlass/utils/tensormap_manager.py | 140 + python/CuTeDSL/cutlass_dsl/__init__.py | 37 + python/CuTeDSL/cutlass_dsl/cutlass.py | 1322 ++++ .../cutlass_dsl/cutlass_ast_decorators.py | 515 ++ python/CuTeDSL/requirements.txt | 3 + python/cutlass/__init__.py | 2 +- python/cutlass/op/conv.py | 1 + python/cutlass/utils/lazy_import.py | 32 +- python/cutlass/utils/profiler.py | 1 + python/cutlass_library/emit_kernel_listing.py | 122 +- python/cutlass_library/gemm_operation.py | 51 +- python/cutlass_library/generator.py | 181 +- python/cutlass_library/manifest.py | 6 +- python/cutlass_library/sm90_utils.py | 16 +- python/setup_library.py | 2 +- python/setup_pycute.py | 2 +- test/unit/gemm/device/CMakeLists.txt | 25 +- ...0_bssp_gemm_mxf4_mxf4_f32_f16_f16_o_tnn.cu | 96 +- ...0_bssp_gemm_mxf4_mxf4_f32_f16_f16_o_tnt.cu | 96 +- ...0_bssp_gemm_mxf4_mxf4_f32_f16_f16_q_tnt.cu | 72 +- ..._bssp_gemm_mxf4_mxf4_f32_f16_mxf8_q_tnt.cu | 72 +- ...0_bssp_gemm_mxf4_mxf4_f32_f32_f32_o_tnn.cu | 96 +- ...0_bssp_gemm_mxf4_mxf4_f32_f32_f32_o_tnt.cu | 96 +- ...0_bssp_gemm_mxf4_mxf4_f32_f32_f32_q_tnt.cu | 72 +- ...0_bssp_gemm_mxf4_mxf6_f32_f16_f16_q_tnt.cu | 72 +- ...0_bssp_gemm_mxf4_mxf8_f32_f16_f16_q_tnt.cu | 72 +- ...0_bssp_gemm_mxf6_mxf4_f32_f16_f16_q_tnt.cu | 72 +- ...0_bssp_gemm_mxf6_mxf6_f32_f16_f16_q_tnt.cu | 72 +- ...0_bssp_gemm_mxf6_mxf8_f32_f16_f16_q_tnt.cu | 72 +- ...0_bssp_gemm_mxf8_mxf4_f32_f16_f16_q_tnt.cu | 72 +- 
..._bssp_gemm_mxf8_mxf4_f32_f16_mxf8_q_tnt.cu | 72 +- ...0_bssp_gemm_mxf8_mxf4_f32_f32_f32_q_tnt.cu | 72 +- ...0_bssp_gemm_mxf8_mxf6_f32_f16_f16_q_tnt.cu | 72 +- ...0_bssp_gemm_mxf8_mxf8_f32_f16_f16_q_tnn.cu | 36 +- ...0_bssp_gemm_mxf8_mxf8_f32_f16_f16_q_tnt.cu | 36 +- ...p_gemm_mxf8_mxf8_f32_f16_mxf8_q_nnn_sfd.cu | 36 +- ...p_gemm_mxf8_mxf8_f32_f16_mxf8_q_nnt_sfd.cu | 36 +- ...p_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnn_sfd.cu | 36 +- ...p_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnt_sfd.cu | 36 +- ...mm_mxf8_mxf8_f32_f16_mxf8_q_tnt_streamk.cu | 36 +- ...p_gemm_mxf8_mxf8_f32_f16_mxf8_q_ttn_sfd.cu | 36 +- ...p_gemm_mxf8_mxf8_f32_f16_mxf8_q_ttt_sfd.cu | 36 +- ...0_bssp_gemm_mxf8_mxf8_f32_f32_f32_q_tnn.cu | 36 +- ...0_bssp_gemm_mxf8_mxf8_f32_f32_f32_q_tnt.cu | 36 +- ..._bssp_gemm_mxf8_mxf8_f32_void_f16_q_tnn.cu | 36 +- ..._bssp_gemm_mxf8_mxf8_f32_void_f16_q_tnt.cu | 36 +- ..._bssp_gemm_mxf8_mxf8_f32_void_f32_q_tnn.cu | 36 +- ..._bssp_gemm_mxf8_mxf8_f32_void_f32_q_tnt.cu | 36 +- ..._gemm_mxf8_mxf8_f32_void_mxf8_q_tnn_sfd.cu | 36 +- ..._gemm_mxf8_mxf8_f32_void_mxf8_q_tnt_sfd.cu | 36 +- ...0_bssp_gemm_nvf4_nvf4_f32_f16_f16_o_tnn.cu | 48 +- ...0_bssp_gemm_nvf4_nvf4_f32_f16_f16_o_tnt.cu | 48 +- ...p_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnn_sfd.cu | 48 +- ...p_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnt_sfd.cu | 48 +- ...mm_nvf4_nvf4_f32_f16_nvf4_o_tnt_streamk.cu | 36 +- ...0_bssp_gemm_nvf4_nvf4_f32_f32_f32_o_tnn.cu | 48 +- ...0_bssp_gemm_nvf4_nvf4_f32_f32_f32_o_tnt.cu | 48 +- ..._bssp_gemm_nvf4_nvf4_f32_void_f16_o_tnn.cu | 48 +- ..._bssp_gemm_nvf4_nvf4_f32_void_f16_o_tnt.cu | 48 +- ..._bssp_gemm_nvf4_nvf4_f32_void_f32_o_tnn.cu | 48 +- ..._bssp_gemm_nvf4_nvf4_f32_void_f32_o_tnt.cu | 48 +- ..._gemm_nvf4_nvf4_f32_void_nvf4_o_tnn_sfd.cu | 48 +- ..._gemm_nvf4_nvf4_f32_void_nvf4_o_tnt_sfd.cu | 48 +- ...0_gemm_f8_f8_f8_tensor_op_f32_blockwise.cu | 320 + .../sm100_sp_gemm_f4_f4_f32_f16_f16_tn.cu | 48 +- .../sm100_sp_gemm_f4_f4_f32_f16_f8_tn.cu | 48 +- .../sm100_sp_gemm_f4_f4_f32_f32_f32_tn.cu | 48 +- 
.../sm100_sp_gemm_f4_f6_f32_f16_f16_tn.cu | 48 +- .../sm100_sp_gemm_f4_f8_f32_f16_f16_tn.cu | 48 +- .../sm100_sp_gemm_f6_f4_f32_f16_f16_tn.cu | 48 +- .../sm100_sp_gemm_f6_f6_f32_f16_f16_tn.cu | 48 +- .../sm100_sp_gemm_f6_f6_f32_f16_f8_tn.cu | 48 +- .../sm100_sp_gemm_f6_f6_f32_f32_f32_tn.cu | 48 +- .../sm100_sp_gemm_f6_f8_f32_f16_f16_tn.cu | 48 +- .../sm100_sp_gemm_f8_f4_f32_f16_f16_tn.cu | 48 +- .../sm100_sp_gemm_f8_f6_f32_f16_f16_tn.cu | 48 +- .../sm100_sp_gemm_f16_f16_f32_f16_f16_hmma.cu | 132 +- ...sm100_sp_gemm_f32_f32_f32_f32_f32_tfmma.cu | 132 +- .../sm100_sp_gemm_f8_f8_f32_f16_f16_qmma.cu | 132 +- .../sm100_sp_gemm_f8_f8_f32_f16_f8_qmma.cu | 132 +- .../sm100_sp_gemm_f8_f8_f32_f32_f32_qmma.cu | 132 +- .../sm100_sp_gemm_s8_s8_s32_s8_s8_imma.cu | 132 +- .../sm90_gemm_f8_f8_f32_tensor_op_fp32.cu | 72 + 299 files changed, 51495 insertions(+), 4413 deletions(-) create mode 100644 EULA.txt create mode 100644 examples/python/CuTeDSL/ampere/elementwise_add.py create mode 100644 examples/python/CuTeDSL/ampere/elementwise_apply.py create mode 100644 examples/python/CuTeDSL/ampere/flash_attention_v2.py create mode 100644 examples/python/CuTeDSL/ampere/sgemm.py create mode 100644 examples/python/CuTeDSL/ampere/tensorop_gemm.py create mode 100644 examples/python/CuTeDSL/blackwell/dense_gemm.py create mode 100644 examples/python/CuTeDSL/blackwell/dense_gemm_persistent.py create mode 100644 examples/python/CuTeDSL/blackwell/fmha.py create mode 100644 examples/python/CuTeDSL/blackwell/grouped_gemm.py create mode 100644 examples/python/CuTeDSL/notebooks/README.md create mode 100644 examples/python/CuTeDSL/notebooks/cuda_graphs.ipynb create mode 100644 examples/python/CuTeDSL/notebooks/cute_layout_algebra.ipynb create mode 100644 examples/python/CuTeDSL/notebooks/data_types.ipynb create mode 100644 examples/python/CuTeDSL/notebooks/elementwise_add.ipynb create mode 100644 examples/python/CuTeDSL/notebooks/hello_world.ipynb create mode 100644 
examples/python/CuTeDSL/notebooks/images/cuda_graphs_image.png create mode 100644 examples/python/CuTeDSL/notebooks/print.ipynb create mode 100644 examples/python/CuTeDSL/notebooks/tensor.ipynb create mode 100644 examples/python/CuTeDSL/notebooks/tensorssa.ipynb rename examples/python/{ => deprecated}/00_basic_gemm.ipynb (100%) rename examples/python/{ => deprecated}/01_epilogue.ipynb (100%) rename examples/python/{ => deprecated}/02_pytorch_extension_grouped_gemm.ipynb (100%) rename examples/python/{ => deprecated}/03_basic_conv2d.ipynb (100%) rename examples/python/{ => deprecated}/04_epilogue_visitor.ipynb (100%) rename examples/python/{ => deprecated}/README.md (100%) delete mode 100644 include/cutlass/gemm/collective/sm100_mma_warpspecialized_mixed_input.hpp create mode 100644 media/docs/cpp/blackwell.rst create mode 100644 media/docs/cpp/build/index.rst create mode 100644 media/docs/cpp/cutlass_2x.rst create mode 100644 media/docs/cpp/cutlass_3x.rst create mode 100644 media/docs/cpp/getting_started.rst delete mode 100644 media/docs/cpp/overview.md create mode 100644 media/docs/pythonDSL/cute_dsl.rst create mode 100644 media/docs/pythonDSL/cute_dsl_api.rst create mode 100644 media/docs/pythonDSL/cute_dsl_api/cute.rst create mode 100644 media/docs/pythonDSL/cute_dsl_api/cute_arch.rst create mode 100644 media/docs/pythonDSL/cute_dsl_api/cute_nvgpu.rst create mode 100644 media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_common.rst create mode 100644 media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_cpasync.rst create mode 100644 media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_tcgen05.rst create mode 100644 media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_warp.rst create mode 100644 media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_warpgroup.rst create mode 100644 media/docs/pythonDSL/cute_dsl_api/utils.rst create mode 100644 media/docs/pythonDSL/cute_dsl_general/autotuning_gemm.rst create mode 100644 media/docs/pythonDSL/cute_dsl_general/debugging.rst create mode 100644 
media/docs/pythonDSL/cute_dsl_general/dsl_code_generation.rst create mode 100644 media/docs/pythonDSL/cute_dsl_general/dsl_control_flow.rst create mode 100644 media/docs/pythonDSL/cute_dsl_general/dsl_dynamic_layout.rst create mode 100644 media/docs/pythonDSL/cute_dsl_general/dsl_introduction.rst create mode 100644 media/docs/pythonDSL/cute_dsl_general/dsl_jit_arg_generation.rst create mode 100644 media/docs/pythonDSL/cute_dsl_general/dsl_jit_caching.rst create mode 100644 media/docs/pythonDSL/cute_dsl_general/dsl_modes.png create mode 100644 media/docs/pythonDSL/cute_dsl_general/framework_integration.rst create mode 100644 media/docs/pythonDSL/cute_dsl_general/notebooks.rst create mode 100644 media/docs/pythonDSL/faqs.rst create mode 100644 media/docs/pythonDSL/functionality.rst create mode 100644 media/docs/pythonDSL/limitations.rst create mode 100644 media/docs/pythonDSL/overview.rst create mode 100644 media/docs/pythonDSL/quick_start.rst create mode 100644 python/CuTeDSL/EULA.txt create mode 100644 python/CuTeDSL/base_dsl/__init__.py create mode 100644 python/CuTeDSL/base_dsl/_mlir_helpers/__init__.py create mode 100644 python/CuTeDSL/base_dsl/_mlir_helpers/arith.py create mode 100644 python/CuTeDSL/base_dsl/_mlir_helpers/gpu.py create mode 100644 python/CuTeDSL/base_dsl/_mlir_helpers/lru_cache_ir.py create mode 100644 python/CuTeDSL/base_dsl/_mlir_helpers/op.py create mode 100644 python/CuTeDSL/base_dsl/ast_helpers.py create mode 100644 python/CuTeDSL/base_dsl/ast_preprocessor.py create mode 100644 python/CuTeDSL/base_dsl/cache_helpers.py create mode 100644 python/CuTeDSL/base_dsl/common.py create mode 100644 python/CuTeDSL/base_dsl/compiler.py create mode 100644 python/CuTeDSL/base_dsl/dsl.py create mode 100644 python/CuTeDSL/base_dsl/env_manager.py create mode 100644 python/CuTeDSL/base_dsl/jit_executor.py create mode 100644 python/CuTeDSL/base_dsl/runtime/__init__.py create mode 100644 python/CuTeDSL/base_dsl/runtime/cuda.py create mode 100644 
python/CuTeDSL/base_dsl/runtime/device_tensor.py create mode 100644 python/CuTeDSL/base_dsl/runtime/dlpack_types.py create mode 100644 python/CuTeDSL/base_dsl/runtime/jit_arg_adapters.py create mode 100644 python/CuTeDSL/base_dsl/runtime/tensor_descriptor.py create mode 100644 python/CuTeDSL/base_dsl/typing.py create mode 100644 python/CuTeDSL/base_dsl/utils/__init__.py create mode 100644 python/CuTeDSL/base_dsl/utils/logger.py create mode 100644 python/CuTeDSL/base_dsl/utils/stacktrace.py create mode 100644 python/CuTeDSL/base_dsl/utils/timer.py create mode 100644 python/CuTeDSL/cutlass/__init__.py create mode 100644 python/CuTeDSL/cutlass/cute/__init__.py create mode 100644 python/CuTeDSL/cutlass/cute/arch/__init__.py create mode 100644 python/CuTeDSL/cutlass/cute/arch/elect.py create mode 100644 python/CuTeDSL/cutlass/cute/arch/mbar.py create mode 100644 python/CuTeDSL/cutlass/cute/arch/nvvm_wrappers.py create mode 100644 python/CuTeDSL/cutlass/cute/arch/smem.py create mode 100644 python/CuTeDSL/cutlass/cute/arch/tmem.py create mode 100644 python/CuTeDSL/cutlass/cute/core.py create mode 100644 python/CuTeDSL/cutlass/cute/math.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/__init__.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/common.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/cpasync/__init__.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/cpasync/copy.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/cpasync/helpers.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/helpers.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/__init__.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/copy.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/helpers.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/mma.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/warp/__init__.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/warp/copy.py create mode 100644 
python/CuTeDSL/cutlass/cute/nvgpu/warp/mma.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/__init__.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/helpers.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/mma.py create mode 100644 python/CuTeDSL/cutlass/cute/runtime.py create mode 100644 python/CuTeDSL/cutlass/cute/testing.py create mode 100644 python/CuTeDSL/cutlass/cute/typing.py create mode 100644 python/CuTeDSL/cutlass/impl_utils.py create mode 100644 python/CuTeDSL/cutlass/torch.py create mode 100644 python/CuTeDSL/cutlass/utils/README.md create mode 100644 python/CuTeDSL/cutlass/utils/__init__.py create mode 100644 python/CuTeDSL/cutlass/utils/ampere_helpers.py create mode 100644 python/CuTeDSL/cutlass/utils/blackwell_helpers.py create mode 100644 python/CuTeDSL/cutlass/utils/grouped_gemm_tile_scheduler_helper.py create mode 100644 python/CuTeDSL/cutlass/utils/hardware_info.py create mode 100644 python/CuTeDSL/cutlass/utils/hopper_helpers.py create mode 100644 python/CuTeDSL/cutlass/utils/layout.py create mode 100644 python/CuTeDSL/cutlass/utils/pipeline.py create mode 100644 python/CuTeDSL/cutlass/utils/smem_allocator.py create mode 100644 python/CuTeDSL/cutlass/utils/static_persistent_tile_scheduler.py create mode 100644 python/CuTeDSL/cutlass/utils/tensormap_manager.py create mode 100644 python/CuTeDSL/cutlass_dsl/__init__.py create mode 100644 python/CuTeDSL/cutlass_dsl/cutlass.py create mode 100644 python/CuTeDSL/cutlass_dsl/cutlass_ast_decorators.py create mode 100644 python/CuTeDSL/requirements.txt create mode 100644 test/unit/gemm/device/sm100_gemm_f8_f8_f8_tensor_op_f32_blockwise.cu diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ca90d8e..813a04be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,51 +1,83 @@ -# NVIDIA CUTLASS Changelog +# Changelog -## [3.9.2](https://github.com/NVIDIA/cutlass/releases/tag/v3.9.2) (2025-05-03) +# CUTLASS 4.x +## [4.0.0](https://github.com/NVIDIA/cutlass/tree/main) 
(2025-05-09) -* Fixed [Blockwise](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) and [Groupwise](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) GEMM hang issue when problem size K is 128. +### CuTe DSL +* CuTe DSL, a Python DSL centered around CuTe's abstractions + - [Core DSL implementation files](https://github.com/NVIDIA/cutlass/tree/main/python/CuTeDSL) + - [DSL quick start](./media/docs/pythonDSL/quick_start.rst) + - [DSL Overview](./media/docs/pythonDSL/overview.rst) +* [Overhauled documentation with an new dedicated website](https://docs.nvidia.com/cutlass) +* Set of examples demonstrating how to use CuTe DSL to write peak-performance kernels + - [Blackwell persistent dense GEMM with static scheduling](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/blackwell/dense_gemm_persistent.py) + - [Blackwell grouped GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/blackwell/grouped_gemm.py) + - [Blackwell fused multi-head attention forward pass](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/blackwell/fmha.py) + - [Ampere GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/ampere/tensorop_gemm.py) + - [FlashAttention-2 implementation targeting Ampere and Ada class GPUs (SM80, SM86, SM89)](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/ampere/flash_attention_v2.py) +* [Educational notebooks for getting started with CuTe DSL](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/notebooks) + +### CUTLASS C++ +* Support [Family Specific Architecture Features](https://developer.nvidia.com/blog/nvidia-blackwell-and-nvidia-cuda-12-9-introduce-family-specific-architecture-features/) which was introduced in CUDA 12.9 + - 100f, 101f, 120f were added to support Family Specific 
Architecture Features which allows running the same binary on different chips belonging to the same Family (e.g. sm100) without recompiling. +* Instruction shapes and redundant accumulation type have been removed from CUTLASS 3.x-style library kernel names to disambiguate kernels and shorten names. + - For example: + + `(old) cutlass3x_sm90_tensorop_s64x128x16gemm_bf16_bf16_f32_bf16_bf16_128x256x64_1x1x1_0_tnn_align8_warpspecialized_cooperative_epi_tma` + `(new) cutlass3x_sm90_tensorop_gemm_bf16_bf16_f32_bf16_bf16_128x256x64_1x1x1_0_tnn_align8_warpspecialized_cooperative_epi_tma` + - If you are using the CUTLASS library kernel names directly (e.g. to compile a subset of the CUTLASS library with `-DCUTLASS_LIBRARY_KERNELS`, filter kernels in the CUTLASS profiler with `--kernels`), please update your uses accordingly, this is a breaking change. +* Further improved [Blockwise](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) and [Groupwise](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) GEMMs on Hopper and Blackwell. + - Added non-power-of-two tile sizes. + - Improved performance for K-major scale factors. + - The argument `mma_promotion_interval` has been removed from non-grouped GEMM to align with the grouped and Blackwell versions. +* Various improvements and fixes from the community and CUTLASS team. Thanks to everyone who submitted PRs! * Optimal code generation with CUDA toolkit versions 12.9. 
+# CUTLASS 3.x + +## [3.9.2](https://github.com/NVIDIA/cutlass/releases/tag/v3.9.2) (2025-05-03) +* Fixed [Blockwise](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) and [Groupwise](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) GEMM hang issue when problem size K is 128. +* Optimal code generation with CUDA toolkit versions 12.9. ## [3.9.1](https://github.com/NVIDIA/cutlass/releases/tag/v3.9.1) (2025-04-30) - * Fixed Group Gemm hang issue in CUTLASS 3.x -* Improved Hopper [Blockwise](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) and [Groupwise](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) GEMM performance. +* Improved Hopper [Blockwise](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) and [Groupwise](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) GEMM performance. 
## [3.9.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.9.0) (2025-04-24) * Support for Blackwell SM120 kernels for GeForce GPUs in CUTLASS 3.x API: - Collective mainloops that target for: - * [Blockscaled datatypes with support for dense GEMM](./include/cutlass/gemm/collective/sm120_blockscaled_mma_tma.hpp) - * [Blockscaled datatypes with support for sparse GEMM](./include/cutlass/gemm/collective/sm120_blockscaled_sparse_mma_tma.hpp) - - New [GEMM](./include/cutlass/gemm/dispatch_policy.hpp) and [epilogue](./include/cutlass/epilogue/dispatch_policy.hpp) dispatch policies for collectives, kernel layers, and builders. - - [Blackwell SM120 epilogue](./include/cutlass/epilogue/fusion/sm120_visitor_store_tma_warpspecialized.hpp) and [full set of EVT fusions](./include/cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp). + * [Blockscaled datatypes with support for dense GEMM](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm120_blockscaled_mma_tma.hpp) + * [Blockscaled datatypes with support for sparse GEMM](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm120_blockscaled_sparse_mma_tma.hpp) + - New [GEMM](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/dispatch_policy.hpp) and [epilogue](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/dispatch_policy.hpp) dispatch policies for collectives, kernel layers, and builders. + - [Blackwell SM120 epilogue](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/fusion/sm120_visitor_store_tma_warpspecialized.hpp) and [full set of EVT fusions](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp). 
* Set of examples that demonstrate the usage of the 3.x API for targeting Blackwell SM120 architecture: - - [Blockscaled GEMM with NVFP4 input datatype and BF16 output tensor](./examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu). - - [Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor with scale factor generation](./examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu). - - [Blockscaled GEMM with mixed input datatype (MXFP8 and MXFP6) and BF16 output tensor](./examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu). - - [Grouped GEMM with nvfp4 datatype](./examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu). - - [Sparse Blockscaled GEMM with mxfp8 input datatype and BF16 output tensor](./examples/80_blackwell_geforce_sparse_gemm/80a_blackwell_geforce_mxfp8_bf16_sparse_gemm.cu). - - [Sparse Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor](./examples/80_blackwell_geforce_sparse_gemm/80b_blackwell_geforce_nvfp4_nvfp4_sparse_gemm.cu). -* Set of unit tests that demonstrate the usage of both [sparse](./test/unit/gemm/device/sm120_blockscaled_sparse_tensorop_gemm/) and [dense](./test/unit/gemm/device/sm120_blockscaled_tensorop_gemm/) Blackwell SM120 blockscaled GEMM. + - [Blockscaled GEMM with NVFP4 input datatype and BF16 output tensor](https://github.com/NVIDIA/cutlass/tree/main/examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu). + - [Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor with scale factor generation](https://github.com/NVIDIA/cutlass/tree/main/examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu). + - [Blockscaled GEMM with mixed input datatype (MXFP8 and MXFP6) and BF16 output tensor](https://github.com/NVIDIA/cutlass/tree/main/examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu). 
+ - [Grouped GEMM with nvfp4 datatype](https://github.com/NVIDIA/cutlass/tree/main/examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu). + - [Sparse Blockscaled GEMM with mxfp8 input datatype and BF16 output tensor](https://github.com/NVIDIA/cutlass/tree/main/examples/80_blackwell_geforce_sparse_gemm/80a_blackwell_geforce_mxfp8_bf16_sparse_gemm.cu). + - [Sparse Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor](https://github.com/NVIDIA/cutlass/tree/main/examples/80_blackwell_geforce_sparse_gemm/80b_blackwell_geforce_nvfp4_nvfp4_sparse_gemm.cu). +* Set of unit tests that demonstrate the usage of both [sparse](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm120_blockscaled_sparse_tensorop_gemm/) and [dense](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm120_blockscaled_tensorop_gemm/) Blackwell SM120 blockscaled GEMM. * Support for Blackwell SM100 Sparse kernels: - Collective mainloop that target for - * [SM100 Sparse GEMM](./include/cutlass/gemm/collective/sm100_sparse_mma_warpspecialized.hpp) + * [SM100 Sparse GEMM](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm100_sparse_mma_warpspecialized.hpp) * Set of example that demonstrate the usage of the 3.x API for targeting Blackwell SM100 Sparse GEMM: - - [Sparse GEMM](./examples/83_blackwell_sparse_gemm/83_blackwell_sparse_gemm.cu) - - [Blockscaled Sparse GEMM with NVFP4 input data type](./examples/84_blackwell_narrow_precision_sparse_gemm/84a_blackwell_nvfp4_bf16_sparse_gemm.cu) - - [Blockscaled Sparse GEMM with mixed input data type (MXFP8 and MXFP4)](./examples/84_blackwell_narrow_precision_sparse_gemm/84b_blackwell_mixed_mxfp8_bf16_sparse_gemm.cu) -* Set of unit tests that demonstrate the usage of [sparse](./test/unit/gemm/device/sm100_sparse_tensorop_gemm) and [blockscaled sparse](./test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm) Blackwell SM100 GEMM. 
-* A new Multi-head Latent Attention (MLA) for SM100 Blackwell architecture in CUTLASS [example](./examples/77_blackwell_fmha/) covers the flashMLA-like weight-absorbed decoding use-case. -* A new FMHA Backward kernel for SM100 Blackwell architecture extends CUTLASS [example](./examples/77_blackwell_fmha/) to show how the five backward pass MMAs can be fused into a single kernel to achieve high performance. -* A new [distributed GEMM example](./examples/82_blackwell_distributed_gemm/82_blackwell_distributed_gemm.cu) for SM100 Blackwell architecture. + - [Sparse GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/83_blackwell_sparse_gemm/83_blackwell_sparse_gemm.cu) + - [Blockscaled Sparse GEMM with NVFP4 input data type](https://github.com/NVIDIA/cutlass/tree/main/examples/84_blackwell_narrow_precision_sparse_gemm/84a_blackwell_nvfp4_bf16_sparse_gemm.cu) + - [Blockscaled Sparse GEMM with mixed input data type (MXFP8 and MXFP4)](https://github.com/NVIDIA/cutlass/tree/main/examples/84_blackwell_narrow_precision_sparse_gemm/84b_blackwell_mixed_mxfp8_bf16_sparse_gemm.cu) +* Set of unit tests that demonstrate the usage of [sparse](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm100_sparse_tensorop_gemm) and [blockscaled sparse](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm) Blackwell SM100 GEMM. +* A new Multi-head Latent Attention (MLA) for SM100 Blackwell architecture in CUTLASS [example](https://github.com/NVIDIA/cutlass/tree/main/examples/77_blackwell_fmha/) covers the flashMLA-like weight-absorbed decoding use-case. +* A new FMHA Backward kernel for SM100 Blackwell architecture extends CUTLASS [example](https://github.com/NVIDIA/cutlass/tree/main/examples/77_blackwell_fmha/) to show how the five backward pass MMAs can be fused into a single kernel to achieve high performance. 
+* A new [distributed GEMM example](https://github.com/NVIDIA/cutlass/tree/main/examples/82_blackwell_distributed_gemm/82_blackwell_distributed_gemm.cu) for SM100 Blackwell architecture. * Enhancement and new support of block-wise and group-wise GEMM for Hopper and Blackwell architectures: - - Enhancement of [blockwise GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) for Hopper architecture. - - Enhancement of [groupwise GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) for Hopper architecture. - - Support for [grouped GEMM with blockwise and groupwise scaling](./examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/) for Hopper architecture. - - Support for [grouped-wise GEMM](./tools/profiler/src/blockwise_gemm_operation_profiler.cu) in CUTLASS profiler. - - Support for [blockwise GEMM](./examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu) for Blackwell architecture. - - Support for [groupwise GEMM](./examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu) for Blackwell architecture. - - Support for [grouped GEMM with blockwise](./examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu) and [groupwise scaling](./examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu) for Blackwell architecture. + - Enhancement of [blockwise GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) for Hopper architecture. + - Enhancement of [groupwise GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) for Hopper architecture. 
+ - Support for [grouped GEMM with blockwise and groupwise scaling](https://github.com/NVIDIA/cutlass/tree/main/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/) for Hopper architecture. + - Support for [grouped-wise GEMM](https://github.com/NVIDIA/cutlass/tree/main/tools/profiler/src/blockwise_gemm_operation_profiler.cu) in CUTLASS profiler. + - Support for [blockwise GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu) for Blackwell architecture. + - Support for [groupwise GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu) for Blackwell architecture. + - Support for [grouped GEMM with blockwise](https://github.com/NVIDIA/cutlass/tree/main/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu) and [groupwise scaling](https://github.com/NVIDIA/cutlass/tree/main/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu) for Blackwell architecture. * Added support for enhanced kernel performance search (auto-tuning) in CUTLASS profiler: - Sorting performance results by GFLOPs/second: Users can now sort the final performance report based on GFLOPs/second, making it easier to identify the most efficient kernels. - Exhaustive search for best kernel performance in GFLOPs/second: The profiler now searches for the best-performing kernel across a range of problem sizes, swizzle sizes, rasterization orders, and dynamic cluster configurations to maximize performance. @@ -58,32 +90,32 @@ ## [3.8.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.8.0) (2025-01-25) * Support for new CuTe building blocks specifically for Blackwell SM100 architecture: - - [5th generation Blackwell Tensor Core instructions (TCGen05)](./include/cute/atom/mma_traits_sm100.hpp) via CuTe MMA atoms. 
- - Extensions to [Tensor Memory Accelerator](./include/cute/atom/copy_traits_sm100_tma.hpp) via CuTe Copy atoms. - - Exposure of Blackwell's new tensor memory (note: distinct from TMA) as [`tmem`](./include/cute/pointer.hpp) across CuTe as a first class data locale. - - Exposure of [`tmem->rmem`, `rmem->tmem` and `smem->tmem data movement instructions`](./include/cute/atom/copy_traits_sm100.hpp) as copy atoms in CuTe. - - [`make_tmem_copy()`](./include/cute/atom/copy_traits_sm100.hpp) utility method to ease creation of tiled copies for tmem copy atoms. - - Support for [new variants of LDSM on Blackwell](./include/cute/atom/copy_traits_sm100.hpp) via CuTe Copy atoms. + - [5th generation Blackwell Tensor Core instructions (TCGen05)](https://github.com/NVIDIA/cutlass/tree/main/include/cute/atom/mma_traits_sm100.hpp) via CuTe MMA atoms. + - Extensions to [Tensor Memory Accelerator](https://github.com/NVIDIA/cutlass/tree/main/include/cute/atom/copy_traits_sm100_tma.hpp) via CuTe Copy atoms. + - Exposure of Blackwell's new tensor memory (note: distinct from TMA) as [`tmem`](https://github.com/NVIDIA/cutlass/tree/main/include/cute/pointer.hpp) across CuTe as a first class data locale. + - Exposure of [`tmem->rmem`, `rmem->tmem` and `smem->tmem data movement instructions`](https://github.com/NVIDIA/cutlass/tree/main/include/cute/atom/copy_traits_sm100.hpp) as copy atoms in CuTe. + - [`make_tmem_copy()`](https://github.com/NVIDIA/cutlass/tree/main/include/cute/atom/copy_traits_sm100.hpp) utility method to ease creation of tiled copies for tmem copy atoms. + - Support for [new variants of LDSM on Blackwell](https://github.com/NVIDIA/cutlass/tree/main/include/cute/atom/copy_traits_sm100.hpp) via CuTe Copy atoms. 
* Support for new CUTLASS building blocks specifically for Blackwell SM100 architecture: - - Various narrow precision [FP4, FP6, and FP8](./include/cutlass/exmy_base.h) formats as well as their [block-scaled variants NVFP4, MXFP4, MXFP6, and MXFP8](./include/cutlass/float_subbyte.h) - - [Pipelines that implement Blackwell specific synchronization](./include/cutlass/pipeline/sm100_pipeline.hpp). - - [Cluster launch control API supporting preferred and fallback cluster shapes](./include/cutlass/cluster_launch.hpp). + - Various narrow precision [FP4, FP6, and FP8](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/exmy_base.h) formats as well as their [block-scaled variants NVFP4, MXFP4, MXFP6, and MXFP8](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/float_subbyte.h) + - [Pipelines that implement Blackwell specific synchronization](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/pipeline/sm100_pipeline.hpp). + - [Cluster launch control API supporting preferred and fallback cluster shapes](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/cluster_launch.hpp). - Data types including NVFP4, MXFP4, MXFP6, and MXFP8 and all their supported element and scale factor types. - - Tile schedulers using [Blackwell's Cluster Launch Control (CLC) feature](./media/docs/cpp/blackwell_cluster_launch_control.md) to implement dynamic persistence scheduling for [GEMMs](./include/cutlass/gemm/kernel/sm100_tile_scheduler.hpp), and [stream-K](./include/cutlass/gemm/kernel/sm100_tile_scheduler_stream_k.hpp). + - Tile schedulers using [Blackwell's Cluster Launch Control (CLC) feature](./media/docs/cpp/blackwell_cluster_launch_control.md) to implement dynamic persistence scheduling for [GEMMs](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm100_tile_scheduler.hpp), and [stream-K](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm100_tile_scheduler_stream_k.hpp). 
- Extensions to testbeds and reference check code for unit tests and CUTLASS profiler. * Full support for Blackwell SM100 kernels in CUTLASS 3.x API: - - [Blackwell specific kernel layers](./include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized.hpp) that + - [Blackwell specific kernel layers](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized.hpp) that + Implement a new warp-specialization recipe tuned specifically for Blackwell SM100 architecture. + Leverage all the new features such as CLC based tile scheduling, preferred cluster, and TMEM based double buffering of accumulators. + Support stream-K load balancing for all kernel types everywhere via composable scheduler support. - Blackwell collective mainloops that target the TCGen05 MMA instructions (both SS and TS) for - * [Non-block scaled data types without support for pointer array and grouped GEMM with TMA](./include/cutlass/gemm/collective/sm100_mma_warpspecialized.hpp) - * [Non-block scaled data types with support for pointer array and grouped GEMM with TMA](./include/cutlass/gemm/collective/sm100_mma_array_warpspecialized.hpp) - * [Block scaled data types without support for pointer array and grouped GEMM with TMA](./include/cutlass/gemm/collective/sm100_blockscaled_mma_warpspecialized.hpp) - * [Block scaled data types with support for pointer array and grouped GEMM with TMA](./include/cutlass/gemm/collective/sm100_blockscaled_mma_array_warpspecialized.hpp) - - Blackwell [collective mainloop for convolution kernels](./include/cutlass/conv/collective/sm100_implicit_gemm_umma_warpspecialized.hpp) supporting non-block scaled data types for fprop, dgrad, and wgrad. - - New [GEMM](./include/cutlass/gemm/dispatch_policy.hpp), [convolution](./include/cutlass/conv/dispatch_policy.hpp), and [epilogue](./include/cutlass/epilogue/dispatch_policy.hpp) dispatch policies for collectives, kernel layers, and builders. 
- - [Blackwell epilogue that supports loading accumulators from `tmem`](./include/cutlass/epilogue/collective/sm100_epilogue_tma_warpspecialized.hpp) and [full set of EVT fusions](). + * [Non-block scaled data types without support for pointer array and grouped GEMM with TMA](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm100_mma_warpspecialized.hpp) + * [Non-block scaled data types with support for pointer array and grouped GEMM with TMA](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm100_mma_array_warpspecialized.hpp) + * [Block scaled data types without support for pointer array and grouped GEMM with TMA](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm100_blockscaled_mma_warpspecialized.hpp) + * [Block scaled data types with support for pointer array and grouped GEMM with TMA](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm100_blockscaled_mma_array_warpspecialized.hpp) + - Blackwell [collective mainloop for convolution kernels](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/conv/collective/sm100_implicit_gemm_umma_warpspecialized.hpp) supporting non-block scaled data types for fprop, dgrad, and wgrad. + - New [GEMM](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/dispatch_policy.hpp), [convolution](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/conv/dispatch_policy.hpp), and [epilogue](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/dispatch_policy.hpp) dispatch policies for collectives, kernel layers, and builders. + - [Blackwell epilogue that supports loading accumulators from `tmem`](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/collective/sm100_epilogue_tma_warpspecialized.hpp) and full set of EVT fusions. * CUTLASS library and profiler integration for block scaled data types for kernel emission, profiling, and verification. 
- Support for preferred and fallback cluster shapes via profiler command line arguments parsing to set dynamic cluster shapes. - Support for dynamic datatypes by parsing profiler via profiler command line arguments parsing to set dynamic datatype setting in TCGen05 MMA instruction descriptors. @@ -91,81 +123,81 @@ * New CUTLASS profiler flag `use-cuda-graphs` to reduce overheads when benchmarking launch-bound kernels. * A new 3.x version of grouped GEMM to the CUTLASS library and generates kernels for Hopper and Blackwell. Now grouped GEMM support is enabled in the CUTLASS profiler (`./cutlass_profiler --operation=GroupedGemm --help` for details). * Set of examples that demonstrate the usage of the 3.x API for targeting Blackwell SM100 architecture: - - [Basic FP16 and FP8 GEMMs with minimal changes from Hopper examples](./examples/70_blackwell_gemm/), demonstrating ease of migration for off the shelf kernels using the 3.x collective builder API. - - GEMM with [opt-in collective builder schedules showcasing available recipes](./examples/71_blackwell_gemm_with_collective_builder/71_blackwell_gemm_with_collective_builder.cu) for Blackwell. + - [Basic FP16 and FP8 GEMMs with minimal changes from Hopper examples](https://github.com/NVIDIA/cutlass/tree/main/examples/70_blackwell_gemm/), demonstrating ease of migration for off the shelf kernels using the 3.x collective builder API. + - GEMM with [opt-in collective builder schedules showcasing available recipes](https://github.com/NVIDIA/cutlass/tree/main/examples/71_blackwell_gemm_with_collective_builder/71_blackwell_gemm_with_collective_builder.cu) for Blackwell. 
- Block scaled data type GEMMs targeting Blackwell's native block scaled Tensor Cores: - + [NVFP4 inputs with BF16 output](./examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu) - + [NVFP4 inputs with NVFP4 output](./examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu) - + [Mixed MXFP8 and MXFP6 inputs with BF16 output](./examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu) - - GEMM example demonstrating [Blackwell's new preferred cluster support via dynamic cluster shapes](./examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu) for increased occupancy. - - [GEMM with CLC based StreamK scheduler for load balancing](./examples/74_blackwell_gemm_streamk/blackwell_gemm_streamk.cu). - - Grouped GEMM for [vanilla FP8 data inputs](./examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu) and [NVFP4 block scaled inputs](./examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu). - - Convolution kernels for [fprop](./examples/76_blackwell_conv/76_blackwell_conv_fprop.cu), [dgrad](./examples/76_blackwell_conv/76_blackwell_conv_dgrad.cu), and [wgrad](./examples/76_blackwell_conv/76_blackwell_conv_wgrad.cu). - - [Fused multi-head attention fprop kernel](./examples/77_blackwell_fmha/77_blackwell_fmha.cu) supporting fp16/bf16/fp8 data types across head dims of 32,64, and 128. - - A new BF16x9 GEMM [kernel](./examples/78_blackwell_emulated_bf16x9_gemm/78_blackwell_emulated_bf16x9_gemm.cu) that emulates FP32 GEMM (SGEMM) using BF16 operations. 
+ + [NVFP4 inputs with BF16 output](https://github.com/NVIDIA/cutlass/tree/main/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu) + + [NVFP4 inputs with NVFP4 output](https://github.com/NVIDIA/cutlass/tree/main/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu) + + [Mixed MXFP8 and MXFP6 inputs with BF16 output](https://github.com/NVIDIA/cutlass/tree/main/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu) + - GEMM example demonstrating [Blackwell's new preferred cluster support via dynamic cluster shapes](https://github.com/NVIDIA/cutlass/tree/main/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu) for increased occupancy. + - [GEMM with CLC based StreamK scheduler for load balancing](https://github.com/NVIDIA/cutlass/tree/main/examples/74_blackwell_gemm_streamk/blackwell_gemm_streamk.cu). + - Grouped GEMM for [vanilla FP8 data inputs](https://github.com/NVIDIA/cutlass/tree/main/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu) and [NVFP4 block scaled inputs](https://github.com/NVIDIA/cutlass/tree/main/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu). + - Convolution kernels for [fprop](https://github.com/NVIDIA/cutlass/tree/main/examples/76_blackwell_conv/76_blackwell_conv_fprop.cu), [dgrad](https://github.com/NVIDIA/cutlass/tree/main/examples/76_blackwell_conv/76_blackwell_conv_dgrad.cu), and [wgrad](https://github.com/NVIDIA/cutlass/tree/main/examples/76_blackwell_conv/76_blackwell_conv_wgrad.cu). + - [Fused multi-head attention fprop kernel](https://github.com/NVIDIA/cutlass/tree/main/examples/77_blackwell_fmha/77_blackwell_fmha.cu) supporting fp16/bf16/fp8 data types across head dims of 32, 64, and 128. 
+ - A new BF16x9 GEMM [kernel](https://github.com/NVIDIA/cutlass/tree/main/examples/78_blackwell_emulated_bf16x9_gemm/78_blackwell_emulated_bf16x9_gemm.cu) that emulates FP32 GEMM (SGEMM) using BF16 operations. * Set of examples that demonstrate the usage of the 3.x API for targeting Hopper architecture: - - A set of new [Hopper grouped GEMM kernels](./examples/69_hopper_mixed_dtype_grouped_gemm/) that support mixed A and B datatypes. - - A new [Hopper FP8 GEMM with groupwise scaling](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu). + - A set of new [Hopper grouped GEMM kernels](https://github.com/NVIDIA/cutlass/tree/main/examples/69_hopper_mixed_dtype_grouped_gemm/) that support mixed A and B datatypes. + - A new [Hopper FP8 GEMM with groupwise scaling](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu). * Documentation updates: - - [Quickstart - instantiating a Blackwell block-scaled GEMM](./media/docs/cpp/quickstart.md#instantiating-a-blackwell-gemm-kernel). + - [Quickstart - instantiating a Blackwell block-scaled GEMM](./media/docs/cpp/quickstart.md#instantiating-a-blackwell-sm100-gemm-kernel). - Detailed [Blackwell block-scaled GEMM functionality documentation](./media/docs/cpp/blackwell_functionality.md) - A new [functionality documentation](./media/docs/cpp/functionality.md) specifically for 3.x API comprehensively documenting all supported kernel types, data types, kernel features, minimum CUDA tookit support etc for 3.x supported architectures. - - Updates to [compatibility](./README.md#compatibility) section regarding supported compilers, operating systems, CUDA Toolkits, Hardware Architectures, and [Target Architecture](./README.md#Target-Architecture). 
+ - Updates to [compatibility](./README.md#compatibility) section regarding supported compilers, operating systems, CUDA Toolkits, Hardware Architectures, and [Target Architecture](./README.md#target-architecture). - Updates to [profiler documentation](./media/docs/cpp/profiler.md) for testing mixed input GEMM kernels on Hopper. ## [3.7.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.7.0) (2025-01-11) -- [Hopper blockwise scaling FP8 GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) uses 2D scaling tensor, assigning one value per threadblock. This allows a finer-grained scaling to be applied for each output tile per gemm-k iteration. The operands and scaling tensors are loaded from global memory to shared memory using TMA and cp_async, respectively. The scaling is applied inside the mainloop. Details with figures are [here](https://github.com/NVIDIA/cutlass/pull/1932#issue-2645398439). -- [Distributed GEMM](./examples/65_distributed_gemm/65_distributed_gemm.cu) is a new (experimental) API which can turn existing CUTLASS GEMM kernels into pipelined Tensor Parallel GEMMs that run efficiently on NVLink-based network of GPUs. Its pipelining schedules can hide most of the communication behind computation, and relies on point-to-point communication, which can simply use CUDA runtime's peer device access feature. It also utilizes remote TMA loads and memcopies with CUDA graphs to handle communication primarily through the Copy Engine, leaving all SMs free for Hopper's persistent kernels. For more details you can refer to the [DistGEMM blog post](https://blog.shi-labs.com/distributed-gemm-88be6a481e2b). -- Improved persistent grid launch for Hopper kernels with large cluster sizes (>= size of 4) using the new `make_kernel_hardware_info` API as shown in [example 48](./examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu). 
+- [Hopper blockwise scaling FP8 GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) uses 2D scaling tensor, assigning one value per threadblock. This allows a finer-grained scaling to be applied for each output tile per gemm-k iteration. The operands and scaling tensors are loaded from global memory to shared memory using TMA and cp_async, respectively. The scaling is applied inside the mainloop. Details with figures are [here](https://github.com/NVIDIA/cutlass/pull/1932#issue-2645398439). +- [Distributed GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/65_distributed_gemm/65_distributed_gemm.cu) is a new (experimental) API which can turn existing CUTLASS GEMM kernels into pipelined Tensor Parallel GEMMs that run efficiently on NVLink-based network of GPUs. Its pipelining schedules can hide most of the communication behind computation, and relies on point-to-point communication, which can simply use CUDA runtime's peer device access feature. It also utilizes remote TMA loads and memcopies with CUDA graphs to handle communication primarily through the Copy Engine, leaving all SMs free for Hopper's persistent kernels. For more details you can refer to the [DistGEMM blog post](https://blog.shi-labs.com/distributed-gemm-88be6a481e2b). +- Improved persistent grid launch for Hopper kernels with large cluster sizes (>= size of 4) using the new `make_kernel_hardware_info` API as shown in [example 48](https://github.com/NVIDIA/cutlass/tree/main/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu). - Enabled high precision accumulation for Hopper FP8 Sparse GEMM. - Potential API breaking changes: + Fix `cute::UniversalCopy` for type safety. + No longer implicitly select `cute::SM80_CP_ASYNC_*` based on input tensors. This avoids implicit downstream synchronization requirements. 
To use `SM80_CP_ASYNC`, users must explicitly select the appropriate CopyAtom. + Fix `cute::SM80_CP_ASYNC_CACHEALWAYS`, `cute::SM80_CP_ASYNC_CACHEGLOBAL`, `cute::SM80_CP_ASYNC_CACHEALWAYS_ZFILL`, `cute::SM80_CP_ASYNC_CACHEGLOBAL_ZFILL` to avoid implicitly selecting `ZFILL` behavior on predication. + Remove `cute::copy_vec` in favor of `cute::copy_aligned` and `cute::copy(AutoVectorizingCopyWithAssumedAlignment,...)`. - + A refactor of default epilogue struct `DefaultEpilogue` [API](./include/cutlass/epilogue/collective/default_epilogue.hpp) to avoid reading non-void `ElementC` value for `ElementC = void` kernel. + + A refactor of default epilogue struct `DefaultEpilogue` [API](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/collective/default_epilogue.hpp) to avoid reading non-void `ElementC` value for `ElementC = void` kernel. - New CUTLASS profiler flags: `profiling-duration`, `min-iterations`, and `kernels-file` documented in [profiler.md](./media/docs/cpp/profiler.md#cutlass-profiler). - Various improvements and fixes from the community and CUTLASS team. Thanks to everyone who submitted PRs! - Optimal code generation with CUDA toolkit versions 12.6. ## [3.6.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.6.0) (2024-10-03) -- [Hopper structured sparse GEMM](./examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu). - + [FP16](./test/unit/gemm/device/sm90_sparse_gemm_f16_f16_f32_tensor_op_f32.cu) - + [FP8](./test/unit/gemm/device/sm90_sparse_gemm_f8_f8_f32_tensor_op_f32.cu) - + [INT8](./test/unit/gemm/device/sm90_sparse_gemm_s8_s8_s32_tensor_op_s32.cu) - + [TF32](./test/unit/gemm/device/sm90_sparse_gemm_tf32_tf32_f32_tensor_op_f32.cu) -- A refactor to the CUTLASS 3.x convolution `kernel::ConvUniversal` [API](./include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp) to bring it in line with `gemm::GemmUniversal`. Now the 3.x convolution API is no longer considered as a beta API. 
-- [An improved mixed input GEMM](./examples/55_hopper_mixed_dtype_gemm/README.md) and a [lookup table implementation](./examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu) for `INT4`x`FP8` scale-only mode. -- [EVT nodes for Top-K selection and softmax](./include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp) and [GEMM example using those](./examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu). -- [Programmatic Dependent Launch](./include/cutlass/arch/grid_dependency_control.h) (PDL) that leverages a new Hopper feature to speedup two back-to-back kernels, and its corresponding [documentations](./media/docs/cpp/dependent_kernel_launch.md). -- [A new debugging tool, synclog](./include/cutlass/arch/synclog.hpp), for dumping out all synchronization events from within a kernel to a file. Please see [synclog documentation](./media/docs/cpp/utilities.md#debugging-asynchronous-kernels-with-cutlasss-built-in-synclog-tool) for details. -- A new TMA-enabled [epilogue](./include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp) for grouped GEMM that brings significant performance improvement, as well as its EVT support. -- A SIMT-enabled pointer-array [epilogue](./include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp). -- A new [Ping-Pong kernel schedule for Grouped GEMM](./include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp) and some other optimizations. -- [A new instantiation strategy for CUTLASS profiler kernels](./python/cutlass_library/sm90_shapes.py) along with [improved documentation for instantiation level in CUTLASS profiler](./media/docs/cpp/profiler.md#instantiating-more-kernels-with-hopper). -- A new hardware support for comparisons and computations of [`cutlass::bfloat16_t`](./include/cutlass/bfloat16.h) -- Fixed use of isnan on Windows for [`half_t`](./test/unit/core/functional.cu). 
+- [Hopper structured sparse GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu). + + [FP16](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm90_sparse_gemm_f16_f16_f32_tensor_op_f32.cu) + + [FP8](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm90_sparse_gemm_f8_f8_f32_tensor_op_f32.cu) + + [INT8](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm90_sparse_gemm_s8_s8_s32_tensor_op_s32.cu) + + [TF32](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm90_sparse_gemm_tf32_tf32_f32_tensor_op_f32.cu) +- A refactor to the CUTLASS 3.x convolution `kernel::ConvUniversal` [API](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp) to bring it in line with `gemm::GemmUniversal`. Now the 3.x convolution API is no longer considered as a beta API. +- [An improved mixed input GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/55_hopper_mixed_dtype_gemm/cpp/README.md) and a [lookup table implementation](https://github.com/NVIDIA/cutlass/tree/main/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu) for `INT4`x`FP8` scale-only mode. +- [EVT nodes for Top-K selection and softmax](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp) and [GEMM example using those](https://github.com/NVIDIA/cutlass/tree/main/examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu). +- [Programmatic Dependent Launch](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/arch/grid_dependency_control.h) (PDL) that leverages a new Hopper feature to speedup two back-to-back kernels, and its corresponding [documentations](./media/docs/cpp/dependent_kernel_launch.md). 
+- [A new debugging tool, synclog](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/arch/synclog.hpp), for dumping out all synchronization events from within a kernel to a file. Please see [synclog documentation](./media/docs/cpp/utilities.md#debugging-asynchronous-kernels-with-cutlasss-built-in-synclog-tool) for details. +- A new TMA-enabled [epilogue](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp) for grouped GEMM that brings significant performance improvement, as well as its EVT support. +- A SIMT-enabled pointer-array [epilogue](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp). +- A new [Ping-Pong kernel schedule for Grouped GEMM](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp) and some other optimizations. +- [A new instantiation strategy for CUTLASS profiler kernels](https://github.com/NVIDIA/cutlass/tree/main/python/cutlass_library/sm90_shapes.py) along with [improved documentation for instantiation level in CUTLASS profiler](./media/docs/cpp/profiler.md#instantiating-more-kernels-with-hopper). +- A new hardware support for comparisons and computations of [`cutlass::bfloat16_t`](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/bfloat16.h) +- Fixed use of isnan on Windows for [`half_t`](https://github.com/NVIDIA/cutlass/tree/main/test/unit/core/functional.cu). - Various improvements and fixes from the community and CUTLASS team. Thanks to everyone who submitted PRs! - Optimal code generation with CUDA toolkit versions 12.6. 
## [3.5.1](https://github.com/NVIDIA/cutlass/releases/tag/v3.5.1) (2024-07-25) -- [Minimal SM90 WGMMA + TMA GEMM example in 100 lines of code](./examples/cute/tutorial/wgmma_sm90.cu) -- [Exposure of L2 `cache_hint`s in TMA copy atoms](./include/cute/arch/copy_sm90_tma.hpp#L48) -- Exposure of raster order and tile swizzle extent in [CUTLASS library profiler](./media/docs/cpp/profiler.md#GEMM), and -[example 48](./examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu). -- [TMA store based and EVT supported epilogues](./include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp) for [Hopper pointer array batched kernels](./test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_ptr_array.cu). -- A new [`GemmSparseUniversal` API for CUTLASS 2.x Ampere kernels](./include/cutlass/gemm/device/gemm_sparse_universal.h) to enable serial and parallel split-k for sparse tensor cores and new tiny tile sizes to better support LLM inferrence: - + [FP16 TN](./test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu#L269-L393) and [NT](./test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu#L269-L411). - + [int8 TN](./test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu#L264-L452). - + [int4 TN](./test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu#L264-L452). - + [FP32 TN](./test/unit/gemm/device/gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu#L427-L642) and [NT](./test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu#L427-L456). -- [CUDA host adapter](./include/cutlass/cuda_host_adapter.hpp) extensions to support TMA descriptor construction driver APIs. -- Inclusion of more [Hopper fprop, dgrad, and wgrad convolution kernels in CUTLASS library and profiler](./python/cutlass_library/generator.py). 
+- [Minimal SM90 WGMMA + TMA GEMM example in 100 lines of code](https://github.com/NVIDIA/cutlass/tree/main/examples/cute/tutorial/wgmma_sm90.cu) +- [Exposure of L2 `cache_hint`s in TMA copy atoms](https://github.com/NVIDIA/cutlass/tree/main/include/cute/arch/copy_sm90_tma.hpp#L48) +- Exposure of raster order and tile swizzle extent in [CUTLASS library profiler](./media/docs/cpp/profiler.md#gemm), and +[example 48](https://github.com/NVIDIA/cutlass/tree/main/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu). +- [TMA store based and EVT supported epilogues](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp) for [Hopper pointer array batched kernels](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_ptr_array.cu). +- A new [`GemmSparseUniversal` API for CUTLASS 2.x Ampere kernels](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/device/gemm_sparse_universal.h) to enable serial and parallel split-k for sparse tensor cores and new tiny tile sizes to better support LLM inference: + + [FP16 TN](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu#L269-L393) and [NT](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu#L269-L411). + + [int8 TN](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu#L264-L452). + + [int4 TN](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu#L264-L452). + + [FP32 TN](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu#L427-L642) and [NT](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu#L427-L456). 
+- [CUDA host adapter](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/cuda_host_adapter.hpp) extensions to support TMA descriptor construction driver APIs. +- Inclusion of more [Hopper fprop, dgrad, and wgrad convolution kernels in CUTLASS library and profiler](https://github.com/NVIDIA/cutlass/tree/main/python/cutlass_library/generator.py). - Support for residual add (beta != 0) in convolution kernels. -- A new convolution [epilogue](./examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu#L269) for CUTLASS 2.x to support non-packed NHWC output. -- A refactor of [include files throughout CUTLASS core directories](./include/cutlass/gemm/collective/collective_mma_decl.hpp) to reduce circular dependencies and [tests to guard against them](./test/self_contained_includes/CMakeLists.txt). +- A new convolution [epilogue](https://github.com/NVIDIA/cutlass/tree/main/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu#L269) for CUTLASS 2.x to support non-packed NHWC output. +- A refactor of [include files throughout CUTLASS core directories](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/collective_mma_decl.hpp) to reduce circular dependencies and [tests to guard against them](https://github.com/NVIDIA/cutlass/tree/main/test/self_contained_includes/CMakeLists.txt). - [A guide for setting up VSCode to work well with CUTLASS](./media/docs/cpp/ide_setup.md) and [expanded code style guide](./media/docs/cpp/programming_guidelines.md). - Better support for MSVC as a host compiler. - Many performance optimizations, improvements, and bug fixes including fixes for FlashAttention-2. 
@@ -173,49 +205,49 @@ ## [3.5.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.5.0) (2024-04-09) -- Implicit GEMM Convolutions targeting Hopper SM90A via WGMMA + [TMA im2col](./include/cute/atom/copy_traits_sm90_im2col.hpp) +- Implicit GEMM Convolutions targeting Hopper SM90A via WGMMA + [TMA im2col](https://github.com/NVIDIA/cutlass/tree/main/include/cute/atom/copy_traits_sm90_im2col.hpp) + Native implementation in CUTLASS 3.x using CuTe, mirroring the [same design hierarchy as that of GEMMs](./media/docs/cpp/gemm_api_3x.md). - + Support for 1D, 2D, and 3D convolutions in a [rank-agnostic fashion](./include/cutlass/conv/convnd_problem_shape.hpp). - + Support for [Fprop](./test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu), [Dgrad](./test/unit/conv/device_3x/dgrad/sm90_conv2d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu), and [Wgrad](./test/unit/conv/device_3x/wgrad/sm90_conv1d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu) algorithms - + [CUTLASS profiler support](./python/cutlass_library/conv3x_emitter.py) for 2D and 3D convolutions implemented via the 3.x API. + + Support for 1D, 2D, and 3D convolutions in a [rank-agnostic fashion](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/conv/convnd_problem_shape.hpp). + + Support for [Fprop](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu), [Dgrad](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device_3x/dgrad/sm90_conv2d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu), and [Wgrad](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device_3x/wgrad/sm90_conv1d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu) algorithms + + [CUTLASS profiler support](https://github.com/NVIDIA/cutlass/tree/main/python/cutlass_library/conv3x_emitter.py) for 2D and 3D convolutions implemented via the 3.x API. + NOTE: this is a beta release. 
Further updates to CUTLASS will include major performance improvements, feature enablement, and possible breaking changes to the API until 3.7 release. Your feedback is welcome on the design! -- Support for [Ada (SM89) FP8 tensor cores via the 2.x API](./examples/58_ada_fp8_gemm/ada_fp8_gemm.cu). Requires CUDA 12.4 or newer. -- [Ampere gather/scatter convolution example](./examples/59_ampere_gather_scatter_conv/README.md) in CuTe and CUTLASS 3.x +- Support for [Ada (SM89) FP8 tensor cores via the 2.x API](https://github.com/NVIDIA/cutlass/tree/main/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu). Requires CUDA 12.4 or newer. +- [Ampere gather/scatter convolution example](https://github.com/NVIDIA/cutlass/tree/main/examples/59_ampere_gather_scatter_conv/cpp/README.md) in CuTe and CUTLASS 3.x + Showcasing how custom kernels can be written and optimized using CUTLASS 3.x and CuTe and the general strategy for implementing convolutions as specializations of GETTs. + Implementation of a coarse grained sparse gather/scatter kernel achieving peak performance on Ampere class tensor cores. - 32x and 16x tile sizes are added to CUTLASS 2.x to improve the performance of narrow-tall and wide-short matrices. - + [Ampere FP16 TN](./test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f32_sm80.cu) and [NT](./test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu#L227-L301), [Ampere INT8 TN](./test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu#L392-L1342), [Ampere INT4 TN](./test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm80.cu#L372-L934). - + [Turing FP16 TN](./test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f32_sm75.cu#L55-L394), [Turing INT8 TN](./test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu#L166-L537), [Turing INT4 TN](./test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu#L310-L564). 
-- Updates to CuTe documentation for [`cute::Tensor<>`](./media/docs/cpp/cute/03_tensor.md), [MMA atoms](./media/docs/cpp/cute/0t_mma_atom.md), and an overhauled [CuTe GEMM tutorial series](./examples/cute/tutorial). -- Extensions to CuTe to support [L2 prefetching](./include/cute/algorithm/prefetch.hpp) and [TMA store+reductions](./include/cute/arch/copy_sm90_tma.hpp#L1337). + + [Ampere FP16 TN](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f32_sm80.cu) and [NT](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu#L227-L301), [Ampere INT8 TN](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu#L392-L1342), [Ampere INT4 TN](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm80.cu#L372-L934). + + [Turing FP16 TN](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f32_sm75.cu#L55-L394), [Turing INT8 TN](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu#L166-L537), [Turing INT4 TN](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu#L310-L564). +- Updates to CuTe documentation for [`cute::Tensor<>`](./media/docs/cpp/cute/03_tensor.md), [MMA atoms](./media/docs/cpp/cute/0t_mma_atom.md), and an overhauled [CuTe GEMM tutorial series](https://github.com/NVIDIA/cutlass/tree/main/examples/cute/tutorial). +- Extensions to CuTe to support [L2 prefetching](https://github.com/NVIDIA/cutlass/tree/main/include/cute/algorithm/prefetch.hpp) and [TMA store+reductions](https://github.com/NVIDIA/cutlass/tree/main/include/cute/arch/copy_sm90_tma.hpp#L1337). - Remove C++11 requirement on a few CUTLASS 2.x API header files. All CUTLASS files now require C++17. - Fixes to greatly reduce build warnings. 
- Updates and bugfixes from the community (thanks!) ## [3.4.1](https://github.com/NVIDIA/cutlass/releases/tag/v3.4.1) (2024-02-14) -- Statically available [CUTLASS Version macros](./include/cutlass/version.h) that allow for handling API changes between CUTLASS releases on the users' side. -- Improvements for Hopper [Group-GEMMs](./examples/57_hopper_grouped_gemm) and [Pointer-Array Batched GEMMs](./examples/56_hopper_ptr_array_batched_gemm). +- Statically available [CUTLASS Version macros](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/version.h) that allow for handling API changes between CUTLASS releases on the users' side. +- Improvements for Hopper [Group-GEMMs](https://github.com/NVIDIA/cutlass/tree/main/examples/57_hopper_grouped_gemm) and [Pointer-Array Batched GEMMs](https://github.com/NVIDIA/cutlass/tree/main/examples/56_hopper_ptr_array_batched_gemm). - Updates and bugfixes from the community (thanks!). ## [3.4.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.4.0) (2024-01-12) -* Expanded [Mixed-input Hopper GEMMs](./examples/55_hopper_mixed_dtype_gemm) support covering {16-bit, 8-bit} x {8-bit, 4-bit} input types with fast numerical converters and group scaling factors. -* Performance improvements to [Mixed-input Hopper GEMMs](./examples/55_hopper_mixed_dtype_gemm) -* Beta release of [Pointer-Array Batched GEMMs](./examples/56_hopper_ptr_array_batched_gemm) now available on Hopper GPUs utilizing TMA and WGMMA (requires CUDA 12.3 or above). -* Beta release of [Group-GEMM](./examples/57_hopper_grouped_gemm) utilizing TMA and WGMMA (requires CUDA 12.3 or above). -* [Ampere Sparse GEMM](./examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm_with_visitor.cu) supports Epilogue Visitor Tree (EVT) now. -* NamedBarriers usability improvement and list of [ReservedNamedBarriers](./include/cutlass/arch/barrier.h) has been officially released. 
-* Improved [CuTe documentation](./media/docs/cpp/cute/) including improved clarity and depth of [Quickstart](./media/docs/cute/00_quickstart.md), [CuTe Layout](./media/docs/cpp/cute/01_layout.md), and [CuTe Layout Algebra](./media/docs/cpp/cute/02_layout_algebra.md). Associated code comments, post-conditions, and details in [CuTe Core Unit Tests](./test/unit/cute/core/) also improved. +* Expanded [Mixed-input Hopper GEMMs](https://github.com/NVIDIA/cutlass/tree/main/examples/55_hopper_mixed_dtype_gemm) support covering {16-bit, 8-bit} x {8-bit, 4-bit} input types with fast numerical converters and group scaling factors. +* Performance improvements to [Mixed-input Hopper GEMMs](https://github.com/NVIDIA/cutlass/tree/main/examples/55_hopper_mixed_dtype_gemm) +* Beta release of [Pointer-Array Batched GEMMs](https://github.com/NVIDIA/cutlass/tree/main/examples/56_hopper_ptr_array_batched_gemm) now available on Hopper GPUs utilizing TMA and WGMMA (requires CUDA 12.3 or above). +* Beta release of [Group-GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/57_hopper_grouped_gemm) utilizing TMA and WGMMA (requires CUDA 12.3 or above). +* [Ampere Sparse GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm_with_visitor.cu) supports Epilogue Visitor Tree (EVT) now. +* NamedBarriers usability improvement and list of [ReservedNamedBarriers](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/arch/barrier.h) has been officially released. +* Improved CuTe documentation including improved clarity and depth of [Quickstart](./media/docs/cpp/cute/00_quickstart.md), [CuTe Layout](./media/docs/cpp/cute/01_layout.md), and [CuTe Layout Algebra](./media/docs/cpp/cute/02_layout_algebra.md). Associated code comments, post-conditions, and details in [CuTe Core Unit Tests](./test/unit/cute/core/) also improved. 
## [3.3](https://github.com/NVIDIA/cutlass/releases/tag/v3.3.0) (2023-10-31) -* [Mixed-input Hopper GEMMs](./examples/55_hopper_mixed_dtype_gemm) support covering 16-bit x 8-bit input operand types. +* [Mixed-input Hopper GEMMs](https://github.com/NVIDIA/cutlass/tree/main/examples/55_hopper_mixed_dtype_gemm) support covering 16-bit x 8-bit input operand types. * [Mixed-input Ampere GEMMs](https://github.com/NVIDIA/cutlass/pull/1084) with support for canonical layouts (TN). The implementation supports upcast on operandB {fp16, bf16} x {s8, u8}, and upcast on operandA {s8, u8} x {fp16, bf16}. -* [Copy Async based Hopper GEMMs](./test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32_warpspecialized_cooperative.cu) - which support lower than 16B aligned input tensors. +* [Copy Async based Hopper GEMMs](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32_warpspecialized_cooperative.cu) - which support lower than 16B aligned input tensors. * Kernel schedules and Builder support for mixed precision and Copy Async GEMMs with < 16B aligned input tensors. * Profiler support for lower-aligned Hopper GEMMs. -* Performance Improvements to [Scatter-Gather Hopper Example](./examples/52_hopper_gather_scatter_fusion). +* Performance Improvements to [Scatter-Gather Hopper Example](https://github.com/NVIDIA/cutlass/tree/main/examples/52_hopper_gather_scatter_fusion). * Sub-Byte type fixes and improvements. -* EVT Support for RELU with Aux bitmap tensor store (used in dRELU). See [SM90 EVT fusions](./include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp) for details. +* EVT Support for RELU with Aux bitmap tensor store (used in dRELU). See [SM90 EVT fusions](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp) for details. * Fusion support for backprop fusions including drelu, dgelu, and dbias. 
* Support for void-C kernels and SM80 mixed-input GEMMs in the CUTLASS Python interface @@ -227,7 +259,7 @@ * SM80 EVT support in C++ and Python. * Other SM90 epilogue improvements. * Splitting CUTLASS library into smaller units based on operation, arch and datatypes. See [1105](https://github.com/NVIDIA/cutlass/discussions/1105) for details. -* Making `tools/library/scripts` packageable - `tools/library/scripts` is now moving to `python/cutlass_library`. See the Python [README](./python/README.md) for details. +* Making `tools/library/scripts` packageable - `tools/library/scripts` is now moving to `python/cutlass_library`. See the Python [README](https://github.com/NVIDIA/cutlass/tree/main/python/README.md) for details. * SM90 TF32 kernel improvements for all layouts. * SM90 rasterization direction support in the CUTLASS profiler. * Improvement for CUTLASS profiler build times. @@ -235,34 +267,34 @@ ## [3.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.2.0) (2023-08-03) -* New warp-specialized persistent FP8 GEMM kernel [kernel schedules](./include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp) and [mainloops](./include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp) targeting Hopper architecture that achieve great performance with TMA, WGMMA, and threadblock clusters. An example showcasing [Hopper warp-specialized FP8 GEMMs](./examples/54_hopper_fp8_warp_specialized_gemm). FP8 GEMMs come with a fast accumulation mode. When enabled, problem execution might be faster but at the cost of lower accuracy because intermediate results will not periodically be promoted to a higher precision. -* New [Epilogue Visitor Tree (EVT)](./examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu) support for Hopper TMA epilogues. EVTs allows for user-defined customized epilogue fusion patterns without having to write a new epilogue. 
-* [Stream-K](./include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp) feature for Hopper. Note that this is only a functional implementation of stream-K, and should not be used for performance comparison. Optimizations are expected in a future release. -* Improved CTA rasterization and support for CTA swizzling for Hopper kernels using the [Tile Scheduler](./include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp). -* Improved performance for [warp-specialized TensorFloat-32 (TF32) GEMM kernels](test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_tensor_op_f32_gmma_rs_cluster_warpspecialized.cu) targeting Hopper TMA. -* [Hopper GEMM+Permute](./examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu), an example of fusing tensor reordering (permutation) with GEMM mainloop or epilogue. -* New CUTLASS 2D Convolution Python interface. New [example](./examples/python/03_basic_conv2d.ipynb) here. +* New warp-specialized persistent FP8 GEMM kernel [kernel schedules](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp) and [mainloops](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp) targeting Hopper architecture that achieve great performance with TMA, WGMMA, and threadblock clusters. An example showcasing [Hopper warp-specialized FP8 GEMMs](https://github.com/NVIDIA/cutlass/tree/main/examples/54_hopper_fp8_warp_specialized_gemm). FP8 GEMMs come with a fast accumulation mode. When enabled, problem execution might be faster but at the cost of lower accuracy because intermediate results will not periodically be promoted to a higher precision. +* New [Epilogue Visitor Tree (EVT)](https://github.com/NVIDIA/cutlass/tree/main/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu) support for Hopper TMA epilogues. EVTs allow for user-defined customized epilogue fusion patterns without having to write a new epilogue. 
+* [Stream-K](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp) feature for Hopper. Note that this is only a functional implementation of stream-K, and should not be used for performance comparison. Optimizations are expected in a future release. +* Improved CTA rasterization and support for CTA swizzling for Hopper kernels using the [Tile Scheduler](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp). +* Improved performance for [warp-specialized TensorFloat-32 (TF32) GEMM kernels](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_tensor_op_f32_gmma_rs_cluster_warpspecialized.cu) targeting Hopper TMA. +* [Hopper GEMM+Permute](https://github.com/NVIDIA/cutlass/tree/main/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu), an example of fusing tensor reordering (permutation) with GEMM mainloop or epilogue. +* New CUTLASS 2D Convolution Python interface. New [example](https://github.com/NVIDIA/cutlass/tree/main/examples/python/03_basic_conv2d.ipynb) here. * Support for Windows (MSVC) builds. Tested with Visual Studio 2019 v16.11.27 on Windows 10.0. * Optimal performance using [**CUDA 12.2u1**](https://developer.nvidia.com/cuda-downloads) * Updates and bugfixes from the community (thanks!) ## [3.1.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.1.0) (2023-04-14) -* New CUTLASS Python interface that aims to provide an ease-of-use interface for instantiating, emitting, compiling, and running CUTLASS kernels via Python. More details [here](./python/README.md) and new [examples](./examples/python). -* New [efficient epilogues](test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative.cu#L783) using TMA for Hopper. 
-* Support for [fused epilogues](test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_bias_elementwise.cu), such Bias, ReLU and GELU, using the new efficient epilogues. -* New [warp-specialized TensorFloat-32 (TF32) GEMM kernels](test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_tensor_op_f32_gmma_rs_cluster_warpspecialized.cu) targeting Hopper TMA. -* New [*warp-specialized persistent cooperative*](./include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp) kernel design that allows for larger tile sizes and improves performance on Hopper. -* An [example](./examples/51_hopper_gett) showcasing GEMM-Like Tensor-Tensor Contraction (GETT) capability on Hopper. -* Epilogue builders. Similar to mainloop builders (see [example 49](./examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu)), epilogue builders aim to generate the best-possible epilogue while exposing incremental opt-ins for greater customization. +* New CUTLASS Python interface that aims to provide an ease-of-use interface for instantiating, emitting, compiling, and running CUTLASS kernels via Python. More details [here](https://github.com/NVIDIA/cutlass/tree/main/python/cpp/README.md) and new [examples](https://github.com/NVIDIA/cutlass/tree/main/examples/python). +* New [efficient epilogues](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative.cu#L783) using TMA for Hopper. +* Support for [fused epilogues](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_bias_elementwise.cu), such as Bias, ReLU and GELU, using the new efficient epilogues. +* New [warp-specialized TensorFloat-32 (TF32) GEMM kernels](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_tensor_op_f32_gmma_rs_cluster_warpspecialized.cu) targeting Hopper TMA. 
+* New [*warp-specialized persistent cooperative*](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp) kernel design that allows for larger tile sizes and improves performance on Hopper. +* An [example](https://github.com/NVIDIA/cutlass/tree/main/examples/51_hopper_gett) showcasing GEMM-Like Tensor-Tensor Contraction (GETT) capability on Hopper. +* Epilogue builders. Similar to mainloop builders (see [example 49](https://github.com/NVIDIA/cutlass/tree/main/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu)), epilogue builders aim to generate the best-possible epilogue while exposing incremental opt-ins for greater customization. * Profiler support for overriding kernel and epilogue builder auto schedules for 3.x API kernels, allowing specific policies to be run in the CUTLASS profiler. -* Performance optimizations for the [*warp-specialized persistent ping-pong*](./include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp) kernel. +* Performance optimizations for the [*warp-specialized persistent ping-pong*](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp) kernel. * Changes to the [GEMM API 3.x](./media/docs/cpp/gemm_api_3x.md), involving the host-facing arguments and the underlying `Params` structs. -* [FMHA Backward Pass](./examples/41_fused_multi_head_attention/fused_multi_head_attention_backward.cu) from Meta xFormers. -* [Streamk GEMM with Broadcast](./examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu) enables epilogue broadcast with StreamK GEMM. -* [Batched B2B GEMM](./examples/13_two_tensor_op_fusion) now can run multiple Back-to-Back GEMM with the same problem size in parallel. -* [Batched Strided GEMV](test/unit/gemm/device/gemv.cu) support both row major and column major input matrix. 
-* [Permute + GEMM fusion](./examples/39_gemm_permute) can fuse Permute with following GEMM now. Before, we only support fusing GEMM with Permute in the epilogue. -* [Row Broadcast](./include/cutlass/epilogue/threadblock/predicated_tile_iterator_row_broadcast.h) can be fused in the epilogue. +* [FMHA Backward Pass](https://github.com/NVIDIA/cutlass/tree/main/examples/41_fused_multi_head_attention/fused_multi_head_attention_backward.cu) from Meta xFormers. +* [Streamk GEMM with Broadcast](https://github.com/NVIDIA/cutlass/tree/main/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu) enables epilogue broadcast with StreamK GEMM. +* [Batched B2B GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/13_two_tensor_op_fusion) now can run multiple Back-to-Back GEMM with the same problem size in parallel. +* [Batched Strided GEMV](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemv.cu) supports both row-major and column-major input matrices. +* [Permute + GEMM fusion](https://github.com/NVIDIA/cutlass/tree/main/examples/39_gemm_permute) can fuse Permute with the following GEMM now. Before, we only supported fusing GEMM with Permute in the epilogue. +* [Row Broadcast](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/threadblock/predicated_tile_iterator_row_broadcast.h) can be fused in the epilogue. * The GitHub branch is renamed from `master` to `main` in this release. * Optimal performance using [**CUDA 12.1**](https://developer.nvidia.com/cuda-downloads) * Updates and bugfixes from the community (thanks!) @@ -272,28 +304,30 @@ * [A new conceptual operation hierarchy](./media/docs/cpp/cutlass_3x_design.md) that replaces the architecture-centric hierarchy of CUTLASS 2.x and [documentation for CUTLASS 3.0's GEMM API changes](./media/docs/cpp/gemm_api_3x.md). 
* Strict API backwards compatibility that exposes both 2.x and 3.x API kernels through the same [`device::GemmUniversalAdapter`](./include/cutlass/gemm/device/gemm_universal_adapter.h) and [`kernel::GemmUniversal`](./include/cutlass/gemm/kernel/gemm_universal.hpp) types, allowing users to include both APIs in the same translation units. More information can be found in the [3.x backwards compatibility section](./media/docs/cpp/cutlass_3x_backwards_compatibility.md). * Updates to [Functionality](./media/docs/cpp/functionality.md) which directs users on which kernels are supported via CUTLASS-2 and CUTLASS-3. -* Updates to [Compatibility](./README.md#compatibility) Section regarding supported compilers, operating systems, CUDA Toolkits, Hardware Architectures and [Target Architecture](./README.md#Target-Architecture). -* New warp-specialized GEMM [kernel schedules](./include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp) and [mainloops](./include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp) targeting Hopper architecture that achieve great performance with TMA, WGMMA, and threadblock clusters. +* Updates to [Compatibility](./README.md#compatibility) Section regarding supported compilers, operating systems, CUDA Toolkits, Hardware Architectures and [Target Architecture](./README.md#target-architecture). +* New warp-specialized GEMM [kernel schedules](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp) and [mainloops](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp) targeting Hopper architecture that achieve great performance with TMA, WGMMA, and threadblock clusters. * Extensions to CUTLASS profiler to support threadblock cluster shapes in library and profiler tile configurations. 
-* [CUTLASS library integration](./tools/library/src/gemm_operation_3x.hpp) for 3.x API kernels built through the new `CollectiveBuilder` API, enabling CUTLASS profiler. -* Support for [Hopper GEMMs](./examples/48_hopper_warp_specialized_gemm) through the new 3.0 API with CuTe-based exposure of the Hopper [Tensor Memory Accelerator](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor) and [WGMMA Tensor Core](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions) features. -* Set of examples that demonstrate the usage of the new 3.0 API to easily build GEMM kernels targeting Hopper: examples [48](./examples/48_hopper_warp_specialized_gemm), [49](./examples/49_hopper_gemm_schedules_with_collective_builder), and [50](./examples/50_hopper_gemm_with_epilogue_swizzle). +* [CUTLASS library integration](https://github.com/NVIDIA/cutlass/tree/main/tools/library/src/gemm_operation_3x.hpp) for 3.x API kernels built through the new `CollectiveBuilder` API, enabling CUTLASS profiler. +* Support for [Hopper GEMMs](https://github.com/NVIDIA/cutlass/tree/main/examples/48_hopper_warp_specialized_gemm) through the new 3.0 API with CuTe-based exposure of the Hopper [Tensor Memory Accelerator](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor) and [WGMMA Tensor Core](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions) features. 
+* Set of examples that demonstrate the usage of the new 3.0 API to easily build GEMM kernels targeting Hopper: examples [48](https://github.com/NVIDIA/cutlass/tree/main/examples/48_hopper_warp_specialized_gemm), [49](https://github.com/NVIDIA/cutlass/tree/main/examples/49_hopper_gemm_schedules_with_collective_builder), and [50](https://github.com/NVIDIA/cutlass/tree/main/examples/50_hopper_gemm_with_epilogue_swizzle). + +# CUTLASS 2.x ## [2.11.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.11.0) (2022-11-19) -* [Stream-K](./examples/47_ampere_gemm_universal_streamk), which is a new general way to do split-K. It can not only improve performance, but can also significantly reduce the number of tile sizes that need to be profiled to find the best one. -* [Fused multi-head attention Kernel](./examples/41_fused_multi_head_attention). It has two variants: one uses batched GEMM for the fixed sequence length, and the other one uses group GEMM for the variable sequence length. Both versions just need one kernel. -* [Dual GEMM](./examples/45_dual_gemm), which can fuse A x B and A x C into one kernel. Two GEMMs has no producer-consumer dependency. -* Hopper improves [double precision matrix multiplication](./test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm90.cu) by 2x compared to Ampere at iso-clocks. It is supported since CUDA 11.8. -* [BLAS3](./test/unit/gemm/device/hemm_cf64_cf64_cf64_tensor_op_f64_sm90.cu) functions with Hoppers new double precision matrix multiplication instructions. -* [ELL Block Sparse GEMM](./examples/43_ell_block_sparse_gemm), which uses an [ELL matrix](https://developer.nvidia.com/blog/accelerating-matrix-multiplication-with-block-sparse-format-and-nvidia-tensor-cores/) to describe the sparsity of A matrix. B and output matrices are still dense. The block size can be arbitary. 
-* Optimized [Group Conv](./examples/42_ampere_tensorop_group_conv) for SingleGroup mode, which requires that the output channel per group is a multiple of Threadblock tile N. -* [Optimized DepthWise Conv](./examples/46_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu). Two new modes are added - * [kOptimized](./test/unit/conv/device/depthwise_conv2d_fprop_direct_conv_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu) - use direct conv to compute instead of implicit GEMM. +* [Stream-K](https://github.com/NVIDIA/cutlass/tree/main/examples/47_ampere_gemm_universal_streamk), which is a new general way to do split-K. It can not only improve performance, but can also significantly reduce the number of tile sizes that need to be profiled to find the best one. +* [Fused multi-head attention Kernel](https://github.com/NVIDIA/cutlass/tree/main/examples/41_fused_multi_head_attention). It has two variants: one uses batched GEMM for the fixed sequence length, and the other one uses group GEMM for the variable sequence length. Both versions just need one kernel. +* [Dual GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/45_dual_gemm), which can fuse A x B and A x C into one kernel. The two GEMMs have no producer-consumer dependency. +* Hopper improves [double precision matrix multiplication](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm90.cu) by 2x compared to Ampere at iso-clocks. It is supported since CUDA 11.8. +* [BLAS3](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/hemm_cf64_cf64_cf64_tensor_op_f64_sm90.cu) functions with Hopper's new double precision matrix multiplication instructions. +* [ELL Block Sparse GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/43_ell_block_sparse_gemm), which uses an [ELL matrix](https://developer.nvidia.com/blog/accelerating-matrix-multiplication-with-block-sparse-format-and-nvidia-tensor-cores/) to describe the sparsity of A matrix. 
B and output matrices are still dense. The block size can be arbitrary. +* Optimized [Group Conv](https://github.com/NVIDIA/cutlass/tree/main/examples/42_ampere_tensorop_group_conv) for SingleGroup mode, which requires that the output channel per group is a multiple of Threadblock tile N. +* [Optimized DepthWise Conv](https://github.com/NVIDIA/cutlass/tree/main/examples/46_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu). Two new modes are added + * [kOptimized](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/depthwise_conv2d_fprop_direct_conv_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu) - use direct conv to compute instead of implicit GEMM. * The restrictions are: 1) input ,output channel and group number should be multiple of (128 / sizeof(input element)). 2) The input filter size should be the same as the template parameter configuration. - * [kFixedStrideDilation](./test/unit/conv/device/depthwise_conv2d_fprop_direct_conv_fixed_stride_dilation_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu) - which puts stride and dilation into templates to further improve the performance. In this mode, kernel persistents some inputs into register to squeeze more performance, so large filter/stride/dilation is not recommanded. + * [kFixedStrideDilation](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/depthwise_conv2d_fprop_direct_conv_fixed_stride_dilation_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu) - which puts stride and dilation into templates to further improve the performance. In this mode, the kernel persists some inputs in registers to squeeze more performance, so large filter/stride/dilation is not recommended. * The restrictions are: 1) input, output channel and group number should be multiple of (128 / sizeof(input element)). 2) input filter size, stride, dilation should same as the template parameter configuration. -* [Scripts](./examples/44_multi_gemm_ir_and_codegen) to fuse multiple back-to-back GEMM. 
Its implementation was discussed in a GTC'22 Spring [talk](https://www.nvidia.com/en-us/on-demand/session/gtcspring22-s41606/). -* [FP8 data type definition](./include/cutlass/float8.h) and [conversion routines](./include/cutlass/numeric_conversion.h#L1274-2115). +* [Scripts](https://github.com/NVIDIA/cutlass/tree/main/examples/44_multi_gemm_ir_and_codegen) to fuse multiple back-to-back GEMM. Its implementation was discussed in a GTC'22 Spring [talk](https://www.nvidia.com/en-us/on-demand/session/gtcspring22-s41606/). +* [FP8 data type definition](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/float8.h) and [conversion routines](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/numeric_conversion.h#L1274-2115). * Updates and bugfixes from the community (thanks!). Big shout out to Meta's [xFormers](https://github.com/facebookresearch/xformers). * **Deprecation announcement:** CUTLASS plans to deprecate the following: @@ -302,54 +336,54 @@ * CUDA 10.2 ## [2.10.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.10.0) (2022-08-23) -* [CUTLASS Python](./examples/40_cutlass_py) now supports GEMM, CONV, Group GEMM for different data types as well as different epilogue flavours. -* Optimizations for CUTLASS's [Grouped GEMM](./examples/24_gemm_grouped/gemm_grouped.cu) kernel. Threadblock scheduling part is improved. Some computation can be moved to the host side if applicable. [Grouped Syr2k](./examples/38_syr2k_grouped/syr2k_grouped.cu) kernels are added, too. -* Optimizations for [GEMM+Softmax](./examples/35_gemm_softmax). All the reduction computation is fused into the previous GEMM. More template arguments are provided to fine tune the performance. -* [Grouped GEMM for Multihead Attention](./examples/41_multi_head_attention). This general group gemm based MHA does not require the sequence length of all GEMMs to be the same which makes it most useful for natural language processing. 
-* [GEMM + Layer norm fusion for Ampere](./examples/37_gemm_layernorm_gemm_fusion/) splits the layernorm into two parts and both of them can be fused into the GEMMs before and after separately. In addition to use square sum to compute variance of layernorm, [Shift-K](https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data) is provided if square sum raise numerical issues. -* [GEMM Epilogue Permutation Fusion](./examples/39_gemm_permute) can apply user provided permutation layout mapping in the GEMM epilogue. -* [Grouped convolution targeting implicit GEMM](test/unit/conv/device/group_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) introduces the first group convolution implementation to CUTLASS. It is an Analytical implementation, not an Optimized. The restrictions are: 1) input and output channel number should be multiple of group number. 2) split-K is not supported. The implementation has 2 modes: +* [CUTLASS Python](https://github.com/NVIDIA/cutlass/tree/main/examples/40_cutlass_py) now supports GEMM, CONV, Group GEMM for different data types as well as different epilogue flavours. +* Optimizations for CUTLASS's [Grouped GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/24_gemm_grouped/gemm_grouped.cu) kernel. Threadblock scheduling part is improved. Some computation can be moved to the host side if applicable. [Grouped Syr2k](https://github.com/NVIDIA/cutlass/tree/main/examples/38_syr2k_grouped/syr2k_grouped.cu) kernels are added, too. +* Optimizations for [GEMM+Softmax](https://github.com/NVIDIA/cutlass/tree/main/examples/35_gemm_softmax). All the reduction computation is fused into the previous GEMM. More template arguments are provided to fine tune the performance. +* [Grouped GEMM for Multihead Attention](https://github.com/NVIDIA/cutlass/tree/main/examples/41_multi_head_attention). 
This general group gemm based MHA does not require the sequence length of all GEMMs to be the same which makes it most useful for natural language processing. +* [GEMM + Layer norm fusion for Ampere](https://github.com/NVIDIA/cutlass/tree/main/examples/37_gemm_layernorm_gemm_fusion/) splits the layernorm into two parts and both of them can be fused into the GEMMs before and after separately. In addition to use square sum to compute variance of layernorm, [Shift-K](https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data) is provided if square sum raise numerical issues. +* [GEMM Epilogue Permutation Fusion](https://github.com/NVIDIA/cutlass/tree/main/examples/39_gemm_permute) can apply user provided permutation layout mapping in the GEMM epilogue. +* [Grouped convolution targeting implicit GEMM](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/group_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) introduces the first group convolution implementation to CUTLASS. It is an Analytical implementation, not an Optimized. The restrictions are: 1) input and output channel number should be multiple of group number. 2) split-K is not supported. The implementation has 2 modes: * kSingleGroup: output channel per group is multiple of Threadblock tile N. * kMultipleGroup: Threadblock tile N is multiple of output channel per group. -* [Depthwise separable convolution](test/unit/conv/device/depthwise_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu) introduces the first depthwise convolution which is also Analytical for now. The restrictions are: 1) SIMT only 2) No split-K 3) input channel equals to output channel equals to group number. -* Standalone [Layernorm](./tools/util/include/cutlass/util/device_layernorm.h) and [Pooling](./tools/util/include/cutlass/util/device_nhwc_pooling.h) kernels. 
-* [Back-to-back GEMM/CONV](./examples/13_two_tensor_op_fusion) relaxes the requirement that the first GEMM K dimension needs to be the multiple of Threadblock Tile K dimension. +* [Depthwise separable convolution](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/depthwise_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu) introduces the first depthwise convolution which is also Analytical for now. The restrictions are: 1) SIMT only 2) No split-K 3) input channel equals to output channel equals to group number. +* Standalone [Layernorm](https://github.com/NVIDIA/cutlass/tree/main/tools/util/include/cutlass/util/device_layernorm.h) and [Pooling](https://github.com/NVIDIA/cutlass/tree/main/tools/util/include/cutlass/util/device_nhwc_pooling.h) kernels. +* [Back-to-back GEMM/CONV](https://github.com/NVIDIA/cutlass/tree/main/examples/13_two_tensor_op_fusion) relaxes the requirement that the first GEMM K dimension needs to be the multiple of Threadblock Tile K dimension. * Optimal performance using [**CUDA 11.6u2**](https://developer.nvidia.com/cuda-downloads) * Updates and bugfixes from the community (thanks!) 
## [2.9.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.9.0) (2022-04-21) -* [First layer Convolution kernels](./test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) specialized for small channel counts and reduced alignment - * [Few channels](./include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h) specialization for reduced alignment capabilities - * [Fixed channels](./include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h) further specialized when channel count perfectly matches the access vector size - * [Unit tests](./test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) - * [Python-based instance emitter](./python/cutlass_library/generator.py) in the CUTLASS Library and support in the Profiler +* [First layer Convolution kernels](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) specialized for small channel counts and reduced alignment + * [Few channels](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h) specialization for reduced alignment capabilities + * [Fixed channels](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h) further specialized when channel count perfectly matches the access vector size + * [Unit tests](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) + * [Python-based instance emitter](https://github.com/NVIDIA/cutlass/tree/main/python/cutlass_library/generator.py) in the CUTLASS Library and support in the Profiler * [BLAS3](https://docs.nvidia.com/cuda/cublas/index.html#cublas-level-3-function-reference) operators 
accelerated by Tensor Cores * Supported types: f32, cf32, f64, cf64, tf32x3, complex tf32x3 - * [HERK](./test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu) with [emitter](./python/cutlass_library/rank_k_operation.py) - * [SYRK](./test/unit/gemm/device/syrk_f32n_f32t_tensor_op_fast_f32_sm80.cu) with [emitter](./python/cutlass_library/rank_k_operation.py) - * [SYMM](./test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu) with [emitter](./python/cutlass_library/symm_operation.py) - * [TRMM](./test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu) with [emitter](./python/cutlass_library/trmm_operation.py) - * [Unit tests](./test/unit/gemm/device/testbed_rank_k_universal.h) -* [CUTLASS Python](./examples/40_cutlass_py) demonstrating JIT compilation of CUTLASS kernels and a Python-based runtime using [CUDA Python](https://developer.nvidia.com/cuda-python) - * [Python-based runtime](./tools/library/scripts/rt.py) interoperable with existing emitters -* [GEMM + Softmax example](./examples/35_gemm_softmax) -* [Gather and Scatter Fusion with GEMM](./examples/36_gather_scatter_fusion) can gather inputs and scatters outputs based on indices vectors in the same GEMM kernel. 
+ * [HERK](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu) with [emitter](https://github.com/NVIDIA/cutlass/tree/main/python/cutlass_library/rank_k_operation.py) + * [SYRK](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/syrk_f32n_f32t_tensor_op_fast_f32_sm80.cu) with [emitter](https://github.com/NVIDIA/cutlass/tree/main/python/cutlass_library/rank_k_operation.py) + * [SYMM](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu) with [emitter](https://github.com/NVIDIA/cutlass/tree/main/python/cutlass_library/symm_operation.py) + * [TRMM](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu) with [emitter](https://github.com/NVIDIA/cutlass/tree/main/python/cutlass_library/trmm_operation.py) + * [Unit tests](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/testbed_rank_k_universal.h) +* [CUTLASS Python](https://github.com/NVIDIA/cutlass/tree/main/examples/40_cutlass_py) demonstrating JIT compilation of CUTLASS kernels and a Python-based runtime using [CUDA Python](https://developer.nvidia.com/cuda-python) + * [Python-based runtime](https://github.com/NVIDIA/cutlass/tree/main/tools/library/scripts/rt.py) interoperable with existing emitters +* [GEMM + Softmax example](https://github.com/NVIDIA/cutlass/tree/main/examples/35_gemm_softmax) +* [Gather and Scatter Fusion with GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/36_gather_scatter_fusion) can gather inputs and scatters outputs based on indices vectors in the same GEMM kernel. * It can select random rows in a row major matrix. * It can select random columns in a column major matrix. -* [Back-to-back GEMM/CONV](./examples/13_two_tensor_op_fusion) fully supports buffering the first GEMM/CONV results in the shared memory for the latter one to use. 
It can eliminate register spill when the tile size is big. Additionally, bias vector add is supported in the first GEMM/CONV. +* [Back-to-back GEMM/CONV](https://github.com/NVIDIA/cutlass/tree/main/examples/13_two_tensor_op_fusion) fully supports buffering the first GEMM/CONV results in the shared memory for the latter one to use. It can eliminate register spill when the tile size is big. Additionally, bias vector add is supported in the first GEMM/CONV. * Supported kernels: GEMM and CONV. * Supported types: fp16 and int8. * Supported architectures: Turing and Ampere. -* [Transposed Convolution](./examples/34_transposed_conv2d) (a.k.a Deconvolution) support which reuses Dgrad implementation. -* [Utility functions](./tools/util/include/cutlass/util) that can pad NHWC and convert between NCHW and NHWC. +* [Transposed Convolution](https://github.com/NVIDIA/cutlass/tree/main/examples/34_transposed_conv2d) (a.k.a Deconvolution) support which reuses Dgrad implementation. +* [Utility functions](https://github.com/NVIDIA/cutlass/tree/main/tools/util/include/cutlass/util) that can pad NHWC and convert between NCHW and NHWC. * [Small alignment implicit gemm](https://github.com/NVIDIA/cutlass/issues/242) support for Fprop/Dgrad/Wgrad so that padding is no longer mandated to use tensor cores in these kernels. * Epilogue enhancement: * Eliminate bank conflicts in int8 tensor core kernels. * Half2 usage if epilogue compute type is fp16. * More activation functions: Silu, Hardswish, Leaky Relu. - * New elementwise fusion pattern for [residual block](./include/cutlass/epilogue/thread/linear_combination_residual_block.h). -* [Group GEMM](./examples/24_gemm_grouped) thread block number calculation fix which helps to launch the intended number of threadblocks to fully occupy the GPUs. + * New elementwise fusion pattern for [residual block](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/thread/linear_combination_residual_block.h). 
+* [Group GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/24_gemm_grouped) thread block number calculation fix which helps to launch the intended number of threadblocks to fully occupy the GPUs. * [Parallel GEMM splitk](https://github.com/NVIDIA/cutlass/pull/277) support in the CUTLASS profiler. * Optimal performance using [**CUDA 11.6u2**](https://developer.nvidia.com/cuda-downloads) * Updates and bugfixes from the community (thanks!) @@ -359,17 +393,17 @@ * **TF32x3:** emulated single-precision using Tensor Cores * 45+ TFLOPs on NVIDIA A100 - * [GEMM SDK example](./examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu) (real) - * [COMPLEX GEMM SDK example](./examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_3xtf32_complex_gemm.cu) (complex) - * [Implicit GEMM Convolution SDK example](./examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu) + * [GEMM SDK example](https://github.com/NVIDIA/cutlass/tree/main/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu) (real) + * [COMPLEX GEMM SDK example](https://github.com/NVIDIA/cutlass/tree/main/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_3xtf32_complex_gemm.cu) (complex) + * [Implicit GEMM Convolution SDK example](https://github.com/NVIDIA/cutlass/tree/main/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu) * **Mainloop fusion for Convolution:** convolution with fused per-channel scale-bias-relu - * [Conv Fprop SDK example](./examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu) - * [Conv WGrad SDK example](./examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu) - * [cutlass::conv::device::ImplicitGemmConvolutionFusion](./include/cutlass/conv/device/implicit_gemm_convolution_fusion.h) + * [Conv Fprop SDK 
example](https://github.com/NVIDIA/cutlass/tree/main/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu) + * [Conv WGrad SDK example](https://github.com/NVIDIA/cutlass/tree/main/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu) + * [cutlass::conv::device::ImplicitGemmConvolutionFusion](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h) * **Grouped GEMM:** similar to batched GEMM with distinct problem size per group - * [SDK example](./examples/24_gemm_grouped) with performance comparison with Batched Strided GEMM - * [cutlass::gemm::device::GemmGrouped](./include/cutlass/gemm/device/gemm_grouped.h) -* [Implicit GEMM Convolution fusion](./examples/13_two_tensor_op_fusion/) supports staging 1st convolution's output accumulator in the shared memory on Turing. This allows more flexible warp tile sizes and less regsiter pressue. + * [SDK example](https://github.com/NVIDIA/cutlass/tree/main/examples/24_gemm_grouped) with performance comparison with Batched Strided GEMM + * [cutlass::gemm::device::GemmGrouped](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/device/gemm_grouped.h) +* [Implicit GEMM Convolution fusion](https://github.com/NVIDIA/cutlass/tree/main/examples/13_two_tensor_op_fusion/) supports staging 1st convolution's output accumulator in the shared memory on Turing. This allows more flexible warp tile sizes and less register pressure. * Optimal performance using [**CUDA 11.5**](https://developer.nvidia.com/cuda-downloads) * Updates from the community (thanks!) 
@@ -379,13 +413,13 @@ * CUDA 10.2 ## [2.7.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.7.0) (2021-09-24) - * Mainloop fusion for GEMM: [summation over A or B](./examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu) - * [Strided DGRAD (optimized iterators)](./include/cutlass/conv/kernel/default_conv2d_dgrad.h) - * [Half-precision GELU_taylor activation functions](./include/cutlass/epilogue/thread/activation.h#L196) + * Mainloop fusion for GEMM: [summation over A or B](https://github.com/NVIDIA/cutlass/tree/main/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu) + * [Strided DGRAD (optimized iterators)](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/conv/kernel/default_conv2d_dgrad.h) + * [Half-precision GELU_taylor activation functions](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/thread/activation.h#L196) * Use these when accumulation and epilogue compute types are all `cutlass::half_t` - * Tuning and bug fixes to [fused GEMM + GEMM example](./examples/13_two_tensor_op_fusion/) - * Support for smaller than 128b aligned Convolutions: [see examples](test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu#L272) - * Caching of results to accelerate Convolution [unit tests](test/unit/conv/device/cache_testbed_output.h) + * Tuning and bug fixes to [fused GEMM + GEMM example](https://github.com/NVIDIA/cutlass/tree/main/examples/13_two_tensor_op_fusion/) + * Support for smaller than 128b aligned Convolutions: [see examples](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu#L272) + * Caching of results to accelerate Convolution [unit tests](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/cache_testbed_output.h) * Can be enabled or disabled by running `cmake .. 
-DCUTLASS_TEST_ENABLE_CACHED_RESULTS=OFF` * Corrections and bug fixes reported by the CUTLASS community * Thank you for filing these issues! @@ -398,24 +432,24 @@ ## [2.6.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.6.0) (2021-07-22) * Optimal performance when compiled with the [CUDA 11.4 Toolkit](https://developer.nvidia.com/cuda-toolkit) - * Adopt the new L2 prefetch feature in [cp.async](./include/cutlass/arch/memory.h) and [global load](./include/cutlass/arch/memory_sm80.h) + * Adopt the new L2 prefetch feature in [cp.async](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/arch/memory.h) and [global load](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/arch/memory_sm80.h) * Fused operators with GEMM and Convolution - * [Fused broadcast in epilogue](test/unit/gemm/device/gemm_with_broadcast_f16n_f16n_f16n_tensorop_f32_sm75.cu) - * [Fused partial reduction in epilogue](./test/unit/gemm/device/gemm_with_reduction_f16n_f16n_f16n_tensorop_f32_sm75.cu) + * [Fused broadcast in epilogue](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_with_broadcast_f16n_f16n_f16n_tensorop_f32_sm75.cu) + * [Fused partial reduction in epilogue](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_with_reduction_f16n_f16n_f16n_tensorop_f32_sm75.cu) * 64b tensor strides and leading dimensions support for GEMMs * Affine rank=2 matrix layouts - * Row stride and column stride for matrices using [cutlass::layout::AffineRank2](./include/cutlass/layout/matrix.h) - * Support [FP64 tensor core](./examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu) and SIMT GEMM. 
- * [Batched GEMV](./test/unit/gemm/device/gemv.cu) preview implementation - * [New strided Dgrad](test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu) implementation + * Row stride and column stride for matrices using [cutlass::layout::AffineRank2](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/layout/matrix.h) + * Support [FP64 tensor core](https://github.com/NVIDIA/cutlass/tree/main/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu) and SIMT GEMM. + * [Batched GEMV](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemv.cu) preview implementation + * [New strided Dgrad](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu) implementation * Accelerates over previous implementation by cutting down redundant math by 4x * Support using new `Dy` and `w` analytic iterators and existing `cutlass::conv::device::ImplicitGemmConvolution` interface * Quaternion-valued GEMM and Convolution in single- and double-precision (targeting CUDA Cores) - * Updates to [quaternion.h](./include/cutlass/quaternion.h) and [functional.h](./include/cutlass/functional.h) - * SDK Example for [GEMM](./examples/21_quaternion_gemm/quaternion_gemm.cu) and [Convolution](./examples/22_quaternion_conv/quaternion_conv.cu) - * [Unit tests for GEMM](./test/unit/gemm/device/simt_qgemm_nn_sm50.cu) and [Convolution](./test/unit/conv/device/conv2d_fprop_implicit_gemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32_sm50.cu) + * Updates to [quaternion.h](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/quaternion.h) and [functional.h](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/functional.h) + * SDK Example for [GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/21_quaternion_gemm/quaternion_gemm.cu) and 
[Convolution](https://github.com/NVIDIA/cutlass/tree/main/examples/22_quaternion_conv/quaternion_conv.cu) + * [Unit tests for GEMM](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/simt_qgemm_nn_sm50.cu) and [Convolution](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/conv2d_fprop_implicit_gemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32_sm50.cu) * Many improvements to the epilogue. - * Provide an [option](./include/cutlass/epilogue/threadblock/epilogue.h) to not fully unroll the epilogue to reduce the code size and improve the performance when using complicated elementwise operations + * Provide an [option](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/threadblock/epilogue.h) to not fully unroll the epilogue to reduce the code size and improve the performance when using complicated elementwise operations * Performance improvement for FP16 tensor core kernels * Bug fixes * Enhanced Clang support and the combination of Clang 13 and CUDA 11.4 can build and run kernels from Pascal and Ampere. 
@@ -427,14 +461,14 @@ ## [2.5.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.5.0) (2021-02-26) * Tensor reductions * _m_-to-_n_ reductions of tensors with affine layout - * [Specializations](./test/unit/reduction/device/tensor_reduce_contiguous.cu) for reductions including contiguous dimension - * [Specializations](./test/unit/reduction/device/tensor_reduce_strided.cu) for reductions excluding contiguous dimension + * [Specializations](https://github.com/NVIDIA/cutlass/tree/main/test/unit/reduction/device/tensor_reduce_contiguous.cu) for reductions including contiguous dimension + * [Specializations](https://github.com/NVIDIA/cutlass/tree/main/test/unit/reduction/device/tensor_reduce_strided.cu) for reductions excluding contiguous dimension * Custom reduction functors such as `cutlass::logical_and` * Large tensor support, up to 2^63 elements (however, each dimension is limited to an extent of 2^31) * Optimizations for 3-D convolution - * [Optimized tile iterators](./include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h) using precomputed delta table for 3-D convolution - * Full coverage of [forward](test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu) and [backwards](test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu) passes for 3D convolution - * [Fused Convolution+Convolution example](./examples/13_two_tensor_op_fusion/README.md) + * [Optimized tile iterators](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h) using precomputed delta table for 3-D convolution + * Full coverage of [forward](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu) and 
[backwards](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu) passes for 3D convolution + * [Fused Convolution+Convolution example](https://github.com/NVIDIA/cutlass/tree/main/examples/13_two_tensor_op_fusion/README.md) * Corrections and bug fixes reported by the CUTLASS community * Thank you for filing these issues! @@ -453,16 +487,16 @@ ## [2.3.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.3.0) (2020-09-23) * [NVIDIA Ampere Architecture features](https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/) - * [Sparse Tensor Core GEMM kernels](test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu): + * [Sparse Tensor Core GEMM kernels](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu): * Direct access to Sparse Tensor Cores and maximum performance via [`mma.sp.sync`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma-and-friends) * Fast SGEMM targeting GeForce RTX 30-series CUDA Cores * Minor Features: - * [Activation functions](./include/cutlass/epilogue/thread/activation.h) such as [GeLU](./include/cutlass/epilogue/thread/linear_combination_gelu.h) and [Sigmoid](./include/cutlass/epilogue/thread/linear_combination_sigmoid.h) - * Small [matrix](./include/cutlass/matrix.h) and [quaternion](./include/cutlass/quaternion.h) template classes in device code - * [Floating-point constants](./include/cutlass/constants.h) + * [Activation functions](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/thread/activation.h) such as [GeLU](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/thread/linear_combination_gelu.h) and [Sigmoid](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/thread/linear_combination_sigmoid.h) + * Small 
[matrix](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/matrix.h) and [quaternion](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/quaternion.h) template classes in device code + * [Floating-point constants](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/constants.h) * NVIDIA Ampere GPU Architecture examples and documentation: - * [Tensor Float 32](./examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu) and - * [Sparse Tensor Cores](./examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu) + * [Tensor Float 32](https://github.com/NVIDIA/cutlass/tree/main/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu) and + * [Sparse Tensor Cores](https://github.com/NVIDIA/cutlass/tree/main/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu) * Documentation added on CUTLASS [efficient row-major epilogue](./media/docs/cpp/gemm_api.md#efficient-epilogue) ## [2.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.2.0) (2020-06-08) @@ -487,7 +521,7 @@ * API to launch compiled kernel instances for GEMM and planar complex GEMM * Planar Complex GEMM kernels targeting Volta and Turing Tensor Cores * Computes complex matrix products on matrices stored as disjoint real and imaginary parts - * [SDK Examples of Planar Complex GEMMs](./examples/10_planar_complex/planar_complex.cu) + * [SDK Examples of Planar Complex GEMMs](https://github.com/NVIDIA/cutlass/tree/main/examples/10_planar_complex/planar_complex.cu) * Minor enhancements and bug fixes ## [2.0.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.0.0) (2019-11-19) diff --git a/CMakeLists.txt b/CMakeLists.txt index b54b8335..f141fd40 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -178,6 +178,10 @@ if (CUDA_VERSION VERSION_GREATER_EQUAL 12.8) list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 100 100a 101 101a 120 120a) endif() +if (CUDA_VERSION VERSION_GREATER_EQUAL 12.9) + list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 100f 101f 120f) 
+endif() + set(CUTLASS_NVCC_ARCHS ${CUTLASS_NVCC_ARCHS_SUPPORTED} CACHE STRING "The SM architectures requested.") set(CUTLASS_NVCC_ARCHS_ENABLED ${CUTLASS_NVCC_ARCHS} CACHE STRING "The SM architectures to build code for.") @@ -676,25 +680,6 @@ if (NOT CUTLASS_NAMESPACE STREQUAL "cutlass") target_compile_definitions(CUTLASS INTERFACE CUTLASS_NAMESPACE=${CUTLASS_NAMESPACE}) endif() -if (NOT DEFINED CUTLASS_REVISION) - - find_package(Git QUIET) - - execute_process( - COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD - RESULT_VARIABLE CUTLASS_REVISION_RESULT - OUTPUT_VARIABLE CUTLASS_REVISION - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - - if (CUTLASS_REVISION_RESULT) - message(STATUS "CUTLASS Revision: Unable to detect, Git returned code ${CUTLASS_REVISION_RESULT}.") - else() - message(STATUS "CUTLASS Revision: ${CUTLASS_REVISION}") - endif() - -endif() - configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/version_extended.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/cutlass/version_extended.h diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 46506007..f6ef0f50 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -2,7 +2,7 @@ [README](./README.md#documentation) > **Contributors** -# CUTLASS Developers ** +# CUTLASS C++ Developers ** Andrew Kerr
Paul Springer
@@ -70,8 +70,49 @@ Shreya Gaur
** _The list is sorted in order of the author's first contribution to the CUTLASS project._ +# CUTLASS DSL Developers *** -# CUTE Developers +Albert Di
+Albert Xu
+Anakin Zheng
+Arvin Jou
+Brandon Sun
+Chenyang Xu
+Chunyu Wang
+Cris Cecka
+dePaul Miller
+Edward Cao
+Fung Xie
+Guray Ozen
+Hao Hu
+Hong Wang
+Jeremy Furtek
+Jie Fang
+JingZe Cui
+Kihiro Bando
+Linfeng Zheng
+Longsheng Du
+Mina Sun
+Mindy Li
+Pradeep Ramani
+Questa Wang
+Serif Yesil
+Tao Xie
+Tina Li
+Vicki Wang
+Vincent Zhang
+Vijay Thakkar
+Xiao Dong
+Xiaolei Shi
+Xinyu Wang
+Yihan Chen
+Yuhan Li
+Zekun Fan
+ +*** _Sorted in alphabetical order._ + + +# CuTe Developers Cris Cecka
Vijay Thakkar
@@ -100,6 +141,9 @@ David Tanner
Tri Dao
Jay Shah
+Mehdi Amini
+Larry Wu
+Justin Holewinski
Timothy Costa
Julien Demouth
Brian Fahs
@@ -108,14 +152,11 @@ Michael Goldfarb
Mostafa Hagog
Fei Hu
Alan Kaatz
-Tina Li
Wei Liu
Tim Martin
Kevin Siu
Markus Tavenrath
John Tran
-Vicki Wang
-Fung Xie
Yang Xu
Scott Yokim
Girish Bharambe
diff --git a/EULA.txt b/EULA.txt new file mode 100644 index 00000000..e7699599 --- /dev/null +++ b/EULA.txt @@ -0,0 +1,188 @@ +NVIDIA Software License Agreement + +IMPORTANT NOTICE – PLEASE READ AND AGREE BEFORE USING THE SOFTWARE +This software license agreement (“Agreement”) is a legal agreement between you, whether an individual or entity, (“you”) and NVIDIA Corporation (“NVIDIA”) and governs the use of the NVIDIA CUTLASS DSLs software and materials that NVIDIA delivers to you under this Agreement (“Software”). +NVIDIA and you are each a “party” and collectively the “parties.” +This Agreement can be accepted only by an adult of legal age of majority in the country in which the Software is used. +If you don’t have the required age or authority to accept this Agreement, or if you don’t accept all the terms and conditions of this Agreement, do not use the Software. + +1. License Grants + + 1.1. License Grant to You. The Software made available by NVIDIA to you is licensed, not sold. + Subject to the terms of this Agreement, NVIDIA grants you a limited, non-exclusive, revocable, non-transferable, and non-sublicensable (except as expressly granted in this Agreement), license to: + + a. install and use copies of the Software, + b. configure the Software using configuration files provided (if applicable), + c. modify and create derivative works of any sample or example source code NVIDIA delivers to you as part of the Software (“Derivatives”) (if applicable), and + d. distribute python files in the Software package in source format as incorporated into a software application subject to the following distribution requirements: + + i. Your application must have material additional functionality, beyond the included portions of the Software. + ii. The distributable portions of the Software shall only be accessed by your application. + iii. 
The following notice shall be included in modifications and derivative works of sample source code distributed: “This software contains source code provided by NVIDIA Corporation.” + iv. Unless a developer tool is identified in this Agreement as distributable, it is delivered for your internal use only. + v. The terms under which you distribute your application must be consistent with the terms of this Agreement, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. + vi. Additionally, you agree that you will protect the privacy, security and legal rights of your application users. + + The foregoing (a) through (d) are, collectively, the “Purpose”, and the developed applications are only for use in systems with NVIDIA GPUs. + + 1.2. License Grant to NVIDIA. Subject to the terms of this Agreement, you grant NVIDIA and its affiliates a non-exclusive, perpetual, irrevocable, sublicensable, worldwide, royalty-free, fully paid-up and transferable license, under your intellectual property rights, to publicly perform, publicly display, reproduce, use, make, have made, sell, offer for sale, distribute (through multiple tiers of distribution), import, create derivative works of and otherwise commercialize and exploit at NVIDIA’s discretion any Derivatives created by or for you. + You may, but are not required to, deliver any Derivatives to NVIDIA. + +2. License Restrictions + + Your license to use the Software and Derivatives is restricted as stated in this Section 2 (“License Restrictions”). + You will cooperate with NVIDIA and, upon NVIDIA’s written request, you will confirm in writing and provide reasonably requested information to verify your compliance with the terms of this Agreement. + You may not: + + 2.1. Use the Software or Derivatives for any purpose other than the Purpose; + + 2.2. 
Sell, rent, sublicense, transfer, distribute or otherwise make available to others (except authorized users as stated in Section 3 (“Authorized Users”)) any portion of the Software or Derivatives, except as expressly granted in Section 1.1 (“License Grant to You”); + + 2.3. Reverse engineer, decompile, or disassemble the Software components provided in binary form, nor attempt in any other manner to obtain source code of such Software; + + 2.4. Modify or create derivative works of the Software, except as expressly granted in Section 1.1 (“License Grant to You”); + + 2.5. Change or remove copyright or other proprietary notices in the Software; + + 2.6. Bypass, disable, or circumvent any technical limitation, encryption, security, digital rights management or authentication mechanism in the Software; + + 2.7. Use the Software or Derivatives in any manner that would cause them to become subject to an open source software license, subject to the terms in Section 6 (“Components Under Other Licenses”); + + 2.8. Use the Software or Derivatives in violation of any applicable law or regulation in relevant jurisdictions + + 2.9. Indicate that a product or service developed with the Software or Derivatives is sponsored or endorsed by NVIDIA; + + 2.10. Replace any NVIDIA software components in the Software that are governed by this Agreement with other software that implements NVIDIA APIs; + + 2.11. Reverse engineer, decompile or disassemble any portion of the output generated using Software elements for the purpose of translating such output artifacts to target a non-NVIDIA platform; or + +3. Authorized Users + + You may allow employees and contractors of your entity or of your subsidiary(ies), and for educational institutions also enrolled students, to internally access and use the Software as authorized by this Agreement from your secure network to perform the work authorized by this Agreement on your behalf. 
+ You are responsible for the compliance with the terms of this Agreement by your authorized users. + Any act or omission that if committed by you would constitute a breach of this Agreement will be deemed to constitute a breach of this Agreement if committed by your authorized users. + +4. Pre-Release + + Software versions identified as alpha, beta, preview, early access or otherwise as pre-release (“Pre-Release”) may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability and reliability standards relative to NVIDIA commercial offerings. + You use Pre-Release Software at your own risk. NVIDIA did not design or test the Software for use in production or business-critical systems. + NVIDIA may choose not to make available a commercial version of Pre-Release Software. + NVIDIA may also choose to abandon development and terminate the availability of Pre-Release Software at any time without liability. + +5. Updates + + NVIDIA may at any time and at its option, change, discontinue, or deprecate any part, or all, of the Software, or change or remove features or functionality, or make available patches, workarounds or other updates to the Software. + Unless the updates are provided with their separate governing terms, they are deemed part of the Software licensed to you under this Agreement, and your continued use of the Software is deemed acceptance of such changes. + +6. Components Under Other Licenses + + The Software may include or be distributed with components provided with separate legal notices or terms that accompany the components, such as open source software licenses and other license terms (“Other Licenses”). 
+ The components are subject to the applicable Other Licenses, including any proprietary notices, disclaimers, requirements and extended use rights; + except that this Agreement will prevail regarding the use of third-party open source software, unless a third-party open source software license requires its license terms to prevail. + Open source software license means any software, data or documentation subject to any license identified as an open source license by the Open Source Initiative (http://opensource.org), Free Software Foundation (http://www.fsf.org) or other similar open source organization or listed by the Software Package Data Exchange (SPDX) Workgroup under the Linux Foundation (http://www.spdx.org). + +7. Ownership + + 7.1. NVIDIA Ownership. The Software, including all intellectual property rights, is and will remain the sole and exclusive property of NVIDIA or its licensors. + Except as expressly granted in this Agreement, (a) NVIDIA reserves all rights, interests and remedies in connection with the Software, and (b) no other license or right is granted to you by implication, estoppel or otherwise. + + 7.2. Your Ownership. Subject to the rights of NVIDIA and its suppliers in the Software, which continue to be licensed as stated in this Agreement, even when incorporated in your products or services, and the extent permitted by applicable law, as between you and NVIDIA, you hold all rights, title and interest in and to your products, services and Derivatives you develop as permitted in this Agreement including their respective intellectual property rights. + +8. Feedback + + You may, but you are not obligated to, provide suggestions, requests, fixes, modifications, enhancements, or other feedback regarding the Software (collectively, “Feedback”). + Feedback, even if designated as confidential by you, will not create any confidentiality obligation for NVIDIA or its affiliates. 
+ If you provide Feedback, you grant NVIDIA, its affiliates and its designees a non-exclusive, perpetual, irrevocable, sublicensable, worldwide, royalty-free, fully paid-up and transferable license, under your intellectual property rights, to publicly perform, publicly display, reproduce, use, make, have made, sell, offer for sale, distribute (through multiple tiers of distribution), import, create derivative works of and otherwise commercialize and exploit the Feedback at NVIDIA’s discretion. + +9. Termination + + 9.1. Termination. This Agreement will automatically terminate without notice from NVIDIA if you fail to comply with any of the terms in this Agreement or if you commence or participate in any legal proceeding against NVIDIA with respect to the Software. + Additionally, either party may terminate this Agreement at any time with thirty (30) days’ advance written notice to the other party. + + 9.2. Effect of Termination. Upon any expiration or termination of this Agreement, you will promptly (a) stop using and return, delete or destroy NVIDIA confidential information and all Software received under this Agreement, and (b) delete or destroy Derivatives created under this Agreement, unless an authorized NVIDIA representative provides prior written approval that you may keep a copy of the Derivatives solely for archival purposes. + Upon written request, you will certify in writing that you have complied with your obligations under this Section 9.2 (“Effect of Termination”). + + 9.3. Survival. 
Section 1.2 (“License Grant to NVIDIA”), Section 5 (“Updates”), Section 6 (“Components Under Other Licenses”), Section 7 (“Ownership”), Section 8 (“Feedback), Section 9.2 (“Effect of Termination”), Section 9.3 (“Survival”), Section 10 (“Disclaimer of Warranties”), Section 11 (“Limitation of Liability”), Section 12 (“Use in Mission Critical Applications”), Section 13 (“Governing Law and Jurisdiction”), Section 14 (“Indemnity”) and Section 15 (“General”) will survive any expiration or termination of this Agreement. + +10. Disclaimer of Warranties + + THE SOFTWARE IS PROVIDED BY NVIDIA AS-IS AND WITH ALL FAULTS. TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, NVIDIA DISCLAIMS ALL WARRANTIES AND REPRESENTATIONS OF ANY KIND, WHETHER + EXPRESS, IMPLIED OR STATUTORY, RELATING TO OR ARISING UNDER THIS AGREEMENT, INCLUDING, WITHOUT LIMITATION, THE WARRANTIES OF TITLE, NONINFRINGEMENT, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, USAGE OF TRADE AND COURSE OF DEALING. NVIDIA DOES NOT WARRANT OR ASSUME RESPONSIBILITY FOR THE ACCURACY OR COMPLETENESS OF ANY THIRD-PARTY INFORMATION, TEXT, GRAPHICS, LINKS CONTAINED IN THE SOFTWARE. + WITHOUT LIMITING THE FOREGOING, NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS, ANY DEFECTS OR ERRORS WILL BE CORRECTED, ANY CERTAIN CONTENT WILL BE AVAILABLE; OR THAT THE SOFTWARE IS FREE OF VIRUSES OR OTHER HARMFUL COMPONENTS. NO INFORMATION OR ADVICE GIVEN BY NVIDIA WILL IN ANY WAY INCREASE THE SCOPE OF ANY WARRANTY EXPRESSLY PROVIDED IN THIS AGREEMENT. + NVIDIA does not warrant or assume responsibility for the accuracy or completeness of any third-party information, text, graphics or links contained in the Software. + +11. Limitations of Liability + + 11.1. EXCLUSIONS. 
TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT WILL NVIDIA BE LIABLE FOR ANY (I) INDIRECT, PUNITIVE, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES, OR (ii) DAMAGES FOR (a) THE COST OF PROCURING SUBSTITUTE GOODS, OR (b) LOSS OF PROFITS, REVENUES, USE, DATA OR GOODWILL ARISING OUT OF OR RELATED TO THIS AGREEMENT, WHETHER BASED ON BREACH OF CONTRACT, TORT (INCLUDING NEGLIGENCE), STRICT LIABILITY, OR OTHERWISE, AND EVEN IF NVIDIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES AND EVEN IF A PARTY’S REMEDIES FAIL THEIR ESSENTIAL PURPOSE. + + 11.2. DAMAGES CAP. ADDITIONALLY, TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, NVIDIA’S TOTAL CUMULATIVE AGGREGATE LIABILITY FOR ANY AND ALL LIABILITIES, OBLIGATIONS OR CLAIMS ARISING OUT OF OR RELATED TO THIS AGREEMENT WILL NOT EXCEED FIVE U.S. DOLLARS (US$5). + +12. Use in Mission Critical Applications + + You acknowledge that the Software provided under this Agreement is not designed or tested by NVIDIA for use in any system or application where the use or failure of such system or application developed with NVIDIA’s Software could result in injury, death or catastrophic damage (each, a “Mission Critical Application”). + Examples of Mission Critical Applications include use in avionics, navigation, autonomous vehicle applications, AI solutions for automotive products, military, medical, life support or other mission-critical or life-critical applications. + NVIDIA will not be liable to you or any third party, in whole or in part, for any claims or damages arising from these uses. + You are solely responsible for ensuring that systems and applications developed with the Software include sufficient safety and redundancy features and comply with all applicable legal and regulatory standards and requirements. + +13. 
Governing Law and Jurisdiction + + This Agreement will be governed in all respects by the laws of the United States and the laws of the State of Delaware, without regard to conflict of laws principles or the United Nations Convention on Contracts for the International Sale of Goods. + The state and federal courts residing in Santa Clara County, California will have exclusive jurisdiction over any dispute or claim arising out of or related to this Agreement, and the parties irrevocably consent to personal jurisdiction and venue in those courts; + except that either party may apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. + +14. Indemnity + + By using the Software you agree to defend, indemnify and hold harmless NVIDIA and its affiliates and their respective officers, directors, employees and agents from and against any claims, disputes, demands, liabilities, damages, losses, costs and expenses arising out of or in any way connected with (i) products or services that have been developed or deployed with or use the Software, or claims that they violate laws, or infringe, violate, or misappropriate any third party right; + or (ii) use of the Software in breach of the terms of this Agreement. + +15. General + + 15.1. Independent Contractors. + The parties are independent contractors, and this Agreement does not create a joint venture, partnership, agency, or other form of business association between the parties. + Neither party will have the power to bind the other party or incur any obligation on its behalf without the other party’s prior written consent. + Nothing in this Agreement prevents either party from participating in similar arrangements with third parties. + + 15.2. No Assignment. + NVIDIA may assign, delegate or transfer its rights or obligations under this Agreement by any means or operation of law. 
+ You may not, without NVIDIA’s prior written consent, assign, delegate or transfer any of your rights or obligations under this Agreement by any means or operation of law, and any attempt to do so is null and void. + + 15.3. No Waiver. + No failure or delay by a party to enforce any term or obligation of this Agreement will operate as a waiver by that party, or prevent the enforcement of such term or obligation later. + + 15.4. Trade Compliance. + You agree to comply with all applicable export, import, trade and economic sanctions laws and regulations, as amended, including without limitation U.S. Export Administration Regulations and Office of Foreign Assets Control regulations. + You confirm (a) your understanding that export or reexport of certain NVIDIA products or technologies may require a license or other approval from appropriate authorities and (b) that you will not export or reexport any products or technology, directly or indirectly, without first obtaining any required license or other approval from appropriate authorities, (i) to any countries that are subject to any U.S. or local export restrictions (currently including, but not necessarily limited to, Belarus, Cuba, Iran, North Korea, Russia, Syria, the Region of Crimea, Donetsk People’s Republic Region and Luhansk People’s Republic Region); + (ii) to any end-user who you know or have reason to know will utilize them in the design, development or production of nuclear, chemical or biological weapons, missiles, rocket systems, unmanned air vehicles capable of a maximum range of at least 300 kilometers, regardless of payload, or intended for military end-use, or any weapons of mass destruction; + (iii) to any end-user who has been prohibited from participating in the U.S. or local export transactions by any governing authority; + or (iv) to any known military or military-intelligence end-user or for any known military or military-intelligence end-use in accordance with U.S. 
trade compliance laws and regulations. + + 15.5. Government Rights. + The Software, documentation and technology (“Protected Items”) are “Commercial products” as this term is defined at 48 C.F.R. + 2.101, consisting of “commercial computer software” and “commercial computer software documentation” as such terms are used in, respectively, 48 C.F.R. + 12.212 and 48 C.F.R. 227.7202 & 252.227-7014(a)(1). Before any Protected Items are supplied to the U.S. Government, you will (i) inform the U.S. Government in writing that the Protected Items are and must be treated as commercial computer software and commercial computer software documentation developed at private expense; + (ii) inform the U.S. Government that the Protected Items are provided subject to the terms of the Agreement; + and (iii) mark the Protected Items as commercial computer software and commercial computer software documentation developed at private expense. + In no event will you permit the U.S. Government to acquire rights in Protected Items beyond those specified in 48 C.F.R. + 52.227-19(b)(1)-(2) or 252.227-7013(c) except as expressly approved by NVIDIA in writing. + + 15.6. Notices. + Please direct your legal notices or other correspondence to legalnotices@nvidia.com with a copy mailed to NVIDIA Corporation, 2788 San Tomas Expressway, Santa Clara, California 95051, United States of America, Attention: Legal Department. + If NVIDIA needs to contact you, you consent to receive the notices by email and agree that such notices will satisfy any legal communication requirements. + + 15.7. Severability. + If a court of competent jurisdiction rules that a provision of this Agreement is unenforceable, that provision will be deemed modified to the extent necessary to make it enforceable and the remainder of this Agreement will continue in full force and effect. + + 15.8. Amendment. + Any amendment to this Agreement must be in writing and signed by authorized representatives of both parties. + + 15.9. 
Construction. + The headings in the Agreement are included solely for convenience and are not intended to affect the meaning or interpretation of the Agreement. + As required by the context of the Agreement, the singular of a term includes the plural and vice versa. + + 15.10. Force Majeure. + Neither party will be liable during any period where an event or circumstance prevents or delays that party from performing its obligations under this Agreement and that event or circumstance: (i) is not within the reasonable control of that party and is not the result of that party’s negligence, and (ii) cannot be overcome or avoided by that party using reasonably diligent efforts. + + 15.11. Entire Agreement. + Regarding the subject matter of this Agreement, the parties agree that (a) this Agreement constitutes the entire and exclusive agreement between the parties and supersedes all prior and contemporaneous communications and (b) any additional or different terms or conditions, whether contained in purchase orders, order acknowledgments, invoices or otherwise, will not be binding and are null and void. + +(v. May 8, 2025) diff --git a/LICENSE.txt b/LICENSE.txt index 47016fa7..e08eb49c 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -25,3 +25,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Certain files within this repository are subject to separate licensing terms: + +- The files located in the `python/CuTeDSL` directory are licensed under the + NVIDIA End User License Agreement (EULA). Please refer to + https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html + for the full terms. 
diff --git a/README.md b/README.md index 26ec3abd..667eb73d 100644 --- a/README.md +++ b/README.md @@ -1,86 +1,80 @@ ![ALT](./media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition") +# Overview -# CUTLASS 3.9.2 +# CUTLASS 4.0.0 -_CUTLASS 3.9.2 - May 2025_ +_CUTLASS 4.0.0 - May 2025_ -CUTLASS is a collection of CUDA C++ template abstractions for implementing -high-performance matrix-matrix multiplication (GEMM) and related computations at all levels -and scales within CUDA. It incorporates strategies for hierarchical decomposition and -data movement similar to those used to implement cuBLAS and cuDNN. CUTLASS decomposes -these "moving parts" into reusable, modular software components abstracted by C++ template -classes. Primitives for different levels of a conceptual parallelization hierarchy -can be specialized and tuned via custom tiling sizes, data types, -and other algorithmic policy. The resulting flexibility simplifies their use -as building blocks within custom kernels and applications. +CUTLASS is a collection of abstractions for implementing high-performance matrix-matrix multiplication (GEMM) +and related computations at all levels and scales within CUDA. It incorporates strategies for +hierarchical decomposition and data movement. CUTLASS decomposes these "moving parts" into reusable, modular +software components and abstractions. -To support a wide variety of applications, CUTLASS provides extensive support for -mixed-precision computations, providing specialized data-movement and +Primitives for different levels of a conceptual parallelization hierarchy can be specialized and tuned +via custom tiling sizes, data types, and other algorithmic policy. The resulting flexibility simplifies +their use as building blocks within custom kernels and applications. 
+ +CUTLASS has been providing CUDA C++ template abstractions for high-performance linear algebra since 2017 and +these abstractions provide extensive support for a wide range of computations including +mixed-precision computations, specialized data-movement (async copy) and multiply-accumulate abstractions for FP64, FP32, TF32, FP16, BF16, -[FP32 emulation via tensor core instruction](./examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm), +[FP32 emulation via tensor core instruction](https://github.com/NVIDIA/cutlass/tree/main/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm), 8b floating point types (e5m2 and e4m3), block scaled data types (NVIDIA NVFP4 and OCP standard MXFP4, MXFP6, MXFP8), narrow integer types (4 and 8b signed and unsigned integers), and binary 1b data types (where architectures allow for the -native support of such data types). -CUTLASS demonstrates optimal matrix multiply operations +native support of such data types) across NVIDIA's Volta, Turing, Ampere, Ada, Hopper, and Blackwell architectures. + +To this rich ecosystem of C++ based kernel programming abstractions, CUTLASS 4 adds CUTLASS DSLs. These are Python native interfaces for writing high-performance CUDA kernels based on core CUTLASS and CuTe concepts without any performance compromises. This allows for a much smoother learning curve, orders of magnitude faster compile times, native integration with DL frameworks without writing glue code, and much more intuitive metaprogramming that does not require deep C++ expertise. + +Overall we envision CUTLASS DSLs as a family of domain-specific languages (DSLs). With the release of 4.0, we are releasing the first of these in CuTe DSL. This is a low level programming model that is fully consistent with CuTe C++ abstractions — exposing core concepts such as layouts, tensors, hardware atoms, and full control over the hardware thread and data hierarchy. 
+ +CuTe DSL demonstrates optimal matrix multiply and other linear algebra operations targeting the programmable, high-throughput _Tensor Cores_ implemented by -NVIDIA's Volta, Turing, Ampere, Ada, Hopper, and Blackwell architectures. +NVIDIA's Ampere, Hopper, and Blackwell architectures. -In addition to GEMMs, CUTLASS implements high-performance convolution via -the implicit GEMM algorithm. Implicit GEMM is the formulation of a convolution -operation as a GEMM thereby taking advantage of CUTLASS's modular GEMM pipeline. -This allows CUTLASS to build convolutions by reusing highly-optimized GEMM components. +We believe it will become an indispensable tool for students, researchers, and performance +engineers alike — flattening the learning curve of GPU programming, rapidly prototyping kernel +designs, and bringing optimized solutions into production. -See the [Quick Start Guide](./media/docs/cpp/quickstart.md) to get started quickly. +CuTe DSL is currently in public beta and will graduate out of beta by end of summer 2025. -See the [functionality docs](./media/docs/cpp/functionality.md) for a more comprehensive -list of kernel level features, data types, instructions, and minimum supported by CUTLASS on each GPU -architecture. +To get started quickly, please refer to: + - [CUTLASS C++ Quick Start Guide](./media/docs/cpp/quickstart.md). + - [CuTe DSL Quick Start Guide](./media/docs/pythonDSL/quick_start.rst). 
-# What's New in CUTLASS 3.9 +# What's New in CUTLASS 4.0 -* Support for Blackwell SM120 kernels for GeForce GPUs in CUTLASS 3.x API: - - Collective mainloops that target for: - * [Blockscaled datatypes with support for dense GEMM](./include/cutlass/gemm/collective/sm120_blockscaled_mma_tma.hpp) - * [Blockscaled datatypes with support for sparse GEMM](./include/cutlass/gemm/collective/sm120_blockscaled_sparse_mma_tma.hpp) - - New [GEMM](./include/cutlass/gemm/dispatch_policy.hpp) and [epilogue](./include/cutlass/epilogue/dispatch_policy.hpp) dispatch policies for collectives, kernel layers, and builders. - - [Blackwell SM120 epilogue](./include/cutlass/epilogue/fusion/sm120_visitor_store_tma_warpspecialized.hpp) and [full set of EVT fusions](./include/cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp). -* Set of examples that demonstrate the usage of the 3.x API for targeting Blackwell SM120 architecture: - - [Blockscaled GEMM with NVFP4 input datatype and BF16 output tensor](./examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu). - - [Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor with scale factor generation](./examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu). - - [Blockscaled GEMM with mixed input datatype (MXFP8 and MXFP6) and BF16 output tensor](./examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu). - - [Grouped GEMM with nvfp4 datatype](./examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu). - - [Sparse Blockscaled GEMM with mxfp8 input datatype and BF16 output tensor](./examples/80_blackwell_geforce_sparse_gemm/80a_blackwell_geforce_mxfp8_bf16_sparse_gemm.cu). - - [Sparse Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor](./examples/80_blackwell_geforce_sparse_gemm/80b_blackwell_geforce_nvfp4_nvfp4_sparse_gemm.cu). 
-* Set of unit tests that demonstrate the usage of both [sparse](./test/unit/gemm/device/sm120_blockscaled_sparse_tensorop_gemm/) and [dense](./test/unit/gemm/device/sm120_blockscaled_tensorop_gemm/) Blackwell SM120 blockscaled GEMM. -* Support for Blackwell SM100 Sparse kernels: - - Collective mainloop that target for - * [SM100 Sparse GEMM](./include/cutlass/gemm/collective/sm100_sparse_mma_warpspecialized.hpp) -* Set of example that demonstrate the usage of the 3.x API for targeting Blackwell SM100 Sparse GEMM: - - [Sparse GEMM](./examples/83_blackwell_sparse_gemm/83_blackwell_sparse_gemm.cu) - - [Blockscaled Sparse GEMM with NVFP4 input data type](./examples/84_blackwell_narrow_precision_sparse_gemm/84a_blackwell_nvfp4_bf16_sparse_gemm.cu) - - [Blockscaled Sparse GEMM with mixed input data type (MXFP8 and MXFP4)](./examples/84_blackwell_narrow_precision_sparse_gemm/84b_blackwell_mixed_mxfp8_bf16_sparse_gemm.cu) -* Set of unit tests that demonstrate the usage of [sparse](./test/unit/gemm/device/sm100_sparse_tensorop_gemm) and [blockscaled sparse](./test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm) Blackwell SM100 GEMM. -* A new Multi-head Latent Attention (MLA) for SM100 Blackwell architecture in CUTLASS [example](./examples/77_blackwell_fmha/) covers the flashMLA-like weight-absorbed decoding use-case. -* A new FMHA Backward kernel for SM100 Blackwell architecture extends CUTLASS [example](./examples/77_blackwell_fmha/) to show how the five backward pass MMAs can be fused into a single kernel to achieve high performance. -* A new [distributed GEMM example](./examples/82_blackwell_distributed_gemm/82_blackwell_distributed_gemm.cu) for SM100 Blackwell architecture. 
-* Enhancement and new support of block-wise and group-wise GEMM for Hopper and Blackwell architectures: - - Enhancement of [blockwise GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) for Hopper architecture. - - Enhancement of [groupwise GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) for Hopper architecture. - - Support for [grouped GEMM with blockwise and groupwise scaling](./examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/) for Hopper architecture. - - Support for [grouped-wise GEMM](./tools/profiler/src/blockwise_gemm_operation_profiler.cu) in CUTLASS profiler. - - Support for [blockwise GEMM](./examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu) for Blackwell architecture. - - Support for [groupwise GEMM](./examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu) for Blackwell architecture. - - Support for [grouped GEMM with blockwise](./examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu) and [groupwise scaling](./examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu) for Blackwell architecture. -* Added support for enhanced kernel performance search (auto-tuning) in CUTLASS profiler: - - Sorting performance results by GFLOPs/second: Users can now sort the final performance report based on GFLOPs/second, making it easier to identify the most efficient kernels. - - Exhaustive search for best kernel performance in GFLOPs/second: The profiler now searches for the best-performing kernel across a range of problem sizes, swizzle sizes, rasterization orders, and dynamic cluster configurations to maximize performance. - - Performance search under a fixed GEMM shape: Enables exhaustive tuning within a fixed GEMM shape, exploring various kernel parameters to find the best configuration. 
- - More detailed introductions and examples to leverage this feature can be found in [profiler.md](./media/docs/cpp/profiler.md#exhaustive-search-mode-and-top-k-output-ranking-according-to-performance-in-gflopss). -* Support `void` as the D element in sm100 kernel epilogues. +## CuTe DSL +* CuTe DSL, a Python DSL centered around CuTe's abstractions + - [Core DSL implementation files](https://github.com/NVIDIA/cutlass/tree/main/python/CuTeDSL) + - [DSL Quick Start](./media/docs/pythonDSL/quick_start.rst) + - [DSL Overview](./media/docs/pythonDSL/overview.rst) +* [Overhauled documentation with a new dedicated website](https://docs.nvidia.com/cutlass) +* Set of examples demonstrating how to use CuTe DSL to write peak-performance kernels + - [Blackwell persistent dense GEMM with static scheduling](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/blackwell/dense_gemm_persistent.py) + - [Blackwell grouped GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/blackwell/grouped_gemm.py) + - [Blackwell fused multi-head attention forward pass](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/blackwell/fmha.py) + - [Ampere GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/ampere/tensorop_gemm.py) + - [FlashAttention-2 implementation targeting Ampere and Ada class GPUs (SM80, SM86, SM89)](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/ampere/flash_attention_v2.py) +* [Educational notebooks for getting started with CuTe DSL](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/notebooks) -Note: CUTLASS 3.x builds are known to be down on Windows platforms for all CUDA toolkits. 
+## CUTLASS C++ +* Support [Family Specific Architecture Features](https://developer.nvidia.com/blog/nvidia-blackwell-and-nvidia-cuda-12-9-introduce-family-specific-architecture-features/) which was introduced in CUDA 12.9 + - 100f, 101f, 120f were added to support Family Specific Architecture Features which allows running the same binary on different chips belonging to the same Family (e.g. sm100) without recompiling. +* Instruction shapes and redundant accumulation type have been removed from CUTLASS 3.x-style library kernel names to disambiguate kernels and shorten names. + - For example: + `(old) cutlass3x_sm90_tensorop_s64x128x16gemm_bf16_bf16_f32_bf16_bf16_128x256x64_1x1x1_0_tnn_align8_warpspecialized_cooperative_epi_tma` + `(new) cutlass3x_sm90_tensorop_gemm_bf16_bf16_f32_bf16_bf16_128x256x64_1x1x1_0_tnn_align8_warpspecialized_cooperative_epi_tma` + - If you are using the CUTLASS library kernel names directly (e.g. to compile a subset of the CUTLASS library with `-DCUTLASS_LIBRARY_KERNELS`, filter kernels in the CUTLASS profiler with `--kernels`), please update your uses accordingly; this is a breaking change. +* Further improved [Blockwise](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) and [Groupwise](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) GEMMs on Hopper and Blackwell. + - Added non-power-of-two tile sizes. + - Improved performance for K-major scale factors. + - The argument `mma_promotion_interval` has been removed from non-grouped GEMM to align with the grouped and Blackwell versions. +* Various improvements and fixes from the community and CUTLASS team. Thanks to everyone who submitted PRs! +* Optimal code generation with CUDA toolkit version 12.9. 
+ +Note: CUTLASS 4.x builds are known to be down on Windows platforms for all CUDA toolkits. CUTLASS team is working on a fix. **See the [CHANGELOG](CHANGELOG.md) for details of all past releases and updates.** @@ -89,21 +83,21 @@ CUTLASS team is working on a fix. CUTLASS primitives are very efficient. When used to construct device-wide GEMM kernels, they exhibit nearly optimal utilization of peak theoretical throughput. The figure below -shows CUTLASS 3.8's performance as a % of theoretical peak utilization +shows CUTLASS 3.8's performance as a % of theoretical peak utilization on various input and output data types when run on NVIDIA Blackwell SM100 architecture GPU. -

+![ALT](media/images/cutlass-3.8-blackwell-gemm-peak-performance.svg "") -The two figures below show the continual CUTLASS performance improvements +The two figures below show the continual CUTLASS performance improvements on an [NVIDIA H100](https://www.nvidia.com/en-us/data-center/h100/) (NVIDIA Hopper architecture) since CUTLASS 3.1. -CUTLASS 3.5.1 was compiled with the [CUDA 12.5u1 Toolkit](https://developer.nvidia.com/cuda-downloads). -Tensor Core operations are implemented using CUDA's +CUTLASS 3.5.1 was compiled with the [CUDA 12.5u1 Toolkit](https://developer.nvidia.com/cuda-downloads). +Tensor Core operations are implemented using CUDA's [mma](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma) and [wgmma](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions) instructions. -

-

+![ALT](media/images/cutlass-3.5.1-gemm-peak-performance.png "") +![ALT](media/images/cutlass-3.5.1-gemm-peak-performance-fp8.png "") # CuTe @@ -135,7 +129,7 @@ Minimum requirements: - Compiler: Must support at least C++17 - CUDA Toolkit version: 11.4 -CUTLASS requires a C++17 host compiler and +CUTLASS requires a C++17 host compiler and performs best when built with the [**CUDA 12.8 Toolkit**](https://developer.nvidia.com/cuda-downloads). It is also compatible with CUDA 11.4, CUDA 11.5, CUDA 11.6, CUDA 11.7, CUDA 11.8, and all other CUDA 12.x versions. @@ -196,17 +190,17 @@ the kernel is expected to fail with a runtime error. ``` cmake .. -DCUTLASS_NVCC_ARCHS="90a" ``` -Or +Or ``` -cmake .. -DCUTLASS_NVCC_ARCHS="100a" +cmake .. -DCUTLASS_NVCC_ARCHS="100a" ``` -Note: The NVIDIA Blackwell SM100 architecture used in the datacenter -products has a different compute capability than the one underpinning -NVIDIA Blackwell GeForce RTX 50 series GPUs. As a result, kernels -compiled for Blackwell SM100 architecture with arch conditional features -(using `sm100a`) are not compatible with RTX 50 series GPUs. +Note: The NVIDIA Blackwell SM100 architecture used in the datacenter +products has a different compute capability than the one underpinning +NVIDIA Blackwell GeForce RTX 50 series GPUs. As a result, kernels +compiled for Blackwell SM100 architecture with arch conditional features +(using `sm100a`) are not compatible with RTX 50 series GPUs. Please refer to the [functionality documentation](./media/docs/cpp/functionality.md) for details on which kernels require which target architectures. 
@@ -231,7 +225,7 @@ CUTLASS is described in the following documents and the accompanying - [Tile Iterators](./media/docs/cpp/tile_iterator_concept.md) - describes C++ concepts for iterating over tiles of matrices in memory - [CUTLASS Profiler](./media/docs/cpp/profiler.md) - command-line driven profiling application - [CUTLASS Utilities](./media/docs/cpp/utilities.md) - additional templates used to facilitate rapid development -- [Dependent kernel launch](./media/docs/cpp/dependent_kernel_launch.md) - describes a new feature in Hopper which allows overlapping dependent +- [Dependent kernel launch](./media/docs/cpp/dependent_kernel_launch.md) - describes a new feature in Hopper which allows overlapping dependent kernels in the same stream, and how it is used in CUTLASS. # Resources @@ -291,11 +285,11 @@ All tests should pass on supported platforms, though the exact number of tests m # Project Structure -CUTLASS is arranged as a header-only library along with Utilities, Tools, Examples, and unit tests. -[Doxygen documentation](https://nvidia.github.io/cutlass) provides a complete list of files, classes, +CUTLASS is arranged as a header-only library along with Utilities, Tools, Examples, and unit tests. +[Doxygen documentation](https://nvidia.github.io/cutlass) provides a complete list of files, classes, and template concepts defined in the CUTLASS project. -A detailed explanation of the source code organization may be found in the +A detailed explanation of the source code organization may be found in the [CUTLASS documentation](./media/docs/cpp/code_organization.md), but several main components are summarized below. 
## CUTLASS Template Library @@ -320,7 +314,7 @@ include/ # client applications should target this directory reduction/ # bandwidth-limited reduction kernels that do not fit the "gemm" model thread/ # simt code that can be performed within a CUDA thread - + transform/ # code specialized for layout, type, and domain transformations * # core vocabulary types, containers, and basic numeric operations @@ -345,7 +339,7 @@ include/ # client applications should target this directory ### CUTLASS SDK Examples -[CUTLASS SDK examples](./examples) apply CUTLASS templates to implement basic computations. +[CUTLASS SDK examples](https://github.com/NVIDIA/cutlass/tree/main/examples) apply CUTLASS templates to implement basic computations. ### Tools @@ -358,7 +352,7 @@ tools/ profiler/ # CUTLASS Profiler - command-line utility for executing operations in the # CUTLASS Library - + util/ # CUTLASS Utilities - contains numerous helper classes for include/ # manging tensors in device memory, reference cutlass/ # implementations for GEMM, random initialization @@ -384,7 +378,7 @@ $ make cutlass_profiler -j16 By default, only one tile size is instantiated for each data type, math instruction, and layout. To instantiate all, set the following environment variable when running CMake from an empty `build/` directory. -Beware, this results in *tens of thousands* of kernels and long build times. +Beware, this results in *tens of thousands* of kernels and long build times. This would also result in a large binary size and on some platforms linker to fail on building the library. Therefore, it's highly recommended to generate only a subset of kernels as demonstrated in the sub-section below. 
```bash @@ -395,13 +389,13 @@ $ make cutlass_profiler -j16 ## Building a subset of GEMM and Convolution kernels (_reduced_ build times) -To compile strictly one kernel or a small set of kernels, a comma-delimited list of kernel names with +To compile strictly one kernel or a small set of kernels, a comma-delimited list of kernel names with wildcard characters may be used to reduce the set of kernels. The following examples show building exactly one or a subset of kernels for NVIDIA Ampere and Turing architecture: ### Building a subset Tensor Core GEMM kernels -To compile a subset of Tensor Core GEMM kernels with FP32 accumulation and FP16 input targeting NVIDIA Ampere and Turing architecture, +To compile a subset of Tensor Core GEMM kernels with FP32 accumulation and FP16 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: ```bash $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*gemm_f16_*_nt_align8 @@ -490,7 +484,7 @@ $ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096 ### Building a subset of Tensor Core Convolution kernels -To compile a subset of Tensor core convolution kernels implementing forward propagation (fprop) with FP32 accumulation +To compile a subset of Tensor core convolution kernels implementing forward propagation (fprop) with FP32 accumulation and FP16 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: ```bash $ cmake .. 
-DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*fprop_optimized_f16 @@ -538,7 +532,7 @@ reference_device: Passed ### Building one Convolution CUDA kernel -To compile and run one CUDA Core convolution kernel implementing forward propagation (fprop) with F32 accumulation +To compile and run one CUDA Core convolution kernel implementing forward propagation (fprop) with F32 accumulation and FP32 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: ```bash $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc @@ -586,14 +580,14 @@ reference_device: Passed ## More Details on Compiling CUTLASS Kernels and CUTLASS Profiler - Please follow the links for more CMake examples on selectively compiling CUTLASS kernels: - - [GEMM CMake Examples](./media/docs/cpp/quickstart.md#gemm-cmake-examples) + - [GEMM CMake Examples](./media/docs/cpp/quickstart.md#gemm-cmake-examples) - [Implicit GEMM convolution CMake Examples](./media/docs/cpp/quickstart.md#convolution-cmake-examples) - [Further details about the CUTLASS Profiler are described here.](./media/docs/cpp/profiler.md) # About -CUTLASS is released by NVIDIA Corporation as Open Source software under the +CUTLASS is released by NVIDIA Corporation as Open Source software under the [3-clause "New" BSD license](LICENSE.txt). 
# Contributors diff --git a/customConfigs.cmake b/customConfigs.cmake index d98fe6c5..a7342044 100644 --- a/customConfigs.cmake +++ b/customConfigs.cmake @@ -36,7 +36,7 @@ set(CUTLASS_PROFILER_REGRESSION_TEST_LEVEL ${CUTLASS_TEST_LEVEL} CACHE STRING " find_package(Python3 3.5 COMPONENTS Interpreter REQUIRED) -function(cutlass_generate_kernel_filter_and_testlists_files) +function(cutlass_generate_kernel_filter_and_testlist_files) set(options) set(oneValueArgs TEST_SET_NAME) @@ -59,30 +59,30 @@ function(cutlass_generate_kernel_filter_and_testlists_files) ) if(NOT cutlass_FILTER_GENERATION_RESULT EQUAL 0) - message(FATAL_ERROR "Error generating kernel filters and testlists files. See ${CMAKE_CURRENT_BINARY_DIR}/library_filter_generation.log") + message(FATAL_ERROR "Error generating kernel filters and testlist files. See ${CMAKE_CURRENT_BINARY_DIR}/library_filter_generation.log") endif() endfunction() if(CUTLASS_BUILD_FOR_PROFILER_REGRESSIONS) - set(PROFILER_ARCH_LIST 100a 101a 120a) + set(PROFILER_ARCH_LIST 100a 100f 101a 101f 120a 120f) foreach(ARCH IN LISTS CUTLASS_NVCC_ARCHS) if(NOT (ARCH IN_LIST PROFILER_ARCH_LIST)) - message(FATAL_ERROR "Only SM100a/101a/120a compute capability is supported with profiler-based unit tests") + message(FATAL_ERROR "Only SM${PROFILER_ARCH_LIST} compute capabilities are supported with profiler-based unit tests") endif() endforeach() if(CUTLASS_PROFILER_REGRESSION_TEST_LEVEL EQUAL 0) message(STATUS "Building for L0 profiler-based functional regressions") - cutlass_generate_kernel_filter_and_testlists_files(TEST_SET_NAME kernel_testlist_l0) + cutlass_generate_kernel_filter_and_testlist_files(TEST_SET_NAME kernel_testlist_l0) set(KERNEL_FILTER_FILE ${CMAKE_CURRENT_BINARY_DIR}/FK_functional_L0_testlist_SM${CUTLASS_NVCC_ARCHS}_cutlass3x_gemm_kernel_filter.list CACHE STRING "Kernel set") set(CUTLASS_PROFILER_REGRESSION_LIST_FILE ${CMAKE_CURRENT_BINARY_DIR}/FK_functional_L0_testlist_SM${CUTLASS_NVCC_ARCHS}_cutlass3x_gemm.csv CACHE STRING 
"Regression set") elseif (CUTLASS_PROFILER_REGRESSION_TEST_LEVEL EQUAL 1) message(STATUS "Building for L1 profiler-based functional regressions") - cutlass_generate_kernel_filter_and_testlists_files(TEST_SET_NAME kernel_testlist_l1) + cutlass_generate_kernel_filter_and_testlist_files(TEST_SET_NAME kernel_testlist_l1) set(KERNEL_FILTER_FILE ${CMAKE_CURRENT_BINARY_DIR}/FK_functional_L1_testlist_SM${CUTLASS_NVCC_ARCHS}_cutlass3x_gemm_kernel_filter.list CACHE STRING "Kernel set") set(CUTLASS_PROFILER_REGRESSION_LIST_FILE ${CMAKE_CURRENT_BINARY_DIR}/FK_functional_L1_testlist_SM${CUTLASS_NVCC_ARCHS}_cutlass3x_gemm.csv CACHE STRING "Regression set") diff --git a/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu b/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu index 4f77ae03..b12e75ec 100644 --- a/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu +++ b/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu @@ -489,7 +489,7 @@ int run(Options &options) std::cout << " Batches : " << options.l << std::endl; std::cout << " Alpha, Beta : " << options.alpha << ',' << options.beta << std::endl; std::cout << " Avg runtime : " << result.avg_runtime_ms << " ms" << std::endl; - std::cout << " GFLOPS : " << result.gflops << std::endl; + std::cout << " TFLOPS : " << result.gflops / 1000.0 << std::endl; } return 0; diff --git a/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu b/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu index 6cedb599..eb449e8f 100644 --- a/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu +++ b/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu @@ -124,7 +124,7 @@ struct CooperativeConfig { using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperativeFP8FastAccum; using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative; using TileShape = Shape<_256,_128,_128>; - using 
ClusterShape = Shape<_2,_2,_1>; + using ClusterShape = Shape<_1,_2,_1>; }; struct PingpongConfig { @@ -296,14 +296,14 @@ struct Options { int m = cmd_line_m; int n = cmd_line_n; int k = cmd_line_k; - if (m < 1) { - m = alignment * ((rand() % 64) + 1); + if (m < 0) { + m = alignment * ((rand() % 64)); } - if (n < 1) { - n = alignment * ((rand() % 64) + 1); + if (n < 0) { + n = alignment * ((rand() % 64)); } - if (k < 1) { - k = alignment * ((rand() % 64) + 1); + if (k < 0) { + k = alignment * ((rand() % 64)); } problem_sizes_host.push_back({m, n, k}); } @@ -333,19 +333,9 @@ struct Options { cutlass::CommandLine::tokenize(tokens, extent_str, 'x'); for (int i = 0; i < int(tokens.size()); ++i) { - int x = std::atoi(tokens.at(i).c_str()); - - // round up - if (x % alignment) { - x += (alignment - (x % alignment)); - } - - extent.at(i) = x; - } - - if (extent.product()) { - problem_sizes_host.push_back({extent.m(), extent.n(), extent.k()}); + extent.at(i) = std::atoi(tokens.at(i).c_str()); } + problem_sizes_host.push_back({extent.m(), extent.n(), extent.k()}); } groups = static_cast(problem_sizes_host.size()); @@ -500,10 +490,27 @@ void initialize(const Options &options) { std::vector ptr_beta_host(options.groups); for (int32_t i = 0; i < options.groups; ++i) { - ptr_A_host.at(i) = block_A.get() + offset_A.at(i); - ptr_B_host.at(i) = block_B.get() + offset_B.at(i); - ptr_C_host.at(i) = block_C.get() + offset_C.at(i); - ptr_D_host.at(i) = block_D.get() + offset_D.at(i); + // If the current group's matrix has size 0, set the pointer to nullptr + if (i < options.groups - 1 && offset_A.at(i) == offset_A.at(i + 1)) { + ptr_A_host.at(i) = nullptr; + } else { + ptr_A_host.at(i) = block_A.get() + offset_A.at(i); + } + if (i < options.groups - 1 && offset_B.at(i) == offset_B.at(i + 1)) { + ptr_B_host.at(i) = nullptr; + } else { + ptr_B_host.at(i) = block_B.get() + offset_B.at(i); + } + if (i < options.groups - 1 && offset_C.at(i) == offset_C.at(i + 1)) { + ptr_C_host.at(i) = 
nullptr; + } else { + ptr_C_host.at(i) = block_C.get() + offset_C.at(i); + } + if (i < options.groups - 1 && offset_D.at(i) == offset_D.at(i + 1)) { + ptr_D_host.at(i) = nullptr; + } else { + ptr_D_host.at(i) = block_D.get() + offset_D.at(i); + } alpha_host.push_back((options.alpha == FLT_MAX) ? static_cast((rand() % 5) + 1) : options.alpha); beta_host.push_back((options.beta == FLT_MAX) ? static_cast(rand() % 5) : options.beta); ptr_alpha_host.at(i) = block_alpha.get() + i; @@ -539,9 +546,10 @@ void initialize(const Options &options) { beta_device.reset(options.groups); beta_device.copy_from_host(ptr_beta_host.data()); - initialize_block(block_A, seed + 2023); + initialize_block(block_A, seed + 2021); initialize_block(block_B, seed + 2022); - initialize_block(block_C, seed + 2021); + initialize_block(block_C, seed + 2023); + initialize_block(block_D, seed + 2024); block_alpha.copy_from_host(alpha_host.data()); block_beta.copy_from_host(beta_host.data()); } @@ -653,6 +661,13 @@ int run(Options &options, bool host_problem_shapes_available = true) allocate(options); initialize(options); + std::cout << " Problem Sizes, Alpha, Beta " << std::endl; + for (int32_t i = 0; i < options.groups; ++i) { + std::cout << " " << options.problem_sizes_host.at(i); + std::cout << ", " << alpha_host.at(i) << ", " << beta_host.at(i) << std::endl; + } + std::cout << " Groups : " << options.groups << std::endl; + // Instantiate CUTLASS kernel depending on templates GemmT gemm; @@ -700,14 +715,8 @@ int run(Options &options, bool host_problem_shapes_available = true) result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0, options.problem_sizes_host); - std::cout << " Problem Sizes, Alpha, Beta " << std::endl; - for (int32_t i = 0; i < options.groups; ++i) { - std::cout << " " << options.problem_sizes_host.at(i); - std::cout << ", " << alpha_host.at(i) << ", " << beta_host.at(i) << std::endl; - } - std::cout << 
" Groups : " << options.groups << std::endl; std::cout << " Avg runtime : " << result.avg_runtime_ms << " ms" << std::endl; - std::cout << " GFLOPS : " << result.gflops << std::endl; + std::cout << " TFLOPS : " << result.gflops / 1000.0 << std::endl; } return 0; diff --git a/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu b/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu index d84934ac..9f60d077 100644 --- a/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu +++ b/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu @@ -770,9 +770,6 @@ int main(int argc, char const** argv) { bool satisfied; if (props.major < 10) { - // Pre-Blackwell - satisfied = (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4); - satisfied &= (props.major > 8) || (props.major == 8 && props.minor == 9); } else { satisfied = (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 8); @@ -786,7 +783,6 @@ int main(int argc, char const** argv) { std::cout << "CUTLASS's FP8 SM89 example requires an NVIDIA GPU with compute capability 8.9 or greater " << "and CUDA toolkit version 12.4 or later" - << " (12.8 or later needed for SM100+)" << std::endl; return 0; diff --git a/examples/65_distributed_gemm/65_distributed_gemm.cu b/examples/65_distributed_gemm/65_distributed_gemm.cu index 6509609f..06d18cef 100644 --- a/examples/65_distributed_gemm/65_distributed_gemm.cu +++ b/examples/65_distributed_gemm/65_distributed_gemm.cu @@ -132,7 +132,7 @@ using namespace cute; using TP = _8; static constexpr int TP_ = TP{}; -#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) && \ +#if defined(CUTLASS_ARCH_MMA_SM90A_ENABLED) && \ (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4)) // Distributed GEMM tiling/sharding schedule @@ -252,7 +252,7 @@ HostTensorB tensor_B_arr[TP_]; HostTensorD tensor_C_arr[TP_]; HostTensorD tensor_D_arr[TP_]; -#endif // (defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) 
+#endif // (defined(CUTLASS_ARCH_MMA_SM90A_ENABLED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) ///////////////////////////////////////////////////////////////////////////////////////////////// /// Testbed utility types @@ -344,7 +344,7 @@ struct Result { }; -#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) && \ +#if defined(CUTLASS_ARCH_MMA_SM90A_ENABLED) && \ (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4)) ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -803,7 +803,7 @@ int run(Options &options) { return 0; } -#endif // (defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) +#endif // (defined(CUTLASS_ARCH_MMA_SM90A_ENABLED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -857,8 +857,12 @@ int main(int argc, char const **args) { // Evaluate CUTLASS kernels // -#if (defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) +#if (defined(CUTLASS_ARCH_MMA_SM90A_ENABLED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) run(options); +#else + std::cerr + << "This example must be compiled with `sm90a` and CUDA Toolkit 12.4 or later." 
<< std::endl; + return 0; #endif return 0; diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu index 5d4fe1a1..b3da5583 100644 --- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu +++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu @@ -205,7 +205,6 @@ cutlass::HostTensor tensor_A; cutlass::HostTensor tensor_B; cutlass::HostTensor tensor_C; cutlass::HostTensor tensor_D; -uint32_t mma_promotion_interval; cutlass::HostTensor blockscale_tensor_A; cutlass::HostTensor blockscale_tensor_B; cutlass::HostTensor tensor_ref_D; @@ -405,12 +404,6 @@ void initialize(const Options &options) { blockscale_tensor_A.sync_device(); blockscale_tensor_B.sync_device(); - // Note : This value has to match the KernelSchedule::ScalePromotionInterval - // Else kernel will fail can_implement() check - // Deprecation Notice : We plan to remove this params member in an upcoming release - // Users can safely delete this line from their code, since the default is already 4 - mma_promotion_interval = 4; - if (options.save_aux) { tensor_aux.resize(c_coord); tensor_aux.sync_device(); @@ -470,7 +463,6 @@ typename Gemm::Arguments args_from_options(const Options &op stride_A, tensor_B.device_data(), stride_B, - mma_promotion_interval, blockscale_tensor_A.device_data(), layout_SFA, blockscale_tensor_B.device_data(), diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu index 096e56a6..e7e3e4ea 100644 --- 
a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu +++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu @@ -215,7 +215,6 @@ cutlass::HostTensor tensor_A; cutlass::HostTensor tensor_B; cutlass::HostTensor tensor_C; cutlass::HostTensor tensor_D; -uint32_t mma_promotion_interval; cutlass::HostTensor blockscale_tensor_A; cutlass::HostTensor blockscale_tensor_B; cutlass::HostTensor tensor_ref_D; @@ -413,12 +412,6 @@ void initialize(const Options &options) { blockscale_tensor_A.sync_device(); blockscale_tensor_B.sync_device(); - // Note : This value has to match the KernelSchedule::ScalePromotionInterval - // Else kernel will fail can_implement() check - // Deprecation Notice : We plan to remove this params member in an upcoming release - // Users can safely delete this line from their code, since the default is already 4 - mma_promotion_interval = 4; - if (options.save_aux) { tensor_aux.resize(c_coord); tensor_aux.sync_device(); @@ -479,7 +472,6 @@ GemmArguments args_from_options(const Options &options) stride_A, tensor_B.device_data(), stride_B, - mma_promotion_interval, blockscale_tensor_A.device_data(), layout_SFA, blockscale_tensor_B.device_data(), diff --git a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu index ad563a4b..f9d5e842 100644 --- a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu +++ b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu @@ -354,19 +354,9 @@ struct Options { cutlass::CommandLine::tokenize(tokens, extent_str, 'x'); for (int i = 0; i < int(tokens.size()); ++i) { - int x = std::atoi(tokens.at(i).c_str()); - - // round up - if (x % alignment) { - x += (alignment - (x % alignment)); - } - - extent.at(i) = x; - } - - if (extent.product()) { - 
problem_sizes_host.push_back({extent.m(), extent.n(), extent.k()}); + extent.at(i) = std::atoi(tokens.at(i).c_str()); } + problem_sizes_host.push_back({extent.m(), extent.n(), extent.k()}); } groups = static_cast(problem_sizes_host.size()); @@ -745,7 +735,7 @@ int run(Options &options, bool host_problem_shapes_available = true) result.gflops = options.gflops(result.avg_runtime_ms / 1000.0, options.problem_sizes_host); std::cout << " Avg runtime : " << result.avg_runtime_ms << " ms" << std::endl; - std::cout << " GFLOPS : " << result.gflops << std::endl; + std::cout << " TFLOPS : " << result.gflops / 1000.0 << std::endl; } return 0; diff --git a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu index d5814c0a..f052b5f2 100644 --- a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu +++ b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu @@ -124,6 +124,7 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; // A using ElementAccumulator = float; // Element type for internal accumulation // using ElementD = cutlass::float_e2m1_t; // Enable for SF Output // Element type for D matrix operands + using ElementSFD = cutlass::float_ue4m3_t; // Element type for SF Output operands constexpr int OutputSFVectorSize = 16; using FusionOperation = cutlass::epilogue::fusion::LinCombEltActBlockScaleFactor< @@ -422,19 +423,9 @@ struct Options { cutlass::CommandLine::tokenize(tokens, extent_str, 'x'); for (int i = 0; i < int(tokens.size()); ++i) { - int x = std::atoi(tokens.at(i).c_str()); - - // round up - if (x % alignment) { - x += (alignment - (x % alignment)); - } - - extent.at(i) = x; - } - - if (extent.product()) { - problem_sizes_host.push_back({extent.m(), extent.n(), extent.k()}); + extent.at(i) = std::atoi(tokens.at(i).c_str()); } + problem_sizes_host.push_back({extent.m(), extent.n(), extent.k()}); } 
groups = static_cast(problem_sizes_host.size()); @@ -885,7 +876,7 @@ int run(Options &options, bool host_problem_shapes_available = true) result.gflops = options.gflops(result.avg_runtime_ms / 1000.0, options.problem_sizes_host); std::cout << " Avg runtime : " << result.avg_runtime_ms << " ms" << std::endl; - std::cout << " GFLOPS : " << result.gflops << std::endl; + std::cout << " TFLOPS : " << result.gflops / 1000.0 << std::endl; } return 0; diff --git a/examples/77_blackwell_fmha/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp b/examples/77_blackwell_fmha/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp index 60b411a3..1eaea0ce 100644 --- a/examples/77_blackwell_fmha/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp +++ b/examples/77_blackwell_fmha/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp @@ -505,12 +505,12 @@ struct Sm100FmhaFwdMainloopTmaWarpspecialized { // Q1 * K1 , Q2 * K1 , S11 * V1 , Q1 * K2 , S21 * V1 , Q2 * K2 , S12 * V2 , Q1 * K3 , S22 * K2 , ... 
} - template + template CUTLASS_DEVICE auto softmax_step( float& row_max, float& row_sum, Stage stage, bool final_call, - BlkCoord const& blk_coord, CountingTensor const& cS, + BlkCoord const& blk_coord, CoordTensor const& cS, Params const& params, ProblemShape const& problem_shape, PipelineS& pipeline_s, typename PipelineS::PipelineState& pipeline_s_consumer_state, PipelineC& pipeline_c, typename PipelineC::PipelineState& pipeline_c_producer_state, diff --git a/examples/77_blackwell_fmha/collective/sm100_fmha_gen_mainloop_warpspecialized.hpp b/examples/77_blackwell_fmha/collective/sm100_fmha_gen_mainloop_warpspecialized.hpp index 655c080e..4df7daf5 100644 --- a/examples/77_blackwell_fmha/collective/sm100_fmha_gen_mainloop_warpspecialized.hpp +++ b/examples/77_blackwell_fmha/collective/sm100_fmha_gen_mainloop_warpspecialized.hpp @@ -514,12 +514,12 @@ struct Sm100FmhaGenMainloopWarpspecialized { // Q1 * K1 , Q2 * K1 , S11 * V1 , Q1 * K2 , S21 * V1 , Q2 * K2 , S12 * V2 , Q1 * K3 , S22 * K2 , ... 
} - template + template CUTLASS_DEVICE auto softmax_step( float& row_max, float& row_sum, Stage stage, bool final_call, - BlkCoord const& blk_coord, CountingTensor const& cS, + BlkCoord const& blk_coord, CoordTensor const& cS, Params const& params, ProblemShape const& problem_shape, PipelineS& pipeline_s, typename PipelineS::PipelineState& pipeline_s_consumer_state, PipelineC& pipeline_c, typename PipelineC::PipelineState& pipeline_c_producer_state, diff --git a/examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu b/examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu index d36bf4dd..c86580db 100644 --- a/examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu +++ b/examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu @@ -861,7 +861,7 @@ int run(Options &options, bool host_problem_shapes_available = true) result.gflops = options.gflops(result.avg_runtime_ms / 1000.0, options.problem_sizes_host); std::cout << " Avg runtime : " << result.avg_runtime_ms << " ms" << std::endl; - std::cout << " GFLOPS : " << result.gflops << std::endl; + std::cout << " TFLOPS : " << result.gflops / 1000.0 << std::endl; } return 0; diff --git a/examples/82_blackwell_distributed_gemm/82_blackwell_distributed_gemm.cu b/examples/82_blackwell_distributed_gemm/82_blackwell_distributed_gemm.cu index f955b8e9..573c25cb 100644 --- a/examples/82_blackwell_distributed_gemm/82_blackwell_distributed_gemm.cu +++ b/examples/82_blackwell_distributed_gemm/82_blackwell_distributed_gemm.cu @@ -132,7 +132,7 @@ using namespace cute; using TP = _8; static constexpr int TP_ = TP{}; -#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && \ +#if defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && \ (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4)) // Distributed GEMM tiling/sharding schedule @@ -254,7 +254,7 @@ HostTensorB tensor_B_arr[TP_]; HostTensorD tensor_C_arr[TP_]; HostTensorD 
tensor_D_arr[TP_]; -#endif // (defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) +#endif // (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) ///////////////////////////////////////////////////////////////////////////////////////////////// /// Testbed utility types @@ -346,7 +346,7 @@ struct Result { }; -#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && \ +#if defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && \ (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4)) ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -805,7 +805,7 @@ int run(Options &options) { return 0; } -#endif // (defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) +#endif // (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -861,8 +861,12 @@ int main(int argc, char const **args) { // Evaluate CUTLASS kernels // -#if (defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) +#if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) run(options); +#else + std::cerr + << "This example must be compiled with `sm100a` and CUDA Toolkit 12.4 or later." 
<< std::endl; + return 0; #endif return 0; diff --git a/examples/cute/tutorial/hopper/wgmma_sm90.cu b/examples/cute/tutorial/hopper/wgmma_sm90.cu index 405bb310..e2b33fb4 100644 --- a/examples/cute/tutorial/hopper/wgmma_sm90.cu +++ b/examples/cute/tutorial/hopper/wgmma_sm90.cu @@ -506,13 +506,13 @@ int main(int argc, char** argv) return -1; } - if (props.major < 8) { - std::cout << "This example requires an Ampere GPU or newer (CC >= 80)" << std::endl; + if (props.major != 9) { + std::cout << "This example requires NVIDIA's Hopper Architecture GPU with compute capability 90a" << std::endl; // Return 0 so tests pass if run on unsupported architectures or CUDA Toolkits. return 0; } -#if defined(CUTLASS_ARCH_MMA_SM90A_SUPPORTED) +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) int m = 5120; if (argc >= 2) @@ -604,7 +604,7 @@ int main(int argc, char** argv) printf("CUTE_GEMM: [%6.1f]GFlop/s (%6.4f)ms\n", gflops / cute_time, cute_time*1000); #else - std::cout << "CUTLASS_ARCH_MMA_SM90A_SUPPORTED must be enabled, but it is not. Test is waived \n" << std::endl; + std::cout << "CUTLASS_ARCH_MMA_SM90_SUPPORTED must be enabled, but it is not. Test is waived \n" << std::endl; #endif return 0; diff --git a/examples/cute/tutorial/hopper/wgmma_tma_sm90.cu b/examples/cute/tutorial/hopper/wgmma_tma_sm90.cu index 77a30890..98df4fa4 100644 --- a/examples/cute/tutorial/hopper/wgmma_tma_sm90.cu +++ b/examples/cute/tutorial/hopper/wgmma_tma_sm90.cu @@ -461,7 +461,7 @@ int main(int argc, char** argv) return 0; } -#if defined(CUTLASS_ARCH_MMA_SM90A_SUPPORTED) +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) int m = 512; if (argc >= 2) @@ -553,7 +553,7 @@ int main(int argc, char** argv) printf("CUTE_GEMM: [%6.1f]GFlop/s (%6.4f)ms\n", gflops / cute_time, cute_time*1000); #else - std::cout << "CUTLASS_ARCH_MMA_SM90A_SUPPORTED must be enabled, but it is not. Test is waived \n" << std::endl; + std::cout << "CUTLASS_ARCH_MMA_SM90_SUPPORTED must be enabled, but it is not. 
Test is waived \n" << std::endl; #endif return 0; diff --git a/examples/python/CuTeDSL/ampere/elementwise_add.py b/examples/python/CuTeDSL/ampere/elementwise_add.py new file mode 100644 index 00000000..dc70a913 --- /dev/null +++ b/examples/python/CuTeDSL/ampere/elementwise_add.py @@ -0,0 +1,392 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import argparse +import torch +import time +from typing import Type + +import cuda.bindings.driver as cuda + +import cutlass +import cutlass.cute as cute +from cutlass.cute.runtime import from_dlpack +import cutlass.torch as cutlass_torch + +""" +An Elementwise Addition Example using CuTe DSL. + +This example kernel copies data from global memory to register memory (rmem), performs the elementwise +addition operation, and stores the result back to global memory. + +Primary goals of this example are to demonstrate how basic global memory copies can be expressed in +CuTe DSL and illustrate canonical partitioning patterns in CuTe. It also implements canonical +predication for tensors whose shape is not multiple of tile size to guard OOB reads. + +Thread-value (or TV) layouts are central to canonical partitioning patterns in CuTe. They provide a +mapping from thread and a thread's value to the set of coordinates within a tile that we have sliced +out from a data tensor. + +The input tensors are row-major layout, that leading dimension is the right most dimension. In order +to efficiently copy data from global memory, we must map threads contiguously on row dimension. + +Thread ID mapping to 2D coordinates with layout `(4,32):(32,1)`: + + +----+----+----+----+-----+----+ + | | 0 | 1 | 2 | ... | 31 | + +----+----+----+----+-----+----+ + | 0 | T0 | T1 | T2 | ... | T31| + +----+----+----+----+-----+----+ + | 1 |T32 |T33 |T34 | ... |T63 | + +----+----+----+----+-----+----+ + | 2 |T64 |T65 |T66 | ... |T95 | + +----+----+----+----+-----+----+ + | 3 |T96 |T97 |T98 | ... |T127| + +----+----+----+----+-----+----+ + +As Ampere GPU supports a maximum of 128bit per load/store instruction and each element is 32bit, we +can load 4 elements per instruction. Having additional contiguous values allows for vectorization +across threads (coalesced accesses) and is required for saturating the memory bandwidth. + +We use `(4,4):(4,1)` as the val layout in this example. 
Notice that the major mode is the same as +the major mode of the input tensor - without which vectorization would not be possible. + +If you already know the TV layout you want to use for your tiled copy, CuTe DSL provides utility +`cute.make_layout_tv` to build the tiled copy type around it and the atom of your choice. + +.. code-block:: python + + thr_layout = cute.make_layout((4, 32), stride=(32, 1)) + val_layout = cute.make_layout((4, 4), stride=(4, 1)) + tiler_mn, tv_layout = cute.make_layout_tv(thr_layout, val_layout) + + # Tile input tensor to thread blocks: ((TileM,TileN),(RestM,RestN)) + gA = cute.zipped_divide(mA, tiler_mn) + +where `tiler_mn` is the tile size per thread block and `tv_layout` is the TV layout which maps +thread index and inter-thread index of data array per thread to logical coordinates of elements in +input and output tensors. + +Then we can build tiled copy for input and output tensors with `cute.make_tiled_copy` utility. + +.. code-block:: python + + blkA = gA[((None, None), bidx)] # (TileM,TileN) + + copy_atom_load = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), gA.element_type) + tiled_copy_A = cute.make_tiled_copy(copy_atom_load, tv_layout, tiler_mn) + + # get slice of tiled_copy_A for current thread + thr_copy_A = tiled_copy_A.get_slice(tidx) + + # partition per thread block tensor as source of tiled copy + thrA = thr_copy_A.partition_S(blkA) + + # allocate fragment for gmem->rmem + frgA = cute.make_fragment_like(thrA) + + # copy data from global memory to register memory + cute.copy(copy_atom_load, thrA, frgA) + + +To run this example: + +.. code-block:: bash + + python examples/ampere/elementwise_add.py --M 3 --N 12 + python examples/ampere/elementwise_add.py --M 1024 --N 512 + python examples/ampere/elementwise_add.py --M 1024 --N 1024 --benchmark --warmup_iterations 2 --iterations 1000 + +To collect performance with NCU profiler: + +.. 
code-block:: bash + + # Don't iterate too many times when profiling with ncu + ncu python examples/ampere/elementwise_add.py --M 2048 --N 2048 --benchmark --iterations 10 --skip_ref_check +""" + + +@cute.kernel +def elementwise_add_kernel( + gA: cute.Tensor, + gB: cute.Tensor, + gC: cute.Tensor, + cC: cute.Tensor, # coordinate tensor + shape: cute.Shape, + tv_layout: cute.Layout, + tiler_mn: cute.Shape, +): + tidx, _, _ = cute.arch.thread_idx() + bidx, _, _ = cute.arch.block_idx() + + # slice for CTAs + # logical id -> address + blk_coord = ((None, None), bidx) + blkA = gA[blk_coord] # (TileM,TileN) + blkB = gB[blk_coord] # (TileM,TileN) + blkC = gC[blk_coord] # (TileM,TileN) + blkCrd = cC[blk_coord] # (TileM, TileN) + + print(f"[DSL INFO] Sliced Tensors per thread block:") + print(f"[DSL INFO] blkA = {blkA.type}") + print(f"[DSL INFO] blkB = {blkB.type}") + print(f"[DSL INFO] blkC = {blkC.type}") + print(f"[DSL INFO] blkCrd = {blkCrd.type}") + + # # declare the atoms which will be used later for memory copy + copy_atom_load = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), gA.element_type) + copy_atom_store = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), gC.element_type) + + tiled_copy_A = cute.make_tiled_copy(copy_atom_load, tv_layout, tiler_mn) + tiled_copy_B = cute.make_tiled_copy(copy_atom_load, tv_layout, tiler_mn) + tiled_copy_C = cute.make_tiled_copy(copy_atom_store, tv_layout, tiler_mn) + + thr_copy_A = tiled_copy_A.get_slice(tidx) + thr_copy_B = tiled_copy_B.get_slice(tidx) + thr_copy_C = tiled_copy_C.get_slice(tidx) + + thrA = thr_copy_A.partition_S(blkA) + thrB = thr_copy_B.partition_S(blkB) + thrC = thr_copy_C.partition_S(blkC) + + # allocate fragments for gmem->rmem + frgA = cute.make_fragment_like(thrA) + frgB = cute.make_fragment_like(thrB) + frgC = cute.make_fragment_like(thrC) + + thrCrd = thr_copy_C.partition_S(blkCrd) + frgPred = cute.make_fragment(thrCrd.shape, cutlass.Boolean) + + print(f"[DSL INFO] Sliced Tensors per thread:") + 
print(f"[DSL INFO] thrA = {thrA.type}") + print(f"[DSL INFO] thrB = {thrB.type}") + print(f"[DSL INFO] thrC = {thrC.type}") + print(f"[DSL INFO] thrCrd = {thrCrd.type}") + + for i in cutlass.range_dynamic(0, cute.size(frgPred), 1): + val = cute.elem_less(thrCrd[i], shape) + frgPred[i] = val + + # Print per thread predicate mask + # if tidx == 0 and bidx == 0: + # cute.printf("block_dim = {}", cute.arch.grid_dim()) + # cute.printf("shape = {}", shape) + # cute.print_tensor(thrA) + # cute.print_tensor(thrB) + # cute.print_tensor(frgPred) + + ########################################################## + # Move data to reg address space + ########################################################## + + cute.copy(copy_atom_load, thrA, frgA, pred=frgPred) + cute.copy(copy_atom_load, thrB, frgB, pred=frgPred) + + # if tidx == 0 and bidx == 0: + # cute.print_tensor(frgA) + # cute.print_tensor(frgB) + + # Load data before use. The compiler will optimize the copy and load + # operations to convert some memory ld/st into register uses. + result = frgA.load() + frgB.load() + + # Save the results back to registers. Here we reuse b's registers. 
+ frgC.store(result) + + # Copy the results back to c + cute.copy(copy_atom_store, frgC, thrC, pred=frgPred) + + +@cute.jit +def elementwise_add(mA, mB, mC, copy_bits: cutlass.Constexpr = 128): + dtype = mA.element_type + vector_size = copy_bits // dtype.width + + thr_layout = cute.make_ordered_layout((4, 32), order=(1, 0)) + val_layout = cute.make_ordered_layout((4, vector_size), order=(1, 0)) + tiler_mn, tv_layout = cute.make_layout_tv(thr_layout, val_layout) + + print(f"[DSL INFO] Input Tensors:") + print(f"[DSL INFO] mA = {mA.type}") + print(f"[DSL INFO] mB = {mB.type}") + + print(f"[DSL INFO] Tiling Parameters:") + print(f"[DSL INFO] tiler_mn = {tiler_mn} per thread block") + print(f"[DSL INFO] tv_layout = {tv_layout}") + + gA = cute.zipped_divide(mA, tiler_mn) # ((TileM,TileN),(RestM,RestN)) + gB = cute.zipped_divide(mB, tiler_mn) # ((TileM,TileN),(RestM,RestN)) + gC = cute.zipped_divide(mC, tiler_mn) # ((TileM,TileN),(RestM,RestN)) + print(f"[DSL INFO] Tiled Tensors:") + print(f"[DSL INFO] gA = {gA.type}") + print(f"[DSL INFO] gB = {gB.type}") + print(f"[DSL INFO] gC = {gC.type}") + + idC = cute.make_identity_tensor(mC.shape) + cC = cute.zipped_divide(idC, tiler=tiler_mn) + print(f"[DSL INFO] coord tensor = {cC.type}") + + elementwise_add_kernel(gA, gB, gC, cC, mC.shape, tv_layout, tiler_mn).launch( + grid=[cute.size(gC, mode=[1]), 1, 1], + block=[cute.size(tv_layout, mode=[0]), 1, 1], + ) + + +def run_elementwise_add( + M, + N, + dtype: Type[cutlass.Numeric], + is_a_dynamic_layout=False, + is_b_dynamic_layout=False, + is_result_dynamic_layout=False, + skip_ref_check=False, + benchmark=True, + warmup_iterations=2, + iterations=200, +): + if not torch.cuda.is_available(): + raise RuntimeError(f"Ampere GPU is required to run this example!") + + print(f"\nRunning Elementwise Add test with:") + print(f"Tensor dimensions: [{M}, {N}]") + print(f"Input and Output Data type: {dtype}") + + torch_dtype = cutlass_torch.dtype(dtype) + if dtype.is_integer: + a = 
torch.randint(0, 10, (M, N), device=torch.device("cuda"), dtype=torch_dtype) + b = torch.randint(0, 10, (M, N), device=torch.device("cuda"), dtype=torch_dtype) + else: + a = torch.randn(M, N, device=torch.device("cuda"), dtype=torch_dtype) + b = torch.randn(M, N, device=torch.device("cuda"), dtype=torch_dtype) + + c = torch.zeros_like(a) + + print(f"Input tensor shapes:") + print(f"a: {a.shape}, dtype: {a.dtype}") + print(f"b: {b.shape}, dtype: {b.dtype}") + print(f"c: {c.shape}, dtype: {c.dtype}\n") + + if not is_a_dynamic_layout: + a_tensor = from_dlpack(a).mark_layout_dynamic() + else: + a_tensor = a + + if not is_b_dynamic_layout: + b_tensor = from_dlpack(b).mark_layout_dynamic() + else: + b_tensor = b + + if not is_result_dynamic_layout: + c_tensor = from_dlpack(c).mark_layout_dynamic() + else: + c_tensor = c + + print("Compiling kernel with cute.compile ...") + start_time = time.time() + compiled_func = cute.compile(elementwise_add, a_tensor, b_tensor, c_tensor) + compilation_time = time.time() - start_time + print(f"Compilation time: {compilation_time:.4f} seconds") + + print("Executing vector add kernel...") + + # Get current CUDA stream from PyTorch + torch_stream = torch.cuda.current_stream() + # Get the raw stream pointer as a CUstream + current_stream = cuda.CUstream(torch_stream.cuda_stream) + + if not skip_ref_check: + compiled_func(a_tensor, b_tensor, c_tensor) + print("Verifying results...") + torch.testing.assert_close(a + b, c) + print("Results verified successfully!") + + if not benchmark: + return + + # Create CUDA events for timing + start_event = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1] + end_event = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1] + + # Warmup + for _ in range(warmup_iterations): + compiled_func(a_tensor, b_tensor, c_tensor) + + # Use the current stream for CUDA events instead of the default stream + # Record start event + cuda.cuEventRecord(start_event, current_stream) + + # Execute the kernel + 
for _ in range(iterations): + compiled_func(a_tensor, b_tensor, c_tensor) + + # Record end event + cuda.cuEventRecord(end_event, current_stream) + cuda.cuEventSynchronize(end_event) + + # Calculate elapsed time + err, elapsed_time = cuda.cuEventElapsedTime(start_event, end_event) + avg_time = elapsed_time / iterations + + # Print execution results + print(f"Kernel execution time: {avg_time:.4f} ms") + print( + f"Achieved memory throughput: {(3 * a.numel() * dtype.width // 8) / (avg_time / 1000) / 1e9:.2f} GB/s" + ) + print(f"First few elements of result: \n{c[:3, :3]}") + + # Destroy events + cuda.cuEventDestroy(start_event) + cuda.cuEventDestroy(end_event) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="example of elementwise add to demonstrate the numpy/pytorch as input for kernels" + ) + parser.add_argument("--M", default=1024, type=int) + parser.add_argument("--N", default=1024, type=int) + parser.add_argument("--warmup_iterations", default=2, type=int) + parser.add_argument("--iterations", default=100, type=int) + parser.add_argument("--skip_ref_check", action="store_true") + parser.add_argument("--benchmark", action="store_true") + + args = parser.parse_args() + run_elementwise_add( + args.M, + args.N, + dtype=cutlass.Float32, + is_a_dynamic_layout=True, + is_b_dynamic_layout=True, + is_result_dynamic_layout=True, + skip_ref_check=args.skip_ref_check, + benchmark=args.benchmark, + warmup_iterations=args.warmup_iterations, + iterations=args.iterations, + ) + print("\nPASS") diff --git a/examples/python/CuTeDSL/ampere/elementwise_apply.py b/examples/python/CuTeDSL/ampere/elementwise_apply.py new file mode 100644 index 00000000..e1e18729 --- /dev/null +++ b/examples/python/CuTeDSL/ampere/elementwise_apply.py @@ -0,0 +1,395 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import argparse +import operator +import torch +from typing import Type +import time + +import cuda.bindings.driver as cuda + +import cutlass +import cutlass.cute as cute +import cutlass.torch as cutlass_torch +from cutlass.cute.runtime import from_dlpack + +""" +An Elementwise Apply Example using CuTe DSL. 
+ +This example kernel demonstrates the meta-programming capability of the CuTe DSL by allowing +customization of elementwise operations through lambda functions. The kernel copies data from +global memory to register memory (rmem), applies a user-defined operation to the elements, +and stores the result back to global memory. + +Primary goals of this example: +1. Demonstrate meta-programming capability by passing lambda functions to customize elementwise operations +2. Show how to apply different operations (add, multiply, etc.) using the same kernel structure +3. Illustrate how to parameterize CUDA kernels with operation types at compile time + +To run this example: + +.. code-block:: bash + + # Run with addition operation + python examples/ampere/elementwise_apply.py --M 1024 --N 512 --op add + + # Run with multiplication operation + python examples/ampere/elementwise_apply.py --M 1024 --N 512 --op mul + + # Run with subtraction operation + python examples/ampere/elementwise_apply.py --M 1024 --N 512 --op sub + + # Benchmark performance + python examples/ampere/elementwise_apply.py --M 2048 --N 2048 --op add --benchmark --warmup_iterations 2 --iterations 10 + +The example demonstrates how to express complex CUDA kernels with customizable operations +while maintaining high performance through efficient memory access patterns. 
+""" + + +@cute.kernel +def elementwise_apply_kernel( + op: cutlass.Constexpr, + gA: cute.Tensor, + gB: cute.Tensor, + gC: cute.Tensor, + cC: cute.Tensor, # coordinate tensor + shape: cute.Shape, + tv_layout: cute.Layout, # (tid, vid) -> logic coord +): + tidx, _, _ = cute.arch.thread_idx() + bidx, _, _ = cute.arch.block_idx() + + # slice for CTAs + cta_coord = ((None, None), bidx) + # logical coord -> address + ctaA = gA[cta_coord] # (TileM, TileN) + ctaB = gB[cta_coord] # (TileM, TileN) + ctaC = gC[cta_coord] # (TileM, TileN) + ctaCrd = cC[cta_coord] # (TileM, TileN) + + print(f"[DSL INFO] Sliced Tensors per thread block:") + print(f"[DSL INFO] ctaA = {ctaA.type}") + print(f"[DSL INFO] ctaB = {ctaB.type}") + print(f"[DSL INFO] ctaC = {ctaC.type}") + print(f"[DSL INFO] ctaCrd = {ctaCrd.type}") + + # compose with CTA TV layout + # (tid, vid) -> address + tidfrgA = cute.composition(ctaA, tv_layout) + tidfrgB = cute.composition(ctaB, tv_layout) + tidfrgC = cute.composition(ctaC, tv_layout) + tidfrgCrd = cute.composition(ctaCrd, tv_layout) + # print(f"{tv_layout = }") + # print(f"{tidfrgA = }") + + thr_coord = (tidx, (None, None)) + + # slice for threads + # vid -> address + thrA = tidfrgA[thr_coord] # (V) + thrB = tidfrgB[thr_coord] # (V) + thrC = tidfrgC[thr_coord] # (V) + thrCrd = tidfrgCrd[thr_coord] + + print(f"[DSL INFO] Sliced Tensors per thread:") + print(f"[DSL INFO] thrA = {thrA.type}") + print(f"[DSL INFO] thrB = {thrB.type}") + print(f"[DSL INFO] thrC = {thrC.type}") + print(f"[DSL INFO] thrCrd = {thrCrd.type}") + + # allocate fragments for gmem->rmem + frgA = cute.make_fragment_like(thrA, gA.element_type) + frgB = cute.make_fragment_like(thrB, gB.element_type) + frgC = cute.make_fragment_like(thrC, gC.element_type) + frgPred = cute.make_fragment(thrCrd.shape, cutlass.Boolean) + + for i in cutlass.range_dynamic(cute.size(frgPred), unroll=1): + frgPred[i] = cute.elem_less(thrCrd[i], shape) + + # if tidx == 0 and bidx == 0: + # cute.print_tensor(frgPred) + + 
##########################################################
+ # Move data to reg address space
+ ##########################################################
+
+ # declare the atoms which will be used later for memory copy
+ copy_atom_load = cute.make_copy_atom(
+ cute.nvgpu.CopyUniversalOp(),
+ gA.element_type,
+ num_bits_per_copy=gA.element_type.width,
+ )
+ copy_atom_store = cute.make_copy_atom(
+ cute.nvgpu.CopyUniversalOp(),
+ gC.element_type,
+ num_bits_per_copy=gC.element_type.width,
+ )
+
+ cute.copy(copy_atom_load, thrA, frgA, pred=frgPred)
+ cute.copy(copy_atom_load, thrB, frgB, pred=frgPred)
+
+ # Load data before use. The compiler will optimize the copy and load
+ # operations to convert some memory ld/st into register uses.
+ result = op(frgA.load(), frgB.load())
+
+ # Save the results back to registers. Here we store them into frgC's registers.
+ frgC.store(result)
+
+ # Copy the results back to c
+ cute.copy(copy_atom_store, frgC, thrC, pred=frgPred)
+
+
+@cute.jit
+def elementwise_apply(
+ op: cutlass.Constexpr,
+ a: cute.Tensor,
+ b: cute.Tensor,
+ result: cute.Tensor,
+):
+ """CUDA kernel applying binary operator on each element of two n-D input tensors in
+ CuTe Python and store to result tensor.
+
+ :param op: Binary operator or lambda function to apply element-wise
+ :type op: cutlass.Constexpr
+ :param a: First input tensor
+ :type a: cute.Tensor
+ :param b: Second input tensor
+ :type b: cute.Tensor
+ :param result: Output tensor to store the results of op(a, b)
+ :type result: cute.Tensor
+ :return: None
+ :rtype: None
+
+ ..
code-block:: python + + # Example 1: Adding two tensors + x = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32, device="cuda") + y = torch.tensor([[5, 6], [7, 8]], dtype=torch.float32, device="cuda") + result = torch.empty_like(x) + elementwise_apply(operator.add, from_dlpack(x), from_dlpack(y), from_dlpack(result)) + # result: + # tensor([[6.0, 8.0], + # [10.0, 12.0]], device='cuda:0') + + # Example 2: Using a lambda function + elementwise_apply(lambda a, b: a * a + b * b, from_dlpack(x), from_dlpack(y), from_dlpack(result)) + # result: + # tensor([[ 2., 8.], + # [ 54., 512.]], device='cuda:0') + """ + + # Baseline: naive TV layout + # * mA layout: (4096, 4096):(4096, 1) + # * TV layout map to (512, 4) tile + # * tidx maps to mode-0 but input layout is contiguous on mode-1, performance will be bad + # tv_layout = cute.make_layout((128, (4, 4)), stride=(4, (512, 1))) + # cta_tiler = (512, 4) + + # Opt-1: better TV layout with better 1D thread layout (SOL with 1D thread layout) + # * mA layout: (4096, 4096):(4096, 1) + # * TV layout map to (4, 512) tile + # * tidx maps to mode-1 which is leading mode of input tensor for coalesced load + # tv_layout = cute.make_layout((128, (4, 4)), stride=(16, (4, 1))) + # cta_tiler = (4, 512) + + # Opt-2: 2D tile but worse + # * mA layout: (4096, 4096):(4096, 1) + # * TV layout map to (128, 16) logical tile + # * V layout is bad as contiguous mode is not on right-most + # * `cute.copy` only supports vectorize when stride-1 of v-layout on right-most ) + # tv_layout = cute.make_layout(((32, 4), (4, 4)), stride=((4, 512), (1, 128))) + # cta_tiler = (128, 16) + + # Opt-3: SOL with 2D thread tile + # * mA layout: (4096, 4096):(4096, 1) + # * TV layout map to (16, 128) logical tile + # * tidx maps to mode-1 and input layout is contiguous on mode-1 for coalesced load-store + thr_layout = cute.make_layout((4, 32), stride=(32, 1)) + val_layout = cute.make_layout((4, 4), stride=(4, 1)) + tiler_mn, tv_layout = 
cute.make_layout_tv(thr_layout, val_layout) + + print(f"[DSL INFO] Input Tensors:") + print(f"[DSL INFO] a = {a.type}") + print(f"[DSL INFO] b = {b.type}") + print(f"[DSL INFO] result = {result.type}") + + print(f"[DSL INFO] Tiling Parameters:") + print(f"[DSL INFO] tiler_mn = {tiler_mn} per thread block") + print(f"[DSL INFO] tv_layout = {tv_layout}") + + gA = cute.zipped_divide(a, tiler_mn) # ((TileM, TileN), (RestM, RestN)) + gB = cute.zipped_divide(b, tiler_mn) # ((TileM, TileN), (RestM, RestN)) + gC = cute.zipped_divide(result, tiler_mn) # ((TileM, TileN), (RestM, RestN)) + + print(f"[DSL INFO] Tiled Tensors:") + print(f"[DSL INFO] gA = {gA.type}") + print(f"[DSL INFO] gB = {gB.type}") + print(f"[DSL INFO] gC = {gC.type}") + + idC = cute.make_identity_tensor(result.shape) + cC = cute.zipped_divide(idC, tiler=tiler_mn) + print(f"[DSL INFO] coord tensor = {cC.type}") + + # Launch the kernel asynchronously + # Async token(s) can also be specified as dependencies + elementwise_apply_kernel( + op, + gA, + gB, + gC, + cC, + result.shape, + tv_layout, + ).launch( + grid=[cute.size(gC, mode=[1]), 1, 1], + block=[cute.size(tv_layout, mode=[0]), 1, 1], + ) + + +def run_elementwise_apply_and_verify( + op, + M, + N, + dtype: Type[cutlass.Numeric], + skip_ref_check=False, + benchmark=True, + warmup_iterations=2, + iterations=100, +): + if not torch.cuda.is_available(): + raise RuntimeError(f"Ampere GPU is required to run this example!") + + print(f"\nRunning Elementwise Apply test with:") + print(f"Tensor dimensions: [{M}, {N}]") + print(f"Input and Output Data type: {dtype}") + print(f"Warmup iterations: {warmup_iterations}") + print(f"Measurement iterations: {iterations}\n") + + torch_dtype = cutlass_torch.dtype(dtype) + + # Allocate tensors with random values. 
+ a = torch.randn(M, N, device=torch.device("cuda"), dtype=torch_dtype)
+ b = torch.randn(M, N, device=torch.device("cuda"), dtype=torch_dtype)
+ c = torch.zeros_like(a)
+
+ print(f"Input tensor shapes:")
+ print(f"a: {a.shape}, dtype: {a.dtype}")
+ print(f"b: {b.shape}, dtype: {b.dtype}")
+ print(f"c: {c.shape}, dtype: {c.dtype}\n")
+
+ epsilon = 1.2
+ if op in (operator.truediv, operator.floordiv):
+ b = torch.where(b == 0, torch.tensor(epsilon), b)
+
+ print("Compiling kernel with cute.compile ...")
+ start_time = time.time()
+ compilation_time = time.time() - start_time  # NOTE(review): nothing is compiled between these two timestamps, so the reported "compilation time" is always ~0 — the cute.compile (or first jit-triggering call) belongs here; confirm intent
+ print(f"Compilation time: {compilation_time:.4f} seconds")
+
+ print("Executing elementwise apply kernel...")
+ # Get current CUDA stream from PyTorch
+ torch_stream = torch.cuda.current_stream()
+ # Get the raw stream pointer as a CUstream
+ current_stream = cuda.CUstream(torch_stream.cuda_stream)
+
+ if not skip_ref_check:
+ elementwise_apply(
+ op, from_dlpack(a), from_dlpack(b), from_dlpack(c).mark_layout_dynamic()
+ )
+ print("Verifying results...")
+ torch.testing.assert_close(op(a, b), c)
+ print("Results verified successfully!")
+
+ if not benchmark:
+ return
+
+ # Create CUDA events for timing
+ start_event = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1]
+ end_event = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1]
+
+ # Warmup
+ for _ in range(warmup_iterations):
+ elementwise_apply(
+ op, from_dlpack(a), from_dlpack(b), from_dlpack(c).mark_layout_dynamic()
+ )
+
+ # Record start event
+ cuda.cuEventRecord(start_event, current_stream)
+
+ # Execute the kernel
+ for _ in range(iterations):
+ elementwise_apply(
+ op, from_dlpack(a), from_dlpack(b), from_dlpack(c).mark_layout_dynamic()
+ )
+
+ # Record end event
+ cuda.cuEventRecord(end_event, current_stream)
+ cuda.cuEventSynchronize(end_event)
+
+ # Calculate elapsed time
+ err, elapsed_time = cuda.cuEventElapsedTime(start_event, end_event)
+ avg_time = elapsed_time / iterations
+
+ # Print execution results
+ 
print(f"Kernel execution time: {avg_time:.4f} ms") + print( + f"Achieved memory throughput: {(3 * a.numel() * dtype.width // 8) / (avg_time / 1000) / 1e9:.2f} GB/s" + ) + print(f"First few elements of result: \n{c[:3, :3]}") + + # Destroy events + cuda.cuEventDestroy(start_event) + cuda.cuEventDestroy(end_event) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="example of elementwise apply to demonstrate building elementwise kernels" + ) + parser.add_argument("--M", default=128, type=int) + parser.add_argument("--N", default=128, type=int) + parser.add_argument("--op", default="add", type=str) + parser.add_argument("--warmup_iterations", default=2, type=int) + parser.add_argument("--iterations", default=100, type=int) + parser.add_argument("--skip_ref_check", action="store_true") + parser.add_argument("--benchmark", action="store_true") + args = parser.parse_args() + run_elementwise_apply_and_verify( + getattr(operator, args.op), + args.M, + args.N, + dtype=cutlass.Float32, + warmup_iterations=args.warmup_iterations, + iterations=args.iterations, + skip_ref_check=args.skip_ref_check, + benchmark=args.benchmark, + ) + print("\nPASS") diff --git a/examples/python/CuTeDSL/ampere/flash_attention_v2.py b/examples/python/CuTeDSL/ampere/flash_attention_v2.py new file mode 100644 index 00000000..0f41245e --- /dev/null +++ b/examples/python/CuTeDSL/ampere/flash_attention_v2.py @@ -0,0 +1,1353 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +from types import SimpleNamespace +from typing import Type, Union, Callable + +import torch +import cuda.bindings.driver as cuda + +import cutlass +import cutlass.cute as cute +from cutlass.cute.nvgpu import cpasync, warp +import cutlass.torch as cutlass_torch +from cutlass.cute.runtime import from_dlpack +import cutlass.utils.ampere_helpers as sm80_utils + +""" +A flash attention v2 forward pass example for NVIDIA Ampere SM80 architecture using CUTE DSL. 
+ +- Matrix Q is BxSqxNxH, B is batch dimension, Sq is query sequence length, N is number of heads, H is head dimension +- Matrix K is BxSkxNxH, B is batch dimension, Sk is key sequence length, N is number of heads, H is head dimension +- Matrix V is BxSkxNxH, B is batch dimension, Sk is key sequence length, N is number of heads, H is head dimension +- Matrix O is BxSqxNxH, B is batch dimension, Sq is query sequence length, N is number of heads, H is head dimension + +This kernel supports the following features: + - Utilizes CpAsync for efficient memory operations + - Utilizes Ampere's tensor core for matrix multiply-accumulate (MMA) operations + - Utilizes register pipeline to overlap shared memory-to-register transfers with computations. + - Leverages DSL to implement an integrated online softmax fusion pattern. + +This kernel works as follows: +1. Load Q and K matrices from global memory (GMEM) to shared memory (SMEM) using CpAsync operations. +2. Perform matrix multiply-accumulate (MMA) operations using tensor core instructions to compute intermediate result S. +3. Apply padding mask or causal mask to S during initial iterations. +4. Apply online softmax to S and rescale O using results from previous iteration. +5. Load V matrices and perform matrix multiply-accumulate (MMA) operations to compute final result O. +6. Normalize O after all iterations complete and store result back to global memory (GMEM). + +To run this example: + +.. code-block:: bash + + python examples/ampere/flash_attention_v2.py \ + --dtype Float16 --head_dim 128 --m_block_size 128 --n_block_size 128 \ + --num_threads 128 --batch_size 1 --seqlen_q 1280 --seqlen_k 1536 \ + --num_head 16 --softmax_scale 1.0 --is_causal + +The above command configures the model to use float16 for inputs and outputs. The problem dimensions +are set to: batch size of 1, query sequence length of 1280, key sequence length of 1536, head dimension +of 128, and 16 attention heads. 
The softmax scale is set to 1.0 and causal masking is enabled. The computation
+uses tiles of size 128x128 for m and n dimensions, and utilizes 128 parallel threads.
+
+To collect performance data with the NCU profiler:
+
+.. code-block:: bash
+
+ ncu python examples/ampere/flash_attention_v2.py \
+ --dtype Float16 --head_dim 128 --m_block_size 128 --n_block_size 128 \
+ --num_threads 128 --batch_size 1 --seqlen_q 1280 --seqlen_k 1536 \
+ --num_head 16 --softmax_scale 1.0 --is_causal --skip_ref_check
+
+There are some constraints for this example:
+* Only fp16 and bf16 data types are supported.
+* The contiguous dimension of each tensor must be at least 16 bytes aligned.
+* The log-sum-exp (for training) is not computed in the kernel.
+* The values of `m_block_size`, `n_block_size`, and `head_dim` must be selected to stay within shared memory capacity limits.
+* `m_block_size * 2` must be divisible by `num_threads`, otherwise the kernel will produce incorrect results.
+"""
+
+
+class FlashAttentionForwardAmpere:
+ def __init__(
+ self,
+ head_dim: int,
+ m_block_size: int = 128,
+ n_block_size: int = 128,
+ num_threads: int = 128,
+ is_causal: bool = False,
+ ):
+ """Initializes the configuration for a flash attention v2 kernel.
+
+ All contiguous dimensions must be at least 16-byte aligned, which means the head dimension
+ must be a multiple of 8.
+ + :param head_dim: head dimension + :type head_dim: int + :param m_block_size: m block size + :type m_block_size: int + :param n_block_size: n block size + :type n_block_size: int + :param num_threads: number of threads + :type num_threads: int + :param is_causal: is causal + """ + self._head_dim = head_dim + self._m_block_size = m_block_size + self._n_block_size = n_block_size + # padding head_dim to a multiple of 32 as k_block_size + self._head_dim_padded = (head_dim + 31) // 32 * 32 + self._num_threads = num_threads + self._is_causal = is_causal + + @staticmethod + def can_implement( + dtype, head_dim, m_block_size, n_block_size, num_threads, is_causal + ) -> bool: + """Check if the kernel can be implemented with the given parameters. + + :param dtype: data type + :type dtype: cutlass.Numeric + :param head_dim: head dimension + :type head_dim: int + :param m_block_size: m block size + :type m_block_size: int + :param n_block_size: n block size + :type n_block_size: int + :param num_threads: number of threads + :type num_threads: int + :param is_causal: is causal + :type is_causal: bool + + :return: True if the kernel can be implemented, False otherwise + :rtype: bool + """ + # Check if data type is fp16 or bf16 + if dtype != cutlass.Float16 and dtype != cutlass.BFloat16: + return False + + # Check if head dimension is a multiple of 8 + if head_dim % 8 != 0: + return False + + # Check if number of threads is a multiple of 32 + if num_threads % 32 != 0: + return False + + # Check if block size setting is out of shared memory capacity + # Shared memory usage: Q tile + (K tile + V tile) where K and V use the same tile size + smem_usage = (m_block_size * head_dim + n_block_size * head_dim * 2) * 2 + smem_capacity = sm80_utils.SMEM_CAPACITY["sm80"] + if smem_usage > smem_capacity: + return False + + # Check if twice the block size is divisible by the number of threads + if (m_block_size * 2) % num_threads != 0: + return False + + return True + + @cute.jit + def 
__call__( + self, + mQ: cute.Tensor, + mK: cute.Tensor, + mV: cute.Tensor, + mO: cute.Tensor, + softmax_scale: cutlass.Float32, + stream: cuda.CUstream, + ): + """Configures and launches the flash attention v2 kernel. + + mQ/mK/mV/mO has same data types(supports fp16 and bf16) and same layout: + (batch_size, seqlen_q, num_head, head_dim):(seqlen_q * num_head * head_dim, num_head * head_dim, head_dim, 1) + + Prepares the shared memory layout, tiled copy atoms, tiled mma and shared memory storage. + Then launches the kernel function with the prepared parameters. + + :param mQ: query tensor + :type mQ: cute.Tensor + :param mK: key tensor + :type mK: cute.Tensor + :param mV: value tensor + :type mV: cute.Tensor + :param mO: output tensor + :type mO: cute.Tensor + :param softmax_scale: softmax scale + :type softmax_scale: cutlass.Float32 + """ + # Get the data type and check if it is fp16 or bf16 + if cutlass.const_expr( + not ( + mQ.element_type == mK.element_type == mV.element_type == mO.element_type + ) + ): + raise TypeError("All tensors must have the same data type") + if cutlass.const_expr( + not ( + mQ.element_type == cutlass.Float16 + or mQ.element_type == cutlass.BFloat16 + ) + ): + raise TypeError("Only Float16 or BFloat16 is supported") + self._dtype: Type[cutlass.Numeric] = mQ.element_type + # /////////////////////////////////////////////////////////////////////////////// + # Shared memory layout: Q/K/V + # /////////////////////////////////////////////////////////////////////////////// + smem_k_block_size = 64 if self._head_dim_padded % 64 == 0 else 32 + swizzle_bits = 3 if smem_k_block_size == 64 else 2 + sQ_layout_atom = cute.make_composed_layout( + cute.make_swizzle(swizzle_bits, 3, 3), + 0, + cute.make_layout((8, smem_k_block_size), stride=(smem_k_block_size, 1)), + ) + sQ_layout = cute.tile_to_shape( + sQ_layout_atom, + (self._m_block_size, self._head_dim_padded), + (0, 1), + ) + + sKV_layout_atom = sQ_layout_atom + sKV_layout = cute.tile_to_shape( + 
sKV_layout_atom, + (self._n_block_size, self._head_dim_padded), + (0, 1), + ) + + sO_layout = sQ_layout + + @cute.struct + class SharedStorage: + sQ: cute.struct.Align[ + cute.struct.MemRange[self._dtype, cute.cosize(sQ_layout)], 1024 + ] + sK: cute.struct.Align[ + cute.struct.MemRange[self._dtype, cute.cosize(sKV_layout)], 1024 + ] + sV: cute.struct.Align[ + cute.struct.MemRange[self._dtype, cute.cosize(sKV_layout)], 1024 + ] + + # /////////////////////////////////////////////////////////////////////////////// + # GMEM Tiled copy: + # /////////////////////////////////////////////////////////////////////////////// + # Thread layouts for copies + universal_copy_bits = 128 + async_copy_elems = universal_copy_bits // self._dtype.width + # atom_async_copy: async copy atom for QKV load + atom_async_copy = cute.make_copy_atom( + cpasync.CopyG2SOp(cache_mode=cpasync.LoadCacheMode.GLOBAL), + self._dtype, + num_bits_per_copy=universal_copy_bits, + ) + # atom_universal_copy: universal copy atom for O store + atom_universal_copy = cute.make_copy_atom( + cute.nvgpu.CopyUniversalOp(), + self._dtype, + num_bits_per_copy=universal_copy_bits, + ) + # tQKV_layout: thread layout for QKV load + tQKV_shape_dim_1 = sQ_layout_atom.outer.shape[1] // async_copy_elems + tQKV_layout = cute.make_layout( + (self._num_threads // tQKV_shape_dim_1, tQKV_shape_dim_1), + stride=(tQKV_shape_dim_1, 1), + ) + # tO_layout: thread layout for O store + tO_layout = tQKV_layout + + # Value layouts for copies + vQKV_layout = cute.make_layout((1, async_copy_elems)) + vO_layout = vQKV_layout + + # gmem_tiled_copy_QKV: tiled copy for QKV load + gmem_tiled_copy_QKV = cute.make_tiled_copy_tv( + atom_async_copy, tQKV_layout, vQKV_layout + ) + # gmem_tiled_copy_O: tiled copy for O store + gmem_tiled_copy_O = cute.make_tiled_copy_tv( + atom_universal_copy, tO_layout, vO_layout + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Tiled mma + # 
/////////////////////////////////////////////////////////////////////////////// + tiled_mma = cute.make_tiled_mma( + warp.MmaF16BF16Op(self._dtype, cutlass.Float32, (16, 8, 16)), + (self._num_threads // 32, 1, 1), + permutation_mnk=(self._num_threads // 32 * 16, 16, 16), + ) + + # grid_dim: (m_block, batch_size, num_head) + grid_dim = ( + cute.ceil_div(mQ.shape[1], self._m_block_size), + cute.size(mQ.shape[0]), + cute.size(mQ.shape[2]), + ) + LOG2_E = 1.4426950408889634074 + softmax_scale_log2 = softmax_scale * LOG2_E + self.kernel( + mQ, + mK, + mV, + mO, + softmax_scale_log2, + sQ_layout, + sKV_layout, + sO_layout, + gmem_tiled_copy_QKV, + gmem_tiled_copy_O, + tiled_mma, + SharedStorage, + ).launch( + grid=grid_dim, + block=[self._num_threads, 1, 1], + smem=SharedStorage.size_in_bytes(), + stream=stream, + ) + + @cute.kernel + def kernel( + self, + mQ: cute.Tensor, + mK: cute.Tensor, + mV: cute.Tensor, + mO: cute.Tensor, + softmax_scale_log2: cutlass.Float32, + sQ_layout: cute.ComposedLayout, + sKV_layout: cute.ComposedLayout, + sO_layout: cute.ComposedLayout, + gmem_tiled_copy_QKV: cute.TiledCopy, + gmem_tiled_copy_O: cute.TiledCopy, + tiled_mma: cute.TiledMma, + SharedStorage: cutlass.Constexpr, + ): + """Kernel function for flash attention v2. 
+ + :param mQ: query tensor + :type mQ: cute.Tensor + :param mK: key tensor + :type mK: cute.Tensor + :param mV: value tensor + :type mV: cute.Tensor + :param mO: output tensor + :type mO: cute.Tensor + :param softmax_scale_log2: softmax scale log2 + :type softmax_scale_log2: cutlass.Float32 + :param sQ_layout: query layout + :type sQ_layout: cute.ComposedLayout + :param sKV_layout: key/value layout + :type sKV_layout: cute.ComposedLayout + :param sO_layout: output layout + :type sO_layout: cute.ComposedLayout + :param gmem_tiled_copy_QKV: tiled copy for QKV load + :type gmem_tiled_copy_QKV: cute.TiledCopy + :param gmem_tiled_copy_O: tiled copy for O store + :type gmem_tiled_copy_O: cute.TiledCopy + :param tiled_mma: tiled mma + :type tiled_mma: cute.TiledMma + :param SharedStorage: shared storage + :type SharedStorage: cutlass.Constexpr + """ + # Thread index, block index + tidx, _, _ = cute.arch.thread_idx() + m_block, batch_size, num_head = cute.arch.block_idx() + + n_block_max = cute.ceil_div(mK.shape[1], self._n_block_size) + if self._is_causal: + n_block_max = min( + cute.ceil_div( + (m_block + 1) * self._m_block_size, + self._n_block_size, + ), + n_block_max, + ) + n_block = n_block_max - 1 + + # /////////////////////////////////////////////////////////////////////////////// + # Get the appropriate tiles for this thread block. 
+ # /////////////////////////////////////////////////////////////////////////////// + # (m_block_size, head_dim) + gQ = cute.local_tile( + mQ[batch_size, None, num_head, None], + (self._m_block_size, self._head_dim_padded), + (m_block, 0), + ) + # (n_block_size, head_dim, n_block) + gK = cute.local_tile( + mK[batch_size, None, num_head, None], + (self._n_block_size, self._head_dim_padded), + (None, 0), + ) + # (n_block_size, head_dim, n_block) + gV = cute.local_tile( + mV[batch_size, None, num_head, None], + (self._n_block_size, self._head_dim_padded), + (None, 0), + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Get shared memory buffer + # /////////////////////////////////////////////////////////////////////////////// + smem = cutlass.utils.SmemAllocator() + + storage = smem.allocate(SharedStorage) + sQ = storage.sQ.get_tensor(sQ_layout) + sK = storage.sK.get_tensor(sKV_layout) + sV = storage.sV.get_tensor(sKV_layout) + + # Transpose view of V to tensor with layout (head_dim, n_block_size) for tiled mma + sVt = cute.composition( + sV, + cute.make_layout( + (self._head_dim_padded, self._n_block_size), + stride=(self._n_block_size, 1), + ), + ) + + gmem_thr_copy_QKV = gmem_tiled_copy_QKV.get_slice(tidx) + # (CPY_Atom, CPY_M, CPY_K) + tQgQ = gmem_thr_copy_QKV.partition_S(gQ) + tQsQ = gmem_thr_copy_QKV.partition_D(sQ) + # (CPY_Atom, CPY_N, CPY_K, n_block) + tKgK = gmem_thr_copy_QKV.partition_S(gK) + tKsK = gmem_thr_copy_QKV.partition_D(sK) + # (CPY_Atom, CPY_N, CPY_K, n_block) + tVgV = gmem_thr_copy_QKV.partition_S(gV) + tVsV = gmem_thr_copy_QKV.partition_D(sV) + + # /////////////////////////////////////////////////////////////////////////////// + # Tile MMA compute thread partitions and allocate accumulators + # /////////////////////////////////////////////////////////////////////////////// + thr_mma = tiled_mma.get_slice(tidx) + tSrQ = thr_mma.make_fragment_A(thr_mma.partition_A(sQ)) + tSrK = 
thr_mma.make_fragment_B(thr_mma.partition_B(sK)) + tOrVt = thr_mma.make_fragment_B(thr_mma.partition_B(sVt)) + acc_shape_O = thr_mma.partition_shape_C( + (self._m_block_size, self._head_dim_padded) + ) + acc_O = cute.make_fragment(acc_shape_O, cutlass.Float32) + acc_O.fill(0.0) + + # /////////////////////////////////////////////////////////////////////////////// + # Smem copy atom tiling + # /////////////////////////////////////////////////////////////////////////////// + smem_copy_atom_Q = cute.make_copy_atom( + warp.LdMatrix8x8x16bOp(transpose=False, num_matrices=4), + self._dtype, + ) + smem_copy_atom_K = cute.make_copy_atom( + warp.LdMatrix8x8x16bOp(transpose=False, num_matrices=4), + self._dtype, + ) + smem_copy_atom_V = cute.make_copy_atom( + warp.LdMatrix8x8x16bOp(transpose=True, num_matrices=4), + self._dtype, + ) + smem_tiled_copy_Q = cute.make_tiled_copy( + smem_copy_atom_Q, + layout_tv=tiled_mma.tv_layout_A_tiled, + tiler_mn=(tiled_mma.get_tile_size(0), tiled_mma.get_tile_size(2)), + ) + smem_tiled_copy_K = cute.make_tiled_copy( + smem_copy_atom_K, + layout_tv=tiled_mma.tv_layout_B_tiled, + tiler_mn=(tiled_mma.get_tile_size(1), tiled_mma.get_tile_size(2)), + ) + smem_tiled_copy_V = cute.make_tiled_copy( + smem_copy_atom_V, + layout_tv=tiled_mma.tv_layout_B_tiled, + tiler_mn=(tiled_mma.get_tile_size(1), tiled_mma.get_tile_size(2)), + ) + + smem_thr_copy_Q = smem_tiled_copy_Q.get_slice(tidx) + smem_thr_copy_K = smem_tiled_copy_K.get_slice(tidx) + smem_thr_copy_V = smem_tiled_copy_V.get_slice(tidx) + + tSsQ = smem_thr_copy_Q.partition_S(sQ) + tSrQ_copy_view = smem_thr_copy_Q.retile(tSrQ) + tSsK = smem_thr_copy_K.partition_S(sK) + tSrK_copy_view = smem_thr_copy_K.retile(tSrK) + tOsVt = smem_thr_copy_V.partition_S(sVt) + tOrVt_copy_view = smem_thr_copy_V.retile(tOrVt) + + # /////////////////////////////////////////////////////////////////////////////// + # Predicate: Mark indices that need to copy when problem_shape isn't a multiple + # of tile_shape + # 
/////////////////////////////////////////////////////////////////////////////// + # Construct identity layout for Q and KV + mcQ = cute.make_identity_tensor(mQ.layout.shape) + mcKV = cute.make_identity_tensor(mK.layout.shape) + cQ = cute.local_tile( + mcQ[batch_size, None, num_head, None], + (self._m_block_size, self._head_dim_padded), + (m_block, 0), + ) + cKV = cute.local_tile( + mcKV[batch_size, None, num_head, None], + (self._n_block_size, self._head_dim_padded), + (n_block, 0), + ) + + # Repeat the partitioning with identity layouts + tQcQ = gmem_thr_copy_QKV.partition_S(cQ) + tKVcKV = gmem_thr_copy_QKV.partition_S(cKV) + # Allocate predicate tensors for m and n, here we only allocate the tile of k, and do special process for mn. + # This is to reduce register pressure and gets 2-3% performance gain compared with allocating the whole tile. + tQpQ = cute.make_fragment( + cute.make_layout( + ( + tQsQ.shape[0][1], + cute.size(tQsQ, mode=[1]), + cute.size(tQsQ, mode=[2]), + ), + stride=(cute.size(tQsQ, mode=[2]), 0, 1), + ), + cutlass.Boolean, + ) + tKVpKV = cute.make_fragment( + cute.make_layout( + ( + tKsK.shape[0][1], + cute.size(tKsK, mode=[1]), + cute.size(tKsK, mode=[2]), + ), + stride=(cute.size(tKsK, mode=[2]), 0, 1), + ), + cutlass.Boolean, + ) + # Set predicates for head_dim bounds, seqlen_q/k bounds is processed at the first tile. 
+ for rest_v in range(tQpQ.shape[0]): + for rest_k in range(tQpQ.shape[2]): + tQpQ[rest_v, 0, rest_k] = cute.elem_less( + tQcQ[(0, rest_v), 0, rest_k][3], mQ.layout.shape[3] + ) + for rest_v in range(tKVpKV.shape[0]): + for rest_k in range(tKVpKV.shape[2]): + tKVpKV[rest_v, 0, rest_k] = cute.elem_less( + tKVcKV[(0, rest_v), 0, rest_k][3], mK.layout.shape[3] + ) + # /////////////////////////////////////////////////////////////////////////////// + # Prefetch Prologue + # /////////////////////////////////////////////////////////////////////////////// + # Start async loads of the last mn-tile, where we take care of the mn residue + for m in range(cute.size(tQsQ.shape[1])): + if cute.elem_less(tQcQ[0, m, 0][1], mQ.layout.shape[1]): + cute.copy( + gmem_tiled_copy_QKV, + tQgQ[None, m, None], + tQsQ[None, m, None], + pred=tQpQ[None, m, None], + ) + else: + # Clear the smem tiles to account for predicated off loads + tQsQ[None, m, None].fill(0) + for n in range(cute.size(tKsK.shape[1])): + if cute.elem_less(tKVcKV[0, n, 0][1], mK.layout.shape[1]): + cute.copy( + gmem_tiled_copy_QKV, + tKgK[None, n, None, n_block], + tKsK[None, n, None], + pred=tKVpKV[None, n, None], + ) + else: + # Clear the smem tiles to account for predicated off loads + tKsK[None, n, None].fill(0) + + cute.arch.cp_async_commit_group() + # /////////////////////////////////////////////////////////////////////////////// + # Softmax intermediate result: row_max and row_sum + # /////////////////////////////////////////////////////////////////////////////// + # shape: (atom_v_m * rest_m) + row_max = cute.make_fragment( + (acc_O.shape[0][0] * acc_O.shape[1]), cutlass.Float32 + ) + # shape: (atom_v_m * rest_m) + row_sum = cute.make_fragment( + (acc_O.shape[0][0] * acc_O.shape[1]), cutlass.Float32 + ) + row_max.fill(-cutlass.Float32.inf) + row_sum.fill(0.0) + + # group parameters for compute_one_n_block + basic_params = SimpleNamespace( + m_block=m_block, + n_block=n_block, + mQ=mQ, + mK=mK, + 
batch_size=batch_size, + num_head=num_head, + ) + mma_params = SimpleNamespace( + thr_mma=thr_mma, + tiled_mma=tiled_mma, + tSrQ=tSrQ, + tSrK=tSrK, + tOrVt=tOrVt, + acc_O=acc_O, + ) + gmem_copy_params = SimpleNamespace( + gmem_tiled_copy_QKV=gmem_tiled_copy_QKV, + tKVcKV=tKVcKV, + tKgK=tKgK, + tKsK=tKsK, + tVgV=tVgV, + tVsV=tVsV, + tKVpKV=tKVpKV, + ) + smem_copy_params = SimpleNamespace( + smem_tiled_copy_Q=smem_tiled_copy_Q, + smem_tiled_copy_K=smem_tiled_copy_K, + smem_tiled_copy_V=smem_tiled_copy_V, + tSsQ=tSsQ, + tSrQ_copy_view=tSrQ_copy_view, + tSsK=tSsK, + tSrK_copy_view=tSrK_copy_view, + tOsVt=tOsVt, + tOrVt_copy_view=tOrVt_copy_view, + ) + softmax_params = SimpleNamespace( + row_max=row_max, + row_sum=row_sum, + softmax_scale_log2=softmax_scale_log2, + ) + + # Start processing of the first n-block. + # For performance reason, we separate out two kinds of iterations: + # those that need masking on S, and those that don't. + # We need masking on S for the very last block when K and V has length not multiple of n_block_size. + # We also need masking on S if it's causal, for the last ceil_div(m_block_size, n_block_size) blocks. + # We will have at least 1 "masking" iteration. 
+ mask_steps = 1 + if self._is_causal: + mask_steps = cute.ceil_div(self._m_block_size, self._n_block_size) + + for n_tile in range(mask_steps): + n_block = n_block_max - n_tile - 1 + basic_params.n_block = n_block + if self._is_causal: + if n_block >= 0: + self.compute_one_n_block( + basic_params, + mma_params, + gmem_copy_params, + smem_copy_params, + softmax_params, + is_first_n_block=(n_tile == 0), + in_mask_steps=True, + ) + else: + self.compute_one_n_block( + basic_params, + mma_params, + gmem_copy_params, + smem_copy_params, + softmax_params, + is_first_n_block=True, + in_mask_steps=True, + ) + + # Start async loads of rest k-tiles in reverse order, no k-residue handling needed + for n_tile in cutlass.range_dynamic(mask_steps, n_block_max, 1): + n_block = n_block_max - n_tile - 1 + basic_params.n_block = n_block + self.compute_one_n_block( + basic_params, + mma_params, + gmem_copy_params, + smem_copy_params, + softmax_params, + is_first_n_block=False, + in_mask_steps=False, + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Epilogue + # /////////////////////////////////////////////////////////////////////////////// + # normalize acc_O by row_sum and calculate the lse + self.normalize_softmax(acc_O, row_sum) + # store acc_O + rO = cute.make_fragment_like(acc_O, self._dtype) + rO.store(acc_O.load().to(self._dtype)) + # reuse sQ's data iterator + sO = cute.make_tensor(sQ.iterator, sO_layout) + + # smem copy atom for O + smem_copy_atom_O = cute.make_copy_atom( + cute.nvgpu.CopyUniversalOp(), self._dtype + ) + # tiled copy atom for O + smem_tiled_copy_O = cute.make_tiled_copy( + smem_copy_atom_O, + layout_tv=tiled_mma.tv_layout_C_tiled, + tiler_mn=(tiled_mma.get_tile_size(0), tiled_mma.get_tile_size(1)), + ) + smem_thr_copy_O = smem_tiled_copy_O.get_slice(tidx) + taccOrO = smem_thr_copy_O.retile(rO) + taccOsO = smem_thr_copy_O.partition_D(sO) + # copy acc O from rmem to smem with the smem copy atom + cute.copy( + 
smem_copy_atom_O, + taccOrO, + taccOsO, + ) + gO = cute.local_tile( + mO[batch_size, None, num_head, None], + (self._m_block_size, self._head_dim_padded), + (m_block, 0), + ) + + gmem_thr_copy_O = gmem_tiled_copy_O.get_slice(tidx) + tOsO = gmem_thr_copy_O.partition_S(sO) + tOgO = gmem_thr_copy_O.partition_D(gO) + tOrO = cute.make_fragment_like(tOgO, self._dtype) + # sync before all smem stores are done. + cute.arch.barrier() + # load acc O from smem to rmem for wider vectorization + cute.copy( + gmem_tiled_copy_O, + tOsO, + tOrO, + ) + mcO = cute.make_identity_tensor(mO.layout.shape) + cO = cute.local_tile( + mcO[batch_size, None, num_head, None], + (self._m_block_size, self._head_dim_padded), + (m_block, 0), + ) + tOcO = gmem_thr_copy_O.partition_D(cO) + tOpO = cute.make_fragment( + cute.make_layout( + (tOgO.shape[0][1], tOgO.shape[1], tOgO.shape[2]), + stride=(tOgO.shape[2], 0, 1), + ), + cutlass.Boolean, + ) + for rest_v in range(tOpO.shape[0]): + for rest_n in range(cute.size(tOpO.shape[2])): + tOpO[rest_v, 0, rest_n] = cute.elem_less( + tOcO[(0, rest_v), 0, rest_n][3], mO.layout.shape[3] + ) + # copy acc O from rmem to gmem + for rest_m in range(cute.size(tOpO.shape[1])): + if cute.elem_less(tOcO[0, rest_m, 0][1], mO.layout.shape[1]): + cute.copy( + gmem_tiled_copy_O, + tOrO[None, rest_m, None], + tOgO[None, rest_m, None], + pred=tOpO[None, rest_m, None], + ) + + @cute.jit + def compute_one_n_block( + self, + basic_params: SimpleNamespace, + mma_params: SimpleNamespace, + gmem_copy_params: SimpleNamespace, + smem_copy_params: SimpleNamespace, + softmax_params: SimpleNamespace, + is_first_n_block: cutlass.Constexpr, + in_mask_steps: cutlass.Constexpr, + ): + """Compute one n_block of S/O. + + This function provides different variants for processing the first n block versus subsequent blocks, + as well as variants for handling masked and unmasked steps. 
+ + :param basic_params: basic parameters + :type basic_params: SimpleNamespace + :param mma_params: mma parameters + :type mma_params: SimpleNamespace + :param gmem_copy_params: gmem copy parameters + :type gmem_copy_params: SimpleNamespace + :param smem_copy_params: smem copy parameters + :type smem_copy_params: SimpleNamespace + :param softmax_params: softmax parameters + :type softmax_params: SimpleNamespace + :param is_first_n_block: is first n block + :type is_first_n_block: cutlass.Constexpr + """ + acc_shape_S = mma_params.thr_mma.partition_shape_C( + (self._m_block_size, self._n_block_size) + ) + acc_S = cute.make_fragment(acc_shape_S, cutlass.Float32) + acc_S.fill(0.0) + + # wait for smem tile QK before mma calculation for S + cute.arch.cp_async_wait_group(0) + cute.arch.barrier() + # load smem tile V for O, special process for the first tile to avoid loading nan. + # The `if` here is a constexpr, won't be generated in the IR. + if is_first_n_block: + for n in range(cute.size(gmem_copy_params.tVsV.shape[1])): + if cute.elem_less( + gmem_copy_params.tKVcKV[0, n, 0][1], + basic_params.mK.layout.shape[1], + ): + cute.copy( + gmem_copy_params.gmem_tiled_copy_QKV, + gmem_copy_params.tVgV[None, n, None, basic_params.n_block], + gmem_copy_params.tVsV[None, n, None], + pred=gmem_copy_params.tKVpKV[None, n, None], + ) + else: + gmem_copy_params.tVsV[None, n, None].fill(0.0) + else: + cute.copy( + gmem_copy_params.gmem_tiled_copy_QKV, + gmem_copy_params.tVgV[None, None, None, basic_params.n_block], + gmem_copy_params.tVsV, + pred=gmem_copy_params.tKVpKV, + ) + + cute.arch.cp_async_commit_group() + # /////////////////////////////////////////////////////////////////////////////// + # S gemm calculation + # /////////////////////////////////////////////////////////////////////////////// + # load first QK k-block from smem to rmem for mma + cute.copy( + smem_copy_params.smem_tiled_copy_Q, + smem_copy_params.tSsQ[None, None, 0], + smem_copy_params.tSrQ_copy_view[None, 
None, 0], + ) + cute.copy( + smem_copy_params.smem_tiled_copy_K, + smem_copy_params.tSsK[None, None, 0], + smem_copy_params.tSrK_copy_view[None, None, 0], + ) + # mma for S + for k in range(cute.size(smem_copy_params.tSsQ.shape[2])): + # load next QK k-block from smem to rmem for mma + k_next = (k + 1) % cute.size(smem_copy_params.tSsQ.shape[2]) + cute.copy( + smem_copy_params.smem_tiled_copy_Q, + smem_copy_params.tSsQ[None, None, k_next], + smem_copy_params.tSrQ_copy_view[None, None, k_next], + ) + cute.copy( + smem_copy_params.smem_tiled_copy_K, + smem_copy_params.tSsK[None, None, k_next], + smem_copy_params.tSrK_copy_view[None, None, k_next], + ) + cute.gemm( + mma_params.tiled_mma, + acc_S, + mma_params.tSrQ[None, None, k], + mma_params.tSrK[None, None, k], + acc_S, + ) + + # wait for smem tile V for O + cute.arch.cp_async_wait_group(0) + cute.arch.barrier() + + if basic_params.n_block > 0: + cute.copy( + gmem_copy_params.gmem_tiled_copy_QKV, + gmem_copy_params.tKgK[None, None, None, basic_params.n_block - 1], + gmem_copy_params.tKsK, + pred=gmem_copy_params.tKVpKV, + ) + cute.arch.cp_async_commit_group() + # /////////////////////////////////////////////////////////////////////////////// + # online softmax + # /////////////////////////////////////////////////////////////////////////////// + self.softmax_rescale_O( + basic_params, + mma_params, + softmax_params, + acc_S, + is_first_n_block, + in_mask_steps, + ) + + rP = cute.make_fragment_like(acc_S, self._dtype) + rP.store(acc_S.load().to(self._dtype)) + # /////////////////////////////////////////////////////////////////////////////// + # O gemm calculation + # /////////////////////////////////////////////////////////////////////////////// + # Convert layout of acc_S to gemm O accept layout. 
+ # Due to the mma instruction shape is 16x8x16, we need to convert from (4, MMA_M, MMA_N) to ((4, 2), MMA_M, MMA_N / 2) + # (4, MMA_M, MMA_N) -> (4, MMA_M, (2, MMA_N / 2)) + rP_layout_divided = cute.logical_divide(rP.layout, (None, None, 2)) + rP_mma_view = cute.make_layout( + ( + (rP_layout_divided.shape[0], rP_layout_divided.shape[2][0]), + rP_layout_divided.shape[1], + rP_layout_divided.shape[2][1], + ), + stride=( + (rP_layout_divided.stride[0], rP_layout_divided.stride[2][0]), + rP_layout_divided.stride[1], + rP_layout_divided.stride[2][1], + ), + ) + tOrS = cute.make_tensor(rP.iterator, rP_mma_view) + + # load first V k-block from smem to rmem for mma + cute.copy( + smem_copy_params.smem_tiled_copy_V, + smem_copy_params.tOsVt[None, None, 0], + smem_copy_params.tOrVt_copy_view[None, None, 0], + ) + # mma for O + for k in range(cute.size(tOrS.shape[2])): + # load next V k-block from smem to rmem for mma + k_next = (k + 1) % cute.size(tOrS.shape[2]) + cute.copy( + smem_copy_params.smem_tiled_copy_V, + smem_copy_params.tOsVt[None, None, k_next], + smem_copy_params.tOrVt_copy_view[None, None, k_next], + ) + cute.gemm( + mma_params.tiled_mma, + mma_params.acc_O, + tOrS[None, None, k], + mma_params.tOrVt[None, None, k], + mma_params.acc_O, + ) + + @cute.jit + def softmax_rescale_O( + self, + basic_params: SimpleNamespace, + mma_params: SimpleNamespace, + softmax_params: SimpleNamespace, + acc_S: cute.Tensor, + is_first_n_block: cutlass.Constexpr, + in_mask_steps: cutlass.Constexpr, + ): + """Apply online softmax and rescale acc_O. + + This function provides different variants for processing the first n block versus subsequent blocks, + as well as variants for handling masked and unmasked steps. 
+ + :param basic_params: basic parameters + :type basic_params: SimpleNamespace + :param mma_params: mma parameters + :type mma_params: SimpleNamespace + :param softmax_params: softmax parameters + :type softmax_params: SimpleNamespace + :param acc_S: acc_S tensor + :type acc_S: cute.Tensor + :param is_first_n_block: is first n_block + :type is_first_n_block: cutlass.Constexpr + :param in_mask_steps: in mask steps + :type in_mask_steps: cutlass.Constexpr + """ + # Change acc_S to M,N layout view. + acc_S_mn = self._make_acc_tensor_mn_view(acc_S) + acc_O_mn = self._make_acc_tensor_mn_view(mma_params.acc_O) + row_max_prev = None + # if it is not the first tile, load the row r of previous row_max and compare with row_max_cur_row. + if not is_first_n_block: + row_max_prev = cute.make_fragment_like( + softmax_params.row_max, cutlass.Float32 + ) + cute.basic_copy(softmax_params.row_max, row_max_prev) + # if it is the first tile, create a mask for residual of S to -inf for softmax. + tScS_mn = None + if in_mask_steps: + mcS = cute.make_identity_tensor( + ( + basic_params.mQ.shape[0], + basic_params.mQ.shape[1], + basic_params.mQ.shape[2], + basic_params.mK.shape[1], + ) + ) + cS = cute.local_tile( + mcS[basic_params.batch_size, None, basic_params.num_head, None], + (self._m_block_size, self._n_block_size), + (basic_params.m_block, basic_params.n_block), + ) + tScS = mma_params.thr_mma.partition_C(cS) + tScS_mn = self._make_acc_tensor_mn_view(tScS) + + # Each iteration processes one row of acc_S + for r in range(cute.size(softmax_params.row_max)): + # mask residual of S with -inf + if in_mask_steps: + if not self._is_causal: + # traverse column index. + for c in range(cute.size(tScS_mn.shape[1])): + if cute.elem_less( + basic_params.mK.shape[1], tScS_mn[0, c][3] + 1 + ): + acc_S_mn[r, c] = -cutlass.Float32.inf + else: + # get the column index limit based on current row. Only consider the row index, so the column index sets to 0. 
+ col_idx_limit = cutlass.min( + tScS_mn[r, 0][1] + 1, basic_params.mK.shape[1] + ) + # traverse column index. + for c in range(cute.size(tScS_mn.shape[1])): + # only consider the column index, so the row index sets to 0. + if cute.elem_less(col_idx_limit, tScS_mn[0, c][3] + 1): + acc_S_mn[r, c] = -cutlass.Float32.inf + + # (n_block_size) + acc_S_row = acc_S_mn[r, None].load() + # row_max_cur_row => f32 + row_max_cur_row = acc_S_row.reduce( + cute.ReductionOp.MAX, -cutlass.Float32.inf, 0 + ) + # quad reduction for row_max + row_max_cur_row = self._threadquad_reduce_max(row_max_cur_row) + row_max_prev_row = None + # if it is not the first tile, load the row r of previous row_max and compare with row_max_cur_row. + if not is_first_n_block: + row_max_prev_row = row_max_prev[r] + row_max_cur_row = cute.arch.fmax(row_max_prev_row, row_max_cur_row) + if self._is_causal: + row_max_cur_row = ( + 0.0 if row_max_cur_row == -cutlass.Float32.inf else row_max_cur_row + ) + + # compute exp(x - max) using exp2(x * log_2(e) - max * log_2(e)) + acc_S_row_exp = cute.TensorSSA( + self._exp2f( + acc_S_row * softmax_params.softmax_scale_log2 + - row_max_cur_row * softmax_params.softmax_scale_log2 + ), + tuple(acc_S_row.shape), + cutlass.Float32, + ) + # acc_S_row_sum => f32 + acc_S_row_sum = acc_S_row_exp.reduce( + cute.ReductionOp.ADD, cutlass.Float32.zero, 0 + ) + # if it is not the first tile, load the row r of previous row_max and minus row_max_cur_row to update row_sum. 
            # Not the first n-block: rescale the running statistics by
            # exp2((prev_max - cur_max) * scale) so that row_sum and acc_O stay
            # normalized to the *current* row maximum (online-softmax update).
            if not is_first_n_block:
                prev_minus_cur_exp = self._exp2f(
                    row_max_prev_row * softmax_params.softmax_scale_log2
                    - row_max_cur_row * softmax_params.softmax_scale_log2
                )
                # Fold the previous partial sum (rescaled) into this tile's sum.
                acc_S_row_sum = (
                    acc_S_row_sum + softmax_params.row_sum[r] * prev_minus_cur_exp
                )
                # Rescale the accumulated output row by the same factor.
                acc_O_mn[r, None] = acc_O_mn[r, None].load() * prev_minus_cur_exp
            # update row_max, row_sum and acc_S
            softmax_params.row_max[r] = row_max_cur_row
            softmax_params.row_sum[r] = acc_S_row_sum
            acc_S_mn[r, None] = acc_S_row_exp

    @cute.jit
    def normalize_softmax(
        self,
        acc_O: cute.Tensor,
        row_sum: cute.Tensor,
    ):
        """Normalize acc_O by row_sum (final softmax denominator division).

        Each row of ``acc_O`` is multiplied by ``1 / row_sum[r]`` after a
        thread-quad reduction completes the per-row sum. Rows whose sum is
        zero or NaN (fully masked rows) are left unscaled (scale = 1.0) to
        avoid producing inf/NaN in the output.

        :param acc_O: output accumulator tensor, scaled in place
        :type acc_O: cute.Tensor
        :param row_sum: per-row softmax denominators, reduced in place
        :type row_sum: cute.Tensor
        """
        # do quad reduction for row_sum.
        acc_O_mn = self._make_acc_tensor_mn_view(acc_O)
        for r in range(cute.size(row_sum)):
            row_sum[r] = self._threadquad_reduce_sum(row_sum[r])
            # if row_sum is zero or nan, set acc_O_mn_row to 1.0
            # (x != x is the standard register-level NaN test)
            acc_O_mn_row_is_zero_or_nan = row_sum[r] == 0.0 or row_sum[r] != row_sum[r]

            scale = (
                1.0 if acc_O_mn_row_is_zero_or_nan else cute.arch.rcp_approx(row_sum[r])
            )

            acc_O_mn[r, None] = acc_O_mn[r, None].load() * scale

    def _make_acc_tensor_mn_view(self, acc: cute.Tensor) -> cute.Tensor:
        """make acc tensor as mn layout view

        Regroups the MMA accumulator layout ((2, 2), MMA_M, MMA_N) into a
        2-D ((rows), (cols)) view so softmax can iterate row by row.

        :param acc: input tensor
        :type acc: cute.Tensor
        :return: acc tensor mn layout view
        :rtype: cute.Tensor
        """
        acc_layout_col_major = cute.make_layout(acc.layout.shape)
        acc_layout_mn = cute.make_layout(
            (
                (
                    acc_layout_col_major.shape[0][1],
                    acc_layout_col_major.shape[1],
                ),  # MMA_M
                (
                    acc_layout_col_major.shape[0][0],
                    acc_layout_col_major.shape[2],
                ),  # MMA_N
            ),
            stride=(
                (
                    acc_layout_col_major.stride[0][1],
                    acc_layout_col_major.stride[1],
                ),  # MMA_M
                (
                    acc_layout_col_major.stride[0][0],
                    acc_layout_col_major.stride[2],
                ),  # MMA_N
            ),
        )
        acc_layout_mn = cute.composition(acc.layout, acc_layout_mn)
        return
cute.make_tensor(acc.iterator, acc_layout_mn) + + def _threadquad_reduce(self, val: cutlass.Float32, op: Callable) -> cutlass.Float32: + """thread quad reduction + + :param val: register value + :type val: cutlass.Float32 + :param op: binary operator + :type op: Callable + :return: reduced value + :rtype: cutlass.Float32 + """ + val = op( + val, + cute.arch.shuffle_sync_bfly(val, offset=2, mask=-1, mask_and_clamp=31), + ) + val = op( + val, + cute.arch.shuffle_sync_bfly(val, offset=1, mask=-1, mask_and_clamp=31), + ) + return val + + def _threadquad_reduce_max(self, val: cutlass.Float32) -> cutlass.Float32: + """thread quad reduction max + + :param val: register value + :type val: cutlass.Float32 + :return: max value + :rtype: cutlass.Float32 + """ + return self._threadquad_reduce(val, lambda x, y: cute.arch.fmax(x, y)) + + def _threadquad_reduce_sum(self, val: cutlass.Float32) -> cutlass.Float32: + """thread quad reduction sum + + :param val: register value + :type val: cutlass.Float32 + :return: sum value + :rtype: cutlass.Float32 + """ + return self._threadquad_reduce(val, lambda x, y: x + y) + + def _exp2f( + self, x: Union[cute.TensorSSA, cutlass.Float32] + ) -> Union[cute.TensorSSA, cutlass.Float32]: + """exp2f calculation for both vector and scalar. 
+ + :param x: input value + :type x: cute.TensorSSA or cutlass.Float32 + :return: exp2 value + :rtype: cute.TensorSSA or cutlass.Float32 + """ + if isinstance(x, cute.TensorSSA): + res = cute.make_fragment(x.shape, cutlass.Float32) + res.store(x) + + for i in range(cute.size(x.shape)): + res[i] = self._exp2f(res[i]) + + return res.load() + return cute.arch.exp2(x) + + +def run_flash_attention_fwd( + dtype: Type[cutlass.Numeric], + batch_size: int, + seqlen_q: int, + seqlen_k: int, + num_head: int, + head_dim: int, + softmax_scale: float = 1.0, + m_block_size: int = 128, + n_block_size: int = 128, + num_threads: int = 128, + is_causal: bool = False, + warmup_iterations: int = 0, + iterations: int = 1, + skip_ref_check: bool = False, +): + # Skip unsupported testcase + if not FlashAttentionForwardAmpere.can_implement( + dtype, + head_dim, + m_block_size, + n_block_size, + num_threads, + is_causal, + ): + raise TypeError( + f"Unsupported testcase {dtype}, {head_dim}, {m_block_size}, {n_block_size}, {num_threads}, {is_causal}" + ) + + # Create tensor Q/K/V/O + def create_tensor( + batch_size: int, + seqlen: int, + num_head: int, + head_dim: int, + dtype: Type[cutlass.Numeric], + ) -> cute.Tensor: + # (batch_size, seqlen, num_head, head_dim) + shape = (batch_size, seqlen, num_head, head_dim) + return ( + torch.empty(*shape, dtype=torch.int32).random_(-2, 2).to(dtype=dtype).cuda() + ) + + q = create_tensor( + batch_size, seqlen_q, num_head, head_dim, cutlass_torch.dtype(dtype) + ) + k = create_tensor( + batch_size, seqlen_k, num_head, head_dim, cutlass_torch.dtype(dtype) + ) + v = create_tensor( + batch_size, seqlen_k, num_head, head_dim, cutlass_torch.dtype(dtype) + ) + o = create_tensor( + batch_size, seqlen_q, num_head, head_dim, cutlass_torch.dtype(dtype) + ) + + fa2_fwd = FlashAttentionForwardAmpere( + head_dim, + m_block_size, + n_block_size, + num_threads, + is_causal, + ) + # assume input is 16B align. 
+ q_tensor = ( + from_dlpack(q, assumed_align=16) + .mark_layout_dynamic(leading_dim=3) + .mark_compact_shape_dynamic( + mode=3, stride_order=q.dim_order(), divisibility=(128 // dtype.width) + ) + ) + k_tensor = ( + from_dlpack(k, assumed_align=16) + .mark_layout_dynamic(leading_dim=3) + .mark_compact_shape_dynamic( + mode=3, stride_order=k.dim_order(), divisibility=(128 // dtype.width) + ) + ) + v_tensor = ( + from_dlpack(v, assumed_align=16) + .mark_layout_dynamic(leading_dim=3) + .mark_compact_shape_dynamic( + mode=3, stride_order=v.dim_order(), divisibility=(128 // dtype.width) + ) + ) + o_tensor = ( + from_dlpack(o, assumed_align=16) + .mark_layout_dynamic(leading_dim=3) + .mark_compact_shape_dynamic( + mode=3, stride_order=o.dim_order(), divisibility=(128 // dtype.width) + ) + ) + # Get current CUDA stream from PyTorch + torch_stream = torch.cuda.current_stream() + # Get the raw stream pointer as a CUstream + current_stream = cuda.CUstream(torch_stream.cuda_stream) + # compile the fa2 forward pass + compiled_fa2_fwd = cute.compile( + fa2_fwd, q_tensor, k_tensor, v_tensor, o_tensor, softmax_scale, current_stream + ) + # warmup + for _ in range(warmup_iterations): + compiled_fa2_fwd( + q_tensor, + k_tensor, + v_tensor, + o_tensor, + softmax_scale, + current_stream, + ) + # run the compiled fa2 forward pass + for _ in range(iterations): + compiled_fa2_fwd( + q_tensor, + k_tensor, + v_tensor, + o_tensor, + softmax_scale, + current_stream, + ) + torch.cuda.synchronize() + + if skip_ref_check: + return + # reference implementation + q_ref = q.permute(0, 2, 1, 3) + k_ref = k.permute(0, 2, 1, 3) + v_ref = v.permute(0, 2, 1, 3) + torch.backends.cuda.enable_flash_sdp(enabled=True) + ref_o = torch.nn.functional.scaled_dot_product_attention( + q_ref, k_ref, v_ref, scale=softmax_scale, is_causal=is_causal + ).permute(0, 2, 1, 3) + + torch.testing.assert_close(o.cpu(), ref_o.cpu(), atol=1e-02, rtol=1e-04) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( 
+ description="example of flash attention v2 with CuTe on GPU" + ) + parser.add_argument("--dtype", type=cutlass.dtype, default=cutlass.BFloat16) + parser.add_argument("--batch_size", type=int, default=4) + parser.add_argument("--seqlen_q", type=int, default=8192) + parser.add_argument("--seqlen_k", type=int, default=8192) + parser.add_argument("--num_head", type=int, default=16) + parser.add_argument("--head_dim", type=int, default=128) + parser.add_argument("--softmax_scale", type=float, default=0.5) + parser.add_argument("--m_block_size", type=int, default=128) + parser.add_argument("--n_block_size", type=int, default=64) + parser.add_argument("--num_threads", type=int, default=128) + parser.add_argument("--is_causal", action="store_true", help="Enable causal mask") + parser.add_argument("--warmup_iterations", type=int, default=3) + parser.add_argument("--iterations", type=int, default=10) + parser.add_argument( + "--skip_ref_check", action="store_true", help="Skip reference check" + ) + + args = parser.parse_args() + run_flash_attention_fwd( + args.dtype, + args.batch_size, + args.seqlen_q, + args.seqlen_k, + args.num_head, + args.head_dim, + args.softmax_scale, + args.m_block_size, + args.n_block_size, + args.num_threads, + args.is_causal, + ) + + print("PASS") diff --git a/examples/python/CuTeDSL/ampere/sgemm.py b/examples/python/CuTeDSL/ampere/sgemm.py new file mode 100644 index 00000000..a4a032b4 --- /dev/null +++ b/examples/python/CuTeDSL/ampere/sgemm.py @@ -0,0 +1,780 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import time +from typing import Tuple + +import cuda.bindings.driver as cuda +import torch + +import cutlass +import cutlass.cute as cute +import cutlass.utils as utils +from cutlass.cute.runtime import from_dlpack + +""" +A dense FP32 SIMT GEMM (C = A * B) example using CUTE DSL. +- Matrix A is MxK, A can be row-major("K") or column-major("M") +- Matrix B is NxK, B can be row-major("N") or column-major("K") +- Matrix C is MxN, C can be row-major("N") or column-major("M") + +This GEMM kernel supports the following features: + - Utilizes FPU for matrix multiply-accumulate (MMA) operations + - Use multistage pipeline to overlap computation and memory access + * Shared memory pipeline: hides gmem-to-smem latency. 
+ * Register pipeline: overlaps shared memory-to-register transfers with + computations and eliminates false data dependencies for + better parallelism. + - Use vectorized copies + - Add padding to reduce bank conflicts in global -> shared memory copies + - Use predication to avoid unnecessary copies or copies of stale data + +This GEMM works as follows: +1. Load A and B matrices from global memory (GMEM) to shared memory (SMEM) using asynchronous copies. +2. Perform matrix multiply-accumulate (MMA) operations using simple fused multiply-add atomics. +3. Store results from registers (RMEM) to global memory (GMEM). + +To run this example: + +.. code-block:: bash + + python examples/ampere/sgemm.py \ + --mnk 8192,8192,8192 \ + --a_major m --b_major n --c_major n + +To collect performance with NCU profiler: + +.. code-block:: bash + + ncu python examples/ampere/sgemm.py \ + --mnk 8192,8192,8192 \ + --a_major m --b_major n --c_major n \ + --skip_ref_check --iterations 2 + +Constraints: +* Supported input, output, and accumulator data types: fp32 +* Default tile shape is set to be 128x128x8 +* The contiguous dimension of A/B/C tensors must be at least 16 bytes aligned +""" + + +class SGemm: + def __init__( + self, + cta_tiler: Tuple[int, int, int] = (128, 128, 8), + num_stages: int = 3, + num_threads: int = 256, + ): + self._cta_tiler = cta_tiler + self._num_stages = num_stages + self._num_threads = num_threads + assert num_threads > 0, "needs at least one thread" + assert num_threads % 16 == 0, "multiples of 16 required for MMA thread layout" + + self._bM, self._bN, self._bK = self._cta_tiler + assert self._bM % 16 == 0, "multiple of 16 required for tile dimension M" + assert self._bN % 16 == 0, "multiple of 16 required for tile dimension N" + assert self._num_stages >= 3, "num_stages must be greater than or equal to 3" + + @cute.jit + def __call__( + self, + mA: cute.Tensor, + mB: cute.Tensor, + mC: cute.Tensor, + epilogue_op: cutlass.Constexpr = lambda x: x, + ): + 
self.a_major_mode = utils.LayoutEnum.from_tensor(mA) + self.b_major_mode = utils.LayoutEnum.from_tensor(mB) + self.c_major_mode = utils.LayoutEnum.from_tensor(mC) + + # /////////////////////////////////////////////////////////////////////////////// + # Create layouts for shared memory for A and B: + # - sA/sB is m/n-major to vectorized copies from shared + # memory to registers. This is because the MMA layouts + # for sA/sB are also m/n-major + # - When gA/gB is k-major, pad 4 elements to reduce bank conflicts + # /////////////////////////////////////////////////////////////////////////////// + + padding_a = 4 if self.a_major_mode == utils.LayoutEnum.ROW_MAJOR else 0 + padding_b = 4 if self.b_major_mode == utils.LayoutEnum.ROW_MAJOR else 0 + sA_layout = cute.make_layout( + (self._bM, self._bK, self._num_stages), + stride=(1, (self._bM + padding_a), self._bK * (self._bM + padding_a)), + ) + sB_layout = cute.make_layout( + (self._bN, self._bK, self._num_stages), + stride=(1, (self._bN + padding_b), self._bK * (self._bN + padding_b)), + ) + + smem_size = cute.size_in_bytes(mA.element_type, sA_layout) + cute.size_in_bytes( + mB.element_type, sB_layout + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Create copy layouts that will be used for asynchronous + # global memory -> shared memory copies: + # - The majorness of tA/tB follows the majorness of gA/gB + # - For k-major, these layouts will copy values one-by-one from + # from global memory, without vectorizing + # - For m/n-major, it will vectorize to a 128bit copy for faster + # data transfer between global and shared memory, as long + # as the alignment of the tensor allows it. 
Otherwise, it + # defaults to a non-vectorized copy + # /////////////////////////////////////////////////////////////////////////////// + + tA = cute.make_layout( + (self._num_threads // self._bK, self._bK), stride=(self._bK, 1) + ) + tB = cute.make_layout( + (self._num_threads // self._bK, self._bK), stride=(self._bK, 1) + ) + vA = cute.make_layout((1, 1)) + vB = cute.make_layout((1, 1)) + atom_async_copy_A = cute.make_copy_atom( + cute.nvgpu.cpasync.CopyG2SOp(), + mA.element_type, + num_bits_per_copy=mA.element_type.width, + ) + atom_async_copy_B = cute.make_copy_atom( + cute.nvgpu.cpasync.CopyG2SOp(), + mA.element_type, + num_bits_per_copy=mB.element_type.width, + ) + + if self.a_major_mode == utils.LayoutEnum.COL_MAJOR: + num_vectorized = 4 if (mA.layout.max_alignment % 16 == 0) else 1 + atom_async_copy_A = cute.make_copy_atom( + cute.nvgpu.cpasync.CopyG2SOp(), + mA.element_type, + num_bits_per_copy=mA.element_type.width * num_vectorized, + ) + major_mode_size = self._bM // num_vectorized + tA = cute.make_layout( + (major_mode_size, self._num_threads // major_mode_size), + stride=(1, major_mode_size), + ) + vA = cute.make_layout((num_vectorized, 1)) + + if self.b_major_mode == utils.LayoutEnum.COL_MAJOR: + num_vectorized = 4 if (mB.layout.max_alignment % 16 == 0) else 1 + atom_async_copy_B = cute.make_copy_atom( + cute.nvgpu.cpasync.CopyG2SOp(), + mA.element_type, + num_bits_per_copy=mB.element_type.width * num_vectorized, + ) + major_mode_size = self._bN // num_vectorized + tB = cute.make_layout( + (major_mode_size, self._num_threads // major_mode_size), + stride=(1, major_mode_size), + ) + vB = cute.make_layout((num_vectorized, 1)) + + tiled_copy_A = cute.make_tiled_copy_tv(atom_async_copy_A, tA, vA) + tiled_copy_B = cute.make_tiled_copy_tv(atom_async_copy_B, tB, vB) + + # /////////////////////////////////////////////////////////////////////////////// + # Create layouts for GEMM: + # We tile an MMA atom across a tensor. 
`atoms_layout` is the layout + # of atoms in the tiled MMA. (Because we use an `MmaUniversalOp`, + # which has a trivial 1x1x1 MMA trait, `atoms_layout` is also + # simply the thread layout for C.) `permutation_tiler` reorders the + # elements of the tensor that the tiled MMA is applied to. + # Different combinations of `atoms_layout` and `permutation_tiler` + # values can create different MMA thread-value patterns. + # + # Here, the MMA layout is set so that each thread copies four + # consecutive elements from shared memory to registers. + # `permutation_tiler_M/N` maps the elements handled by each thread + # to the permuted element in the tensor. + # For increasing indices in the tensor, the thread ID that reads it is: + # - (without permutation) ==> + # 0 1 2 ... 15 0 1 2 ... 15 0 1 2 ... 15 0 1 2 ... 15 ...... + # - (with permutation) ==> + # 0 0 0 0 1 1 1 1 2 2 2 2 ... 15 15 15 15 0 0 0 0 1 1 1 1 ...... + # /////////////////////////////////////////////////////////////////////////////// + atoms_layout = cute.make_layout( + (self._num_threads // 16, 16, 1), stride=(16, 1, 0) + ) + if self.c_major_mode == utils.LayoutEnum.COL_MAJOR: + atoms_layout = cute.make_layout( + (16, self._num_threads // 16, 1), stride=(1, 16, 0) + ) + op = cute.nvgpu.MmaUniversalOp(cutlass.Float32) + permutation_tiler_M = cute.make_layout( + (atoms_layout.shape[0], 4), stride=(4, 1) + ) + permutation_tiler_N = cute.make_layout( + (atoms_layout.shape[1], 4), stride=(4, 1) + ) + tiled_mma = cute.make_tiled_mma( + op, + atoms_layout, + permutation_mnk=(permutation_tiler_M, permutation_tiler_N, None), + ) + + # grid_dim: ((m + BLK_M - 1) // BLK_M, (n + BLK_N - 1) // BLK_N, 1) + grid_dim = *cute.ceil_div(mC.shape, (self._bM, self._bN)), 1 + + self.kernel( + mA, + mB, + mC, + sA_layout, + sB_layout, + tiled_copy_A, + tiled_copy_B, + tiled_mma, + epilogue_op, + ).launch( + grid=grid_dim, + block=[cute.size(atoms_layout), 1, 1], + smem=smem_size, + ) + + @cute.kernel + def kernel( + self, + mA: 
cute.Tensor, + mB: cute.Tensor, + mC: cute.Tensor, + sA_layout: cute.Layout, + sB_layout: cute.Layout, + tiled_copy_A: cute.TiledCopy, + tiled_copy_B: cute.TiledCopy, + tiled_mma: cute.TiledMma, + epilogue_op: cutlass.Constexpr = lambda x: x, + ): + # Thread and block indices + tidx, tidy, tidz = cute.arch.thread_idx() + bidx, bidy, bidz = cute.arch.block_idx() + tiler_coord = (bidx, bidy, None) + thr_mma = tiled_mma.get_slice(tidx) + + # /////////////////////////////////////////////////////////////////////////////// + # Get the appropriate tiles for this thread block. + # gA: (BLK_M, BLK_K, k), gB: (BLK_N, BLK_K, k), gC: (BLK_M, BLK_N) + # /////////////////////////////////////////////////////////////////////////////// + gA = cute.local_tile( + mA, tiler=self._cta_tiler, coord=tiler_coord, proj=(1, None, 1) + ) + gB = cute.local_tile( + mB, tiler=self._cta_tiler, coord=tiler_coord, proj=(None, 1, 1) + ) + gC = cute.local_tile( + mC, tiler=self._cta_tiler, coord=tiler_coord, proj=(1, 1, None) + ) + + # Move the pointer of gA/gB in the `-k`` direction, making the first + # tile (instead of the last one) irregular in shape when k is irregular. + # We first handle the irregular tile to avoid checking for this + # condition within the mainloop. + residue_k = mA.shape[1] - cutlass.Int32(self._bK) * gA.shape[2] + gA = cute.domain_offset((0, residue_k, 0), gA) + gB = cute.domain_offset((0, residue_k, 0), gB) + + # /////////////////////////////////////////////////////////////////////////////// + # Get the appropriate tiles for this thread. 
+ # sA: (BLK_M, BLK_K, PIPE) , sB: (BLK_N, BLK_K, PIPE) + # tAgA: (CPY, CPY_M, CPY_K, k) , tBgB: (CPY, CPY_N, CPY_K, k) + # tAsA: (CPY, CPY_M, CPY_K, PIPE) , tBsB: (CPY, CPY_N, CPY_K, PIPE) + # /////////////////////////////////////////////////////////////////////////////// + # Create shared memory buffer + smem = cutlass.utils.SmemAllocator() + sA = smem.allocate_tensor(mA.element_type, sA_layout, 16) + sB = smem.allocate_tensor(mB.element_type, sB_layout, 16) + thr_copy_A = tiled_copy_A.get_slice(tidx) + thr_copy_B = tiled_copy_B.get_slice(tidx) + tAgA = thr_copy_A.partition_S(gA) + tAsA = thr_copy_A.partition_D(sA) + tBgB = thr_copy_B.partition_S(gB) + tBsB = thr_copy_B.partition_D(sB) + + # /////////////////////////////////////////////////////////////////////////////// + # Predicate: Mark indices that need to copy when the problem shape + # isn't a multiple of the tile shape. If tApA/B[i] is 0, then do not + # do the copy atom associated with index i. + # cA: (BLK_M, BLK_K) => (blk_m, blk_k) + # cB: (BLK_N, BLK_K) => (blk_n, blk_k) + # tAcA: (CPY, CPY_M, CPY_K) => (blk_m, blk_k) + # tBcB: (CPY, CPY_N, CPY_K) => (blk_n, blk_k) + # tApA: (rest_v, CPY_M, CPY_K), stride=(..., ..., 0) + # tBpB: (rest_v, CPY_N, CPY_K), stride=(..., ..., 0) + # CPY = (atom_v, rest_v) + # /////////////////////////////////////////////////////////////////////////////// + # Construct identity layout for sA and sB, used for predication + mcA = cute.make_identity_tensor(mA.shape) + mcB = cute.make_identity_tensor(mB.shape) + cA = cute.local_tile( + mcA, tiler=self._cta_tiler, coord=tiler_coord, proj=(1, None, 1) + ) + cB = cute.local_tile( + mcB, tiler=self._cta_tiler, coord=tiler_coord, proj=(None, 1, 1) + ) + cA = cute.domain_offset((0, residue_k, 0), cA) + cB = cute.domain_offset((0, residue_k, 0), cB) + # Repeat the partitioning with identity layouts + tAcA = thr_copy_A.partition_S(cA) + tBcB = thr_copy_B.partition_S(cB) + # Allocate predicate tensors for m and n + tApA = 
cute.make_fragment( + cute.make_layout( + ( + tAsA.shape[0][1], + cute.size(tAsA, mode=[1]), + cute.size(tAsA, mode=[2]), + ), + stride=(cute.size(tAsA, mode=[1]), 1, 0), + ), + cutlass.Boolean, + ) + tBpB = cute.make_fragment( + cute.make_layout( + ( + tBsB.shape[0][1], + cute.size(tBsB, mode=[1]), + cute.size(tBsB, mode=[2]), + ), + stride=(cute.size(tBsB, mode=[1]), 1, 0), + ), + cutlass.Boolean, + ) + # Allocate predicate tensors for m, n and k for residue k-tile + tApA_residue_k = cute.make_fragment( + cute.make_layout( + ( + tAsA.shape[0][1], + cute.size(tAsA, mode=[1]), + cute.size(tAsA, mode=[2]), + ), + stride=( + cute.size(tAsA, mode=[1]) * cute.size(tAsA, mode=[2]), + cute.size(tAsA, mode=[2]), + 1, + ), + ), + cutlass.Boolean, + ) + tBpB_residue_k = cute.make_fragment( + cute.make_layout( + ( + tBsB.shape[0][1], + cute.size(tBsB, mode=[1]), + cute.size(tBsB, mode=[2]), + ), + stride=( + cute.size(tBsB, mode=[1]) * cute.size(tBsB, mode=[2]), + cute.size(tBsB, mode=[2]), + 1, + ), + ), + cutlass.Boolean, + ) + # Set predicates for m/n bounds for mainloop + for rest_v in range(tApA.shape[0]): + for m in range(tApA.shape[1]): + tApA[rest_v, m, 0] = cute.elem_less( + tAcA[(0, rest_v), m, 0, 0][0], mA.shape[0] + ) + for rest_v in range(tBpB.shape[0]): + for n in range(tBpB.shape[1]): + tBpB[rest_v, n, 0] = cute.elem_less( + tBcB[(0, rest_v), n, 0, 0][0], mB.shape[0] + ) + + # Set predicates for m/n/k bounds for residue k tile + for rest_v in range(tApA_residue_k.shape[0]): + for m in range(tApA_residue_k.shape[1]): + for k in range(tApA_residue_k.shape[2]): + coord_A = tAcA[(0, rest_v), m, k, 0] + tApA_residue_k[rest_v, m, k] = cute.elem_less( + (coord_A[0], cutlass.Int32(-1)), (mA.shape[0], coord_A[1]) + ) + for rest_v in range(tBpB_residue_k.shape[0]): + for n in range(tBpB_residue_k.shape[1]): + for k in range(tBpB_residue_k.shape[2]): + coord_B = tBcB[(0, rest_v), n, k, 0] + tBpB_residue_k[rest_v, n, k] = cute.elem_less( + (coord_B[0], cutlass.Int32(-1)), 
(mB.shape[0], coord_B[1]) + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Prefetch Prologue + # /////////////////////////////////////////////////////////////////////////////// + # Start async loads for 0th k-tile, where we take care of the k-residue + k_pipe_max = cute.size(tAsA, mode=[3]) + k_tile_count = cute.size(tAgA, mode=[3]) + gmem_pipe_read = cutlass.Int32(0) + cute.copy( + tiled_copy_A, + tAgA[None, None, None, gmem_pipe_read], + tAsA[None, None, None, 0], + pred=tApA_residue_k, + ) + cute.copy( + tiled_copy_B, + tBgB[None, None, None, gmem_pipe_read], + tBsB[None, None, None, 0], + pred=tBpB_residue_k, + ) + cute.arch.cp_async_commit_group() + gmem_pipe_read = ( + gmem_pipe_read + 1 + if gmem_pipe_read + 1 < k_tile_count + else cutlass.Int32(0) + ) + # Start async loads for 1st k-tile onwards, no k-residue handling needed + for k_tile in range(1, k_pipe_max - 1): + if k_tile < k_tile_count: + cute.copy( + tiled_copy_A, + tAgA[None, None, None, gmem_pipe_read], + tAsA[None, None, None, k_tile], + pred=tApA, + ) + cute.copy( + tiled_copy_B, + tBgB[None, None, None, gmem_pipe_read], + tBsB[None, None, None, k_tile], + pred=tBpB, + ) + + gmem_pipe_read = ( + gmem_pipe_read + 1 + if gmem_pipe_read + 1 < k_tile_count + else cutlass.Int32(0) + ) + cute.arch.cp_async_commit_group() + + # all tiles have been copied from global memory, so clear the + # predicate tensor + if k_tile_count < k_pipe_max: + for rest_v in range(tApA.shape[0]): + for m in range(tApA.shape[1]): + tApA[rest_v, m, 0] = cutlass.Boolean(0) + for rest_v in range(tBpB.shape[0]): + for n in range(tBpB.shape[1]): + tBpB[rest_v, n, 0] = cutlass.Boolean(0) + + # /////////////////////////////////////////////////////////////////////////////// + # Define A/B partitioning and C accumulators. 
+ # /////////////////////////////////////////////////////////////////////////////// + tCsA = thr_mma.partition_A(sA) + tCsB = thr_mma.partition_B(sB) + tCgC = thr_mma.partition_C(gC) + tCrA = tiled_mma.make_fragment_A(tCsA[None, None, None, 0]) + tCrB = tiled_mma.make_fragment_B(tCsB[None, None, None, 0]) + tCrC = tiled_mma.make_fragment_C(tCgC) + # Clear the accumulator + tCrC.fill(0.0) + + # Current pipe index in smem to read from / write to + smem_pipe_read = cutlass.Int32(0) + smem_pipe_write = cutlass.Int32(k_pipe_max - 1) + + tCsA_p = tCsA[None, None, None, smem_pipe_read] + tCsB_p = tCsB[None, None, None, smem_pipe_read] + + # /////////////////////////////////////////////////////////////////////////////// + # PREFETCH register pipeline + # /////////////////////////////////////////////////////////////////////////////// + k_block_max = cute.size(tCrA, mode=[2]) + + if k_block_max > 1: + # Wait until our first prefetched tile is loaded in + cute.arch.cp_async_wait_group(k_pipe_max - 2) + cute.arch.barrier() + # Prefetch the first rmem from the first k-tile + cute.autovec_copy(tCsA_p[None, None, 0], tCrA[None, None, 0]) + cute.autovec_copy(tCsB_p[None, None, 0], tCrB[None, None, 0]) + + # /////////////////////////////////////////////////////////////////////////////// + # Mainloop + # 1. Shared memory pipeline (gmem -> smem): + # The default smem pipeline depth is 3, meaning that for shared + # memory buffers, we allocate three times the size described by the + # CTA tiler. We prefetch 2 of these buffers before entering the main + # loop. Considering only the transfer from global memory to shared + # memory, the general structure of the mainloop is: + # (1) copy k-tile from gmem to smem; + # (2) perform gemm computation on k-tile; + # (3) wait for the next copy to finish. + # The `cute.arch.cp_async_wait_group(num_smem_stages - 2)` command + # waits for the number of unfinished 'copy' to be <= 1. 
The advantage + # of this approach is that it allows for simultaneous production + # (i.e., step (1)) and consumption (i.e., step (2)) of smem. + # A common misconception is to prefetch N buffers and rewrite + # the pipeline logic to wait on N-1 pending copies. The disadvantage + # of this approach is that it requires fully consuming a buffer in + # order to open an empty buffer for the next copy. + # 2. Register pipeline (smem -> register): + # Similarly, the register pipeline produces i+1, consumes i, and + # produces i+2... Notably, i and i+1 do not use the same register, + # eliminating dependencies on the same register for better parallelism. + # 3. Combining the smem and register pipelines results in the mainloop. + # /////////////////////////////////////////////////////////////////////////////// + + for _ in cutlass.range_dynamic(k_tile_count, unroll=1): + for k_block in range(k_block_max): + if k_block == k_block_max - 1: + tCsA_p = tCsA[None, None, None, smem_pipe_read] + tCsB_p = tCsB[None, None, None, smem_pipe_read] + cute.arch.cp_async_wait_group(k_pipe_max - 2) + cute.arch.barrier() + + # Load A, B from shared memory to registers for k_block + 1 + k_block_next = (k_block + 1) % k_block_max # static + cute.autovec_copy( + tCsA_p[None, None, k_block_next], + tCrA[None, None, k_block_next], + ) + cute.autovec_copy( + tCsB_p[None, None, k_block_next], + tCrB[None, None, k_block_next], + ) + + # Fetch next A: To better interleave global memory access and + # compute instructions, we intentionally use the sequence: + # copy A, perform GEMM, then copy B. 
+ if k_block == 0: + cute.copy( + tiled_copy_A, + tAgA[None, None, None, gmem_pipe_read], + tAsA[None, None, None, smem_pipe_write], + # Use predicates because the m-mode may be irregular + pred=tApA, + ) + + # Thread-level register gemm for k_block + cute.gemm( + tiled_mma, + tCrC, + tCrA[None, None, k_block], + tCrB[None, None, k_block], + tCrC, + ) + + # Fetch next B and update smem pipeline read/write + if k_block == 0: + cute.copy( + tiled_copy_B, + tBgB[None, None, None, gmem_pipe_read], + tBsB[None, None, None, smem_pipe_write], + # Use predicates because the n-mode may be irregular + pred=tBpB, + ) + cute.arch.cp_async_commit_group() + smem_pipe_write = smem_pipe_read + smem_pipe_read = smem_pipe_read + 1 + if smem_pipe_read == k_pipe_max: + smem_pipe_read = cutlass.Int32(0) + # After copying all tiles, we avoid clearing the predicate + # tensor in the `mainloop` to prevent increasing its + # instruction count. Instead, we continue copying the + # first tile, though it won't be used. The 0-th tile is not + # copied due to its irregular shape, which could lead to + # illegal memory accesses. + gmem_pipe_read = ( + gmem_pipe_read + 1 + if gmem_pipe_read + 1 < k_tile_count + else cutlass.Int32(1) + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Epilogue + # Applies the epilogue operation to the accumulated results and copies + # them without vectorization. 
+ # /////////////////////////////////////////////////////////////////////////////// + cute.arch.cp_async_wait_group(0) + cute.arch.barrier() + tCrC.store(epilogue_op(tCrC.load())) + + # predicate + cC = cute.make_identity_tensor(gC.shape) + tCpC = thr_mma.partition_C(cC) + predC = cute.make_fragment(tCrC.layout, cutlass.Boolean) + residue_m = mC.shape[0] - cutlass.Int32(self._bM) * bidx + residue_n = mC.shape[1] - cutlass.Int32(self._bN) * bidy + for i in range(cute.size(tCrC.shape)): + predC[i] = cute.elem_less(tCpC[i], (residue_m, residue_n)) + numIterM = cute.size(tCrC, mode=[1]) + numIterN = cute.size(tCrC, mode=[2]) + atom = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), mC.element_type) + cute.copy(atom, tCrC, tCgC, pred=predC) + return + + +def main( + a_major: str, + b_major: str, + c_major: str, + problem_shape: Tuple[int, int, int], + warmup_iterations: int = 2, + iterations: int = 100, + skip_ref_check: bool = False, +): + torch.manual_seed(1024) + M, N, K = problem_shape + + # Create and permute tensor A/B/C + def create_and_permute_tensor(mode0, mode1, is_mode0_major, dtype): + # is_mode0_major: (mode1, mode0) -> (mode0, mode1) + # else: (mode0, mode1) -> (mode0, mode1) + shape = (mode1, mode0) if is_mode0_major else (mode0, mode1) + permute_order = (1, 0) if is_mode0_major else (0, 1) + + return ( + torch.empty(*shape, dtype=torch.int32) + .random_(-5, 5) + .to(dtype=dtype) + .permute(permute_order) + .cuda() + ) + + a = create_and_permute_tensor(M, K, a_major == "m", torch.float32) + b = create_and_permute_tensor(N, K, b_major == "n", torch.float32) + c = create_and_permute_tensor(M, N, c_major == "m", torch.float32) + + divisibility_a = a.shape[1] if a_major == "k" else a.shape[0] + divisibility_b = b.shape[1] if b_major == "k" else b.shape[0] + divisibility_c = c.shape[1] if c_major == "n" else c.shape[0] + + a_tensor = ( + from_dlpack(a, assumed_align=16) + .mark_layout_dynamic(leading_dim=(1 if a_major == "k" else 0)) + 
.mark_compact_shape_dynamic( + mode=(1 if a_major == "k" else 0), + divisibility=divisibility_a, + ) + ) + + b_tensor = ( + from_dlpack(b, assumed_align=16) + .mark_layout_dynamic(leading_dim=(1 if b_major == "k" else 0)) + .mark_compact_shape_dynamic( + mode=(1 if b_major == "k" else 0), + divisibility=divisibility_b, + ) + ) + + c_tensor = ( + from_dlpack(c, assumed_align=16) + .mark_layout_dynamic(leading_dim=(1 if c_major == "n" else 0)) + .mark_compact_shape_dynamic( + mode=(1 if c_major == "n" else 0), + divisibility=divisibility_c, + ) + ) + + sgemm = SGemm() + + print("Compiling kernel with cute.compile ...") + start_time = time.time() + gemm = cute.compile(sgemm, a_tensor, b_tensor, c_tensor) + compilation_time = time.time() - start_time + print(f"Compilation time: {compilation_time:.4f} seconds") + + print("Executing GEMM kernel...") + + # Get current CUDA stream from PyTorch + torch_stream = torch.cuda.current_stream() + + # Get the raw stream pointer as a CUstream + current_stream = cuda.CUstream(torch_stream.cuda_stream) + + # Create CUDA events for timing + start_event = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1] + end_event = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1] + + # Warmup + for _ in range(warmup_iterations): + gemm(a_tensor, b_tensor, c_tensor) + + # Use the current stream for CUDA events instead of the default stream + # Record start event + cuda.cuEventRecord(start_event, current_stream) + + # Execute the kernel + for _ in range(iterations): + gemm(a_tensor, b_tensor, c_tensor) + + # Record end event + cuda.cuEventRecord(end_event, current_stream) + cuda.cuEventSynchronize(end_event) + + # Calculate elapsed time + err, elapsed_time = cuda.cuEventElapsedTime(start_event, end_event) + + # Print execution results + print(f"Kernel execution time: {elapsed_time / iterations:.4f} ms") + + # Destroy events + cuda.cuEventDestroy(start_event) + cuda.cuEventDestroy(end_event) + + if not skip_ref_check: + 
print("Verifying results...") + ref = torch.einsum("mk,nk->mn", a, b) + torch.testing.assert_close(c.cpu(), ref.cpu(), atol=1e-03, rtol=1e-05) + print("Results verified successfully!") + + +if __name__ == "__main__": + + def parse_comma_separated_ints(s: str) -> Tuple[int, ...]: + try: + return tuple(int(x.strip()) for x in s.split(",")) + except ValueError: + raise argparse.ArgumentTypeError( + "Invalid format. Expected comma-separated integers." + ) + + parser = argparse.ArgumentParser() + parser.add_argument( + "--mnk", type=parse_comma_separated_ints, default=(256, 256, 64) + ) + parser.add_argument("--a_major", choices=["k", "m"], default="k") + parser.add_argument("--b_major", choices=["k", "n"], default="k") + parser.add_argument("--c_major", choices=["n", "m"], default="n") + parser.add_argument("--warmup_iterations", default=2, type=int) + parser.add_argument("--iterations", default=100, type=int) + parser.add_argument("--skip_ref_check", action="store_true") + + args = parser.parse_args() + print("Running SIMT GEMM example:") + main( + args.a_major, + args.b_major, + args.c_major, + args.mnk, + args.warmup_iterations, + args.iterations, + args.skip_ref_check, + ) + print("PASS") diff --git a/examples/python/CuTeDSL/ampere/tensorop_gemm.py b/examples/python/CuTeDSL/ampere/tensorop_gemm.py new file mode 100644 index 00000000..cc93f93d --- /dev/null +++ b/examples/python/CuTeDSL/ampere/tensorop_gemm.py @@ -0,0 +1,968 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import math +import time +from typing import Tuple, Type + +import cuda.bindings.driver as cuda +import torch + +import cutlass +import cutlass.cute as cute +import cutlass.torch as cutlass_torch +import cutlass.utils as utils +from cutlass.cute.runtime import from_dlpack + +""" +A dense GEMM (C = A * B) example for the NVIDIA Ampere architecture using CUTE DSL. 
+- Matrix A is MxKxL, L is batch dimension, A can be row-major("K") or column-major("M")
+- Matrix B is NxKxL, L is batch dimension, B can be row-major("N") or column-major("K")
+- Matrix C is MxNxL, L is batch dimension, C can be row-major("N") or column-major("M")
+
+This GEMM kernel supports the following features:
+    - Utilizes Ampere's tensor cores for matrix multiply-accumulate (MMA) operations
+    - Supports multi-stage pipeline to overlap computation and memory access
+    - Implements shared memory buffering for epilogue to increase coalesced global memory access
+
+This GEMM works as follows:
+1. Load A and B matrices from global memory (GMEM) to shared memory (SMEM) using asynchronous copies.
+2. Perform matrix multiply-accumulate (MMA) operations.
+3. Store results from registers (RMEM) to shared memory (SMEM), then to global memory (GMEM).
+
+The Ampere tensor core instruction used operates as follows:
+- Read matrix A from SMEM
+- Read matrix B from SMEM
+- Perform MMA operation and store the result in Accumulator(register)
+
+To run this example:
+
+.. code-block:: bash
+
+    python examples/ampere/tensorop_gemm.py \
+        --mnkl 8192,8192,8192,1 --atom_layout_mnk 2,2,1 \
+        --ab_dtype Float16 \
+        --c_dtype Float16 --acc_dtype Float32 \
+        --a_major m --b_major n --c_major n
+
+The above example command computes with M=8192, N=8192, K=8192,
+batch_count=1. The atom layout's shape is 2x2x1 and the input, mma
+accumulator, and output data type are set as fp16, fp32 and fp16,
+respectively.
+
+To collect performance with NCU profiler:
+
+..
code-block:: bash + + ncu python examples/ampere/tensorop_gemm.py \ + --mnkl 8192,8192,8192,1 --atom_layout_mnk 2,2,1 \ + --ab_dtype Float16 \ + --c_dtype Float16 --acc_dtype Float32 \ + --a_major m --b_major n --c_major n \ + --skip_ref_check --iterations 2 + +Constraints: +* Supported input and output data types: fp16 +* Support accumulator data types: f32 +* Default tile shape is set to be 128x128x32 +* Atom layout's MNK shape is set so that tile shape can be divided by MMA + instruction shape +* The contiguous dimension of A/B/C tensors must be at least 16 bytes aligned, + i.e, number of elements is a multiple of 8 +""" + + +class TensorOpGemm: + def __init__( + self, + ab_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + acc_dtype: Type[cutlass.Numeric], + atom_layout_mnk: Tuple[int, int, int], + ): + self.ab_dtype = ab_dtype + self.c_dtype = c_dtype + self.acc_dtype = acc_dtype + self.cta_tiler = (128, 128, 32) + self.num_stages = 3 + self.atom_layout_mnk = atom_layout_mnk + atom_lay_M, atom_lay_N, atom_lay_K = self.atom_layout_mnk + self.num_threads = atom_lay_M * atom_lay_N * atom_lay_K * 32 + + self.bM, self.bN, self.bK = self.cta_tiler + self.mma_inst_shape = (16, 8, 16) + mmaM, mmaN, mmaK = self.mma_inst_shape + + assert ( + self.bM % (atom_lay_M * mmaM) == 0 + ), "bM must be divisible by MMA instruction" + assert ( + self.bN % (atom_lay_N * mmaN) == 0 + ), "bN must be divisible by MMA instruction" + assert atom_lay_K == 1, "this example does not support atom layout K > 1" + assert self.bK % mmaK == 0, "bK must be divisible by MMA instruction" + assert self.num_stages >= 3, "num_stages must be greater than or equal to 3" + + @cute.jit + def __call__( + self, + mA: cute.Tensor, + mB: cute.Tensor, + mC: cute.Tensor, + epilogue_op: cutlass.Constexpr = lambda x: x, + ): + # The grid divides the problems's M, N, and L dimensions by the + # respective modes of the tile shape (bM, bN, 1). 
The K dimension is + # handled within a block via a multistage process. + + self.a_major_mode = utils.LayoutEnum.from_tensor(mA) + self.b_major_mode = utils.LayoutEnum.from_tensor(mB) + self.c_major_mode = utils.LayoutEnum.from_tensor(mC) + + # /////////////////////////////////////////////////////////////////////////////// + # Shared memory layout: + # /////////////////////////////////////////////////////////////////////////////// + + # Creates a layout with the size required for the provided tile + # size and num stages (stages are used for K dimension) that is also + # sectioned into 64x8 or 8x32 layout atoms. The swizzle is set so that + # the atom for the shared memory -> register copy does not encounter + # bank conflicts + + # assume the input is 16B align + ab_copy_bits = 128 + sA_layout = self._make_smem_layout_AB( + mA.element_type, + self.a_major_mode, + ab_copy_bits, + (self.cta_tiler[0], self.cta_tiler[2], self.num_stages), + ) + sB_layout = self._make_smem_layout_AB( + mB.element_type, + self.b_major_mode, + ab_copy_bits, + (self.cta_tiler[1], self.cta_tiler[2], self.num_stages), + ) + + # Creates a similar layout but without num_stages or layout atoms + sC_layout = self._make_smem_layout_C( + mC.element_type, + self.c_major_mode, + ab_copy_bits, + (self.cta_tiler[0], self.cta_tiler[1]), + ) + + # Shared memory allocated for operations with A, B will be + # overwritten for operations on C. This is to improve performance + # by reducing the size of shared memory requested by each block + smem_size = max( + cute.size_in_bytes(mC.element_type, sC_layout), + cute.size_in_bytes(mA.element_type, sA_layout) + + cute.size_in_bytes(mB.element_type, sB_layout), + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Tiled copy: + # The majorness of tA/tB/tC follows the majorness of gA/gB/gC, + # enabling merged accesses to global memory for faster data + # transfer between global and shared memory. 
+ # /////////////////////////////////////////////////////////////////////////////// + + # Create a copy atom for a global to shared memory asynchronous copy + atom_async_copy = cute.make_copy_atom( + cute.nvgpu.cpasync.CopyG2SOp( + cache_mode=cute.nvgpu.cpasync.LoadCacheMode.GLOBAL + ), + mA.element_type, + num_bits_per_copy=ab_copy_bits, + ) + + # Create thread layouts for tiled copy from the copy atom where the + # thread layout simply follows the leading dimension of the tensor + tiled_copy_A = self._make_gmem_tiled_copy_AB( + atom_async_copy, mA.element_type, self.a_major_mode, ab_copy_bits + ) + tiled_copy_B = self._make_gmem_tiled_copy_AB( + atom_async_copy, mB.element_type, self.b_major_mode, ab_copy_bits + ) + + # Creates a synchonous copy atom and thread layouts for the epilogue + c_copy_bits = 128 + atom_sync_copy = cute.make_copy_atom( + cute.nvgpu.CopyUniversalOp(), + mC.element_type, + num_bits_per_copy=c_copy_bits, + ) + tiled_copy_C = self._make_gmem_tiled_copy_C( + atom_sync_copy, mC.element_type, self.c_major_mode, c_copy_bits + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Tiled MMA + # /////////////////////////////////////////////////////////////////////////////// + + # Creates a mma atom with 16x8x16 shape for MNK + op = cute.nvgpu.warp.MmaF16BF16Op( + self.ab_dtype, self.acc_dtype, self.mma_inst_shape + ) + + permutation_mnk = ( + self.atom_layout_mnk[0] * self.mma_inst_shape[0], + # if atom layout's N-mode is 1, to leverage the largest coalesced + # shared memory -> register copy, set the tiled mma's N mode to 16 + self.atom_layout_mnk[1] * self.mma_inst_shape[1] * 2, + self.atom_layout_mnk[2] * self.mma_inst_shape[2], + ) + + # Created a tiled mma that tiles the atom according to specified layout. 
+ # For a 2x2x1 atom layout, the mma atom is duplicated 4 times, twice + # across M and twice across N + tC = cute.make_layout(self.atom_layout_mnk) + tiled_mma = cute.make_tiled_mma( + op, + tC, + permutation_mnk=permutation_mnk, + ) + + # grid_dim: ((m + BLK_M - 1) // BLK_M, (n + BLK_N - 1) // BLK_N, l) + grid_dim = cute.ceil_div(mC.shape, (self.bM, self.bN, 1)) + + self.kernel( + mA, + mB, + mC, + sA_layout, + sB_layout, + sC_layout, + tiled_copy_A, + tiled_copy_B, + tiled_copy_C, + tiled_mma, + epilogue_op, + ).launch( + grid=grid_dim, + block=[self.num_threads, 1, 1], + smem=smem_size, + ) + + @cute.kernel + def kernel( + self, + mA: cute.Tensor, + mB: cute.Tensor, + mC: cute.Tensor, + sA_layout: cute.ComposedLayout, + sB_layout: cute.ComposedLayout, + sC_layout: cute.ComposedLayout, + tiled_copy_A: cute.TiledCopy, + tiled_copy_B: cute.TiledCopy, + tiled_copy_C: cute.TiledCopy, + tiled_mma: cute.TiledMma, + epilogue_op: cutlass.Constexpr = lambda x: x, + ): + # Thread index, block index + tidx, _, _ = cute.arch.thread_idx() + bidx, bidy, bidz = cute.arch.block_idx() + tiler_coord = (bidx, bidy, None) + + # /////////////////////////////////////////////////////////////////////////////// + # Get the appropriate tiles for this thread block. + # gA: (BLK_M, BLK_N, k), gB: (BLK_N, BLK_K, k), gC: (BLK_M, BLK_N) + # /////////////////////////////////////////////////////////////////////////////// + gA = cute.local_tile( + mA[None, None, bidz], + tiler=self.cta_tiler, + coord=tiler_coord, + proj=(1, None, 1), + ) + gB = cute.local_tile( + mB[None, None, bidz], + tiler=self.cta_tiler, + coord=tiler_coord, + proj=(None, 1, 1), + ) + gC = cute.local_tile( + mC[None, None, bidz], + tiler=self.cta_tiler, + coord=tiler_coord, + proj=(1, 1, None), + ) + + # By default, if the tensor k mode does not divide into the tile k + # size, then last tiles in the k dimension are irregular. + # Instead, make the first tiles irregular when k is irregular. 
+ # This allows us to handle the irregular tile first to avoid + # checking for this condition within the mainloop. + + # residual_k is a negative number indicating the amount needed to + # shift the pointer by in dimension k + residual_k = cute.size(mA, mode=[1]) - cutlass.Int32(self.bK) * cute.size( + gA, mode=[2] + ) + + # move the pointer of gA/gB in the `-k` direction + gA = cute.domain_offset((0, residual_k, 0), gA) + gB = cute.domain_offset((0, residual_k, 0), gB) + # input is 16B aligned + gA = cute.make_tensor(gA.iterator.align(16), gA.layout) + gB = cute.make_tensor(gB.iterator.align(16), gB.layout) + + # Construct identity layout for sA and sB (mirrors global tensors, + # used for predication only) + mcA = cute.make_identity_tensor(mA.layout.shape) + mcB = cute.make_identity_tensor(mB.layout.shape) + cA = cute.local_tile( + mcA[None, None, bidz], + tiler=self.cta_tiler, + coord=tiler_coord, + proj=(1, None, 1), + ) + cB = cute.local_tile( + mcB[None, None, bidz], + tiler=self.cta_tiler, + coord=tiler_coord, + proj=(None, 1, 1), + ) + + cA = cute.domain_offset((0, residual_k, 0), cA) + cB = cute.domain_offset((0, residual_k, 0), cB) + + # /////////////////////////////////////////////////////////////////////////////// + # Create shared memory buffers and get the appropriate fragments for this thread. 
+ # sA: (BLK_M, BLK_K, PIPE) , sB: (BLK_N, BLK_K, PIPE) + # tAgA: (CPY, CPY_M, CPY_K, k) , tBgB: (CPY, CPY_N, CPY_K, k) + # tAsA: (CPY, CPY_M, CPY_K, PIPE) , tBsB: (CPY, CPY_N, CPY_K, PIPE) + # /////////////////////////////////////////////////////////////////////////////// + # Shared memory buffer + smem = cutlass.utils.SmemAllocator() + + sA = smem.allocate_tensor(mA.element_type, sA_layout, 16) + sB = smem.allocate_tensor(mB.element_type, sB_layout, 16) + sC = cute.make_tensor( + cute.recast_ptr(sA.iterator, dtype=self.c_dtype), sC_layout + ) + + thr_copy_A = tiled_copy_A.get_slice(tidx) + thr_copy_B = tiled_copy_B.get_slice(tidx) + thr_copy_C = tiled_copy_C.get_slice(tidx) + tAgA = thr_copy_A.partition_S(gA) + tAsA = thr_copy_A.partition_D(sA) + tBgB = thr_copy_B.partition_S(gB) + tBsB = thr_copy_B.partition_D(sB) + tCsC_epilogue = thr_copy_C.partition_S(sC) + tCgC_epilogue = thr_copy_C.partition_D(gC) + + # Repeat the partitioning with identity layouts + tAcA = thr_copy_A.partition_S(cA) + tBcB = thr_copy_B.partition_S(cB) + + # /////////////////////////////////////////////////////////////////////////////// + # Predicate: Mark indices that need to copy when problem_shape isn't a multiple + # of tile_shape + # /////////////////////////////////////////////////////////////////////////////// + + # For predication over the tensors A (M/K), B (N/K), and (in the + # epilogue) C (M/N), we will compute it in a fashion similar to an + # outer product. The predication along one of the dimensions is + # evaluated and stored in a predication tensor. Then, the + # predication for the remaining dimension is handled later via an + # if/else branch at the copy. + # For A and B, predication booleans along M/N are stored in a + # predication tensor and along K is handled via a if/else branch. + + # Allocate predicate tensors for M and N. 
Predication is checked + # at the granularity of a copy atom, so the predicate tensor does not + # need separate booleans for individual elements within a copy + # atom (for example, the elements of tAgA.shape[0][0].) + tApA = cute.make_fragment( + cute.make_layout( + ( + tAgA.shape[0][1], + cute.size(tAgA, mode=[1]), + cute.size(tAgA, mode=[2]), + ), + stride=(cute.size(tAgA, mode=[1]), 1, 0), + ), + cutlass.Boolean, + ) + tBpB = cute.make_fragment( + cute.make_layout( + ( + tBsB.shape[0][1], + cute.size(tBsB, mode=[1]), + cute.size(tBsB, mode=[2]), + ), + stride=(cute.size(tBsB, mode=[1]), 1, 0), + ), + cutlass.Boolean, + ) + # Set predicates for M/N bounds + for rest_v in range(tApA.shape[0]): + for m in range(tApA.shape[1]): + tApA[rest_v, m, 0] = cute.elem_less( + tAcA[(0, rest_v), m, 0, 0][0], mA.shape[0] + ) + for rest_v in range(tBpB.shape[0]): + for n in range(tBpB.shape[1]): + tBpB[rest_v, n, 0] = cute.elem_less( + tBcB[(0, rest_v), n, 0, 0][0], mB.shape[0] + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Prefetch Prologue + # /////////////////////////////////////////////////////////////////////////////// + # Clear the smem tiles to account for predicated off loads + tAsA.fill(0) + tBsB.fill(0) + cute.arch.sync_threads() + # Start async loads for the first k-tile. Here we take care of the k residue + # via if/else check along the k dimension. 
Because we shifted the identity tensor + # by the residue_k and because the identity tensor is a counting tensor, the + # values of any identity tensor element that is poison is less than -1 + num_smem_stages = cute.size(tAsA, mode=[3]) + k_tile_count = cute.size(tAgA, mode=[3]) + k_tile_index = cutlass.Int32(0) + + for k in range(tApA.shape[2]): + if cute.elem_less(cutlass.Int32(-1), tAcA[0, 0, k, 0][1]): + cute.copy( + tiled_copy_A, + tAgA[None, None, k, k_tile_index], + tAsA[None, None, k, 0], + pred=tApA[None, None, k], + ) + for k in range(tBpB.shape[2]): + if cute.elem_less(cutlass.Int32(-1), tBcB[0, 0, k, 0][1]): + cute.copy( + tiled_copy_B, + tBgB[None, None, k, k_tile_index], + tBsB[None, None, k, 0], + pred=tBpB[None, None, k], + ) + k_tile_index = k_tile_index + 1 + cute.arch.cp_async_commit_group() + + # Start async loads for rest of the k-tiles + for k_tile in range(1, num_smem_stages - 1): + if k_tile == k_tile_count: + tApA.fill(0) + tBpB.fill(0) + cute.copy( + tiled_copy_A, + tAgA[None, None, None, k_tile_index], + tAsA[None, None, None, k_tile], + pred=tApA, + ) + cute.copy( + tiled_copy_B, + tBgB[None, None, None, k_tile_index], + tBsB[None, None, None, k_tile], + pred=tBpB, + ) + k_tile_index = k_tile_index + 1 + cute.arch.cp_async_commit_group() + + # /////////////////////////////////////////////////////////////////////////////// + # Tile MMA compute thread partitions and allocate accumulators + # /////////////////////////////////////////////////////////////////////////////// + thr_mma = tiled_mma.get_slice(tidx) + tCsA = thr_mma.partition_A(sA) + tCsB = thr_mma.partition_B(sB) + tCsC = thr_mma.partition_C(sC) + tCgC = thr_mma.partition_C(gC) + tCrA = tiled_mma.make_fragment_A(tCsA[None, None, None, 0]) + tCrB = tiled_mma.make_fragment_B(tCsB[None, None, None, 0]) + tCrC = tiled_mma.make_fragment_C(tCgC) + # Clear the accumulator + tCrC.fill(0.0) + + # /////////////////////////////////////////////////////////////////////////////// + # Copy Atom 
A/B retiling + # /////////////////////////////////////////////////////////////////////////////// + + # Create the copy atoms for the copy from shared memory to register + atom_copy_s2r_A = cute.make_copy_atom( + cute.nvgpu.warp.LdMatrix8x8x16bOp( + self.a_major_mode != utils.LayoutEnum.ROW_MAJOR, 4 + ), + mA.element_type, + ) + atom_copy_s2r_B = cute.make_copy_atom( + cute.nvgpu.warp.LdMatrix8x8x16bOp( + self.b_major_mode != utils.LayoutEnum.ROW_MAJOR, 4 + ), + mB.element_type, + ) + + # Creates the tiled copy so that it matches the thread-value layout + # expected by the tiled mma + tiled_copy_s2r_A = cute.make_tiled_copy( + atom_copy_s2r_A, + layout_tv=tiled_mma.tv_layout_A_tiled, + tiler_mn=(tiled_mma.get_tile_size(0), tiled_mma.get_tile_size(2)), + ) + tiled_copy_s2r_B = cute.make_tiled_copy( + atom_copy_s2r_B, + layout_tv=tiled_mma.tv_layout_B_tiled, + tiler_mn=(tiled_mma.get_tile_size(1), tiled_mma.get_tile_size(2)), + ) + + thr_copy_ldmatrix_A = tiled_copy_s2r_A.get_slice(tidx) + thr_copy_ldmatrix_B = tiled_copy_s2r_B.get_slice(tidx) + tCsA_copy_view = thr_copy_ldmatrix_A.partition_S(sA) + tCrA_copy_view = thr_copy_ldmatrix_A.retile(tCrA) + tCsB_copy_view = thr_copy_ldmatrix_B.partition_S(sB) + tCrB_copy_view = thr_copy_ldmatrix_B.retile(tCrB) + + # Current pipe index in smem to read from / write to + smem_pipe_read = 0 + smem_pipe_write = num_smem_stages - 1 + + tCsA_p = tCsA_copy_view[None, None, None, smem_pipe_read] + tCsB_p = tCsB_copy_view[None, None, None, smem_pipe_read] + + # /////////////////////////////////////////////////////////////////////////////// + # PREFETCH register pipeline + # /////////////////////////////////////////////////////////////////////////////// + num_k_block = cute.size(tCrA, mode=[2]) + if num_k_block > 1: + # Wait until our first prefetched tile is loaded in + cute.arch.cp_async_wait_group(num_smem_stages - 2) + cute.arch.sync_threads() + # Prefetch the first k-block rmem from the first k-tile + cute.copy( + 
tiled_copy_s2r_A, + tCsA_p[None, None, 0], + tCrA_copy_view[None, None, 0], + ) + cute.copy( + tiled_copy_s2r_B, + tCsB_p[None, None, 0], + tCrB_copy_view[None, None, 0], + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Mainloop + # 1. Shared memory pipeline (gmem -> smem): + # The default smem pipeline depth is 3, meaning that for shared + # memory buffers, we allocate three times the size described by the + # CTA tiler. We prefetch 2 of these buffers before entering the main + # loop. Considering only the transfer from global memory to shared + # memory, the general structure of the mainloop is: + # (1) copy k-tile from gmem to smem; + # (2) perform gemm computation on k-tile; + # (3) wait for the next copy to finish. + # The `cute.arch.cp_async_wait_group(num_smem_stages - 2)` command + # waits for the number of unfinished 'copy' to be <= 1. The advantage + # of this approach is that it allows for simultaneous production + # (i.e., step (1)) and consumption (i.e., step (2)) of smem. + # A common misconception is to prefetch N buffers and rewrite + # the pipeline logic to wait on N-1 pending copies. The disadvantage + # of this approach is that it requires fully consuming a buffer in + # order to open an empty buffer for the next copy. + # 2. Register pipeline (smem -> register): + # Similarly, the register pipeline produces i+1, consumes i, and + # produces i+2... Notably, i and i+1 do not use the same register, + # eliminating dependencies on the same register for better parallelism. + # 3. Combining the smem and register pipelines results in the mainloop. 
+ # /////////////////////////////////////////////////////////////////////////////// + for k_tile in cutlass.range_dynamic(k_tile_count, unroll=1): + for k_block in range(num_k_block): + if k_block == num_k_block - 1: + tCsA_p = tCsA_copy_view[None, None, None, smem_pipe_read] + tCsB_p = tCsB_copy_view[None, None, None, smem_pipe_read] + cute.arch.cp_async_wait_group(num_smem_stages - 2) + cute.arch.sync_threads() + + # Load A, B from shared memory to registers for k_block + 1 + k_block_next = (k_block + 1) % num_k_block # static + cute.copy( + tiled_copy_s2r_A, + tCsA_p[None, None, k_block_next], + tCrA_copy_view[None, None, k_block_next], + ) + cute.copy( + tiled_copy_s2r_B, + tCsB_p[None, None, k_block_next], + tCrB_copy_view[None, None, k_block_next], + ) + + # Fetch next A: To better interleave global memory access and compute + # instructions, we intentionally use the sequence: copy A, perform GEMM, + # then copy B. + if k_block == 0: + if k_tile + num_smem_stages - 1 < k_tile_count: + cute.copy( + tiled_copy_A, + tAgA[None, None, None, k_tile_index], + tAsA[None, None, None, smem_pipe_write], + pred=tApA, + ) + + # Thread-level register gemm for k_block + cute.gemm( + tiled_mma, + tCrC, + tCrA[None, None, k_block], + tCrB[None, None, k_block], + tCrC, + ) + + # Fetch next B and update smem pipeline read/write + if k_block == 0: + if k_tile + num_smem_stages - 1 < k_tile_count: + cute.copy( + tiled_copy_B, + tBgB[None, None, None, k_tile_index], + tBsB[None, None, None, smem_pipe_write], + pred=tBpB, + ) + k_tile_index = k_tile_index + 1 + cute.arch.cp_async_commit_group() + smem_pipe_write = smem_pipe_read + smem_pipe_read = smem_pipe_read + 1 + if smem_pipe_read == num_smem_stages: + smem_pipe_read = 0 + + # Sync before epilogue + cute.arch.cp_async_wait_group(0) + cute.arch.sync_threads() + + # /////////////////////////////////////////////////////////////////////////////// + # Epilogue with fusion + # 
/////////////////////////////////////////////////////////////////////////////// + tCrD = cute.make_fragment_like(tCrC, self.c_dtype) + tCrD[None] = epilogue_op(tCrC.load()).to(self.c_dtype) + + # Copy results of D back to shared memory + cute.autovec_copy(tCrD, tCsC) + + # Create counting tensor for C + ceilM, ceilN, _ = cute.ceil_div(mC.shape, (self.bM, self.bN, 1)) + mcC = cute.make_identity_tensor( + ( + cute.size(ceilM) * self.cta_tiler[0], + cute.size(ceilN) * self.cta_tiler[1], + 1, + ) + ) + cC = cute.local_tile( + mcC[None, None, bidz], + tiler=self.cta_tiler, + coord=tiler_coord, + proj=(1, 1, None), + ) + tCcC = thr_copy_C.partition_S(cC) + + tCrC_epilogue = cute.make_fragment_like(tCsC_epilogue) + # Wait for all writes to shared memory to finish before starting copies + # using the new layouts + cute.arch.sync_threads() + cute.autovec_copy(tCsC_epilogue, tCrC_epilogue) + + # Create predication tensor for m + tCpC = cute.make_fragment( + cute.make_layout( + ( + tCgC_epilogue.shape[0][1], + cute.size(tCgC_epilogue, mode=[1]), + cute.size(tCgC_epilogue, mode=[2]), + ), + stride=(cute.size(tCgC_epilogue, mode=[1]), 1, 0), + ), + cutlass.Boolean, + ) + for rest_v in range(tCpC.shape[0]): + for m in range(tCpC.shape[1]): + tCpC[rest_v, m, 0] = cute.elem_less( + tCcC[(0, rest_v), m, 0][0], mC.shape[0] + ) + + # Copy to global memory using better vectorization + for rest_v in range(tCpC.shape[0]): + for n in range(tCpC.shape[2]): + if cute.elem_less(tCcC[(0, rest_v), 0, n][1], mC.shape[1]): + cute.copy( + tiled_copy_C, + tCrC_epilogue[None, None, n], + tCgC_epilogue[None, None, n], + pred=tCpC[None, None, n], + ) + return + + def _make_smem_layout_AB(self, dtype, major_mode, copy_bits, smem_tiler): + major_mode_size = ( + smem_tiler[1] if major_mode == utils.LayoutEnum.ROW_MAJOR else smem_tiler[0] + ) + major_mode_size = 64 if major_mode_size >= 64 else major_mode_size + + swizzle_bits = int(math.log2(major_mode_size * dtype.width // copy_bits)) + swizzle_bits = 
min(swizzle_bits, 3) + + layout_atom_outer = ( + cute.make_layout((8, major_mode_size), stride=(major_mode_size, 1)) + if major_mode == utils.LayoutEnum.ROW_MAJOR + else cute.make_layout((major_mode_size, 8), stride=(1, major_mode_size)) + ) + layout_atom = cute.make_composed_layout( + cute.make_swizzle(swizzle_bits, 3, 3), + 0, + layout_atom_outer, + ) + layout = cute.tile_to_shape(layout_atom, smem_tiler, (0, 1, 2)) + return layout + + def _make_smem_layout_C(self, dtype, major_mode, copy_bits, smem_tiler): + major_mode_size = ( + smem_tiler[1] if major_mode == utils.LayoutEnum.ROW_MAJOR else smem_tiler[0] + ) + + swizzle_bits = int(math.log2(major_mode_size * dtype.width // copy_bits)) + swizzle_bits = min(swizzle_bits, 3) + + layout_atom_outer = ( + cute.make_layout((8, major_mode_size), stride=(major_mode_size, 1)) + if major_mode == utils.LayoutEnum.ROW_MAJOR + else cute.make_layout((major_mode_size, 8), stride=(1, major_mode_size)) + ) + layout_atom = cute.make_composed_layout( + cute.make_swizzle(swizzle_bits, 3, 4), + 0, + layout_atom_outer, + ) + + # Due to the thread layout of the mma, remove swizzle in C to + # prevent shared memory fragments owned by an single thread from + # holding swizzles + if major_mode == utils.LayoutEnum.COL_MAJOR: + layout_atom = cute.make_composed_layout( + cute.make_swizzle(0, 3, 4), 0, layout_atom_outer + ) + layout = cute.tile_to_shape( + layout_atom, + smem_tiler, + (0, 1), + ) + return layout + + def _make_gmem_tiled_copy_AB(self, atom_copy, dtype, major_mode, copy_bits): + copy_elems = copy_bits // dtype.width + shape_dim_1 = cute.size(self.bK) // copy_elems + # thread layout for copy + thread_layout = cute.make_layout( + (self.num_threads // shape_dim_1, shape_dim_1), stride=(shape_dim_1, 1) + ) + if major_mode != utils.LayoutEnum.ROW_MAJOR: + shape_dim_0 = cute.size(self.bM) // copy_elems + thread_layout = cute.make_layout( + (shape_dim_0, self.num_threads // shape_dim_0), stride=(1, shape_dim_0) + ) + # Value layout 
for copy + value_layout = ( + cute.make_layout((1, copy_elems)) + if major_mode == utils.LayoutEnum.ROW_MAJOR + else cute.make_layout((copy_elems, 1)) + ) + return cute.make_tiled_copy_tv(atom_copy, thread_layout, value_layout) + + def _make_gmem_tiled_copy_C(self, atom_copy, dtype, major_mode, copy_bits): + copy_elems = copy_bits // dtype.width + shape_dim_1 = cute.size(self.bN) // copy_elems + # thread layout for copy + thread_layout = cute.make_layout( + (self.num_threads // shape_dim_1, shape_dim_1), stride=(shape_dim_1, 1) + ) + if major_mode != utils.LayoutEnum.ROW_MAJOR: + shape_dim_0 = cute.size(self.bM) // copy_elems + thread_layout = cute.make_layout( + (shape_dim_0, self.num_threads // shape_dim_0), stride=(1, shape_dim_0) + ) + value_layout = ( + cute.make_layout((1, copy_elems)) + if major_mode == utils.LayoutEnum.ROW_MAJOR + else cute.make_layout((copy_elems, 1)) + ) + tiler_mn, layout_tv = cute.make_layout_tv(thread_layout, value_layout) + return cute.make_tiled_copy(atom_copy, layout_tv, tiler_mn) + + +def run_tensor_op_gemm( + a_major: str, + b_major: str, + c_major: str, + ab_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + acc_dtype: Type[cutlass.Numeric], + problem_shape: Tuple[int, int, int, int], + atom_layout_mnk: Tuple[int, int, int], + warmup_iterations: int = 2, + iterations: int = 100, + skip_ref_check: bool = False, +): + M, N, K, L = problem_shape + + # Create and permute tensor A/B/C + def create_and_permute_tensor(l, mode0, mode1, is_mode0_major, dtype): + # is_mode0_major: (l, mode1, mode0) -> (mode0, mode1, l) + # else: (l, mode0, mode1) -> (mode0, mode1, l) + shape = (l, mode1, mode0) if is_mode0_major else (l, mode0, mode1) + permute_order = (2, 1, 0) if is_mode0_major else (1, 2, 0) + + return ( + torch.empty(*shape, dtype=torch.int32) + .random_(-2, 2) + .to(dtype=dtype) + .permute(permute_order) + .cuda() + ) + + a = create_and_permute_tensor( + L, M, K, a_major == "m", cutlass_torch.dtype(ab_dtype) + ) + b = 
create_and_permute_tensor( + L, N, K, b_major == "n", cutlass_torch.dtype(ab_dtype) + ) + c = create_and_permute_tensor(L, M, N, c_major == "m", cutlass_torch.dtype(c_dtype)) + ref = torch.einsum("mkl,nkl->mnl", a, b).to(cutlass_torch.dtype(c_dtype)) + + tensor_op_gemm = TensorOpGemm( + ab_dtype, + c_dtype, + acc_dtype, + atom_layout_mnk, + ) + + # assume input is 16B aligned + a_tensor = ( + from_dlpack(a, assumed_align=16) + .mark_layout_dynamic(leading_dim=(1 if a_major == "k" else 0)) + .mark_compact_shape_dynamic( + mode=(1 if a_major == "k" else 0), + stride_order=(2, 0, 1) if a_major == "k" else (2, 1, 0), + divisibility=(128 // ab_dtype.width), + ) + ) + b_tensor = ( + from_dlpack(b, assumed_align=16) + .mark_layout_dynamic(leading_dim=(1 if b_major == "k" else 0)) + .mark_compact_shape_dynamic( + mode=(1 if b_major == "k" else 0), + stride_order=(2, 0, 1) if b_major == "k" else (2, 1, 0), + divisibility=(128 // ab_dtype.width), + ) + ) + c_tensor = ( + from_dlpack(c, assumed_align=16) + .mark_layout_dynamic(leading_dim=(1 if c_major == "n" else 0)) + .mark_compact_shape_dynamic( + mode=(1 if c_major == "n" else 0), + stride_order=(2, 0, 1) if c_major == "n" else (2, 1, 0), + divisibility=(128 // c_dtype.width), + ) + ) + + print("Compiling kernel with cute.compile ...") + gemm = cute.compile(tensor_op_gemm, a_tensor, b_tensor, c_tensor) + + print("Executing GEMM kernel...") + + # Warmup + for _ in range(warmup_iterations): + gemm(a_tensor, b_tensor, c_tensor) + + # Execute the kernel + for _ in range(iterations): + gemm(a_tensor, b_tensor, c_tensor) + + if not skip_ref_check: + print("Verifying results...") + torch.testing.assert_close(c.cpu(), ref.cpu(), atol=1e-03, rtol=1e-05) + print("Results verified successfully!") + + +if __name__ == "__main__": + + def parse_comma_separated_ints(s: str) -> Tuple[int, ...]: + try: + return tuple(int(x.strip()) for x in s.split(",")) + except ValueError: + raise argparse.ArgumentTypeError( + "Invalid format. 
Expected comma-separated integers." + ) + + parser = argparse.ArgumentParser( + description="example of multistage block matmul with CuTe on GPU" + ) + parser.add_argument( + "--mnkl", type=parse_comma_separated_ints, default=(112, 136, 40, 1) + ) + parser.add_argument( + "--atom_layout_mnk", type=parse_comma_separated_ints, default=(2, 2, 1) + ) + parser.add_argument( + "--ab_dtype", + type=cutlass.dtype, + choices=[cutlass.Float16], + default=cutlass.Float16, + ) + parser.add_argument( + "--acc_dtype", + type=cutlass.dtype, + choices=[cutlass.Float32], + default=cutlass.Float32, + ) + parser.add_argument( + "--c_dtype", + type=cutlass.dtype, + choices=[cutlass.Float16], + default=cutlass.Float16, + ) + parser.add_argument("--a_major", choices=["k", "m"], default="m") + parser.add_argument("--b_major", choices=["k", "n"], default="n") + parser.add_argument("--c_major", choices=["n", "m"], default="n") + parser.add_argument("--warmup_iterations", default=2, type=int) + parser.add_argument("--iterations", default=100, type=int) + parser.add_argument("--skip_ref_check", action="store_true") + + args = parser.parse_args() + print("Running Ampere tensor core GEMM example:") + run_tensor_op_gemm( + args.a_major, + args.b_major, + args.c_major, + args.ab_dtype, + args.c_dtype, + args.acc_dtype, + args.mnkl, + args.atom_layout_mnk, + args.warmup_iterations, + args.iterations, + args.skip_ref_check, + ) + print("PASS") diff --git a/examples/python/CuTeDSL/blackwell/dense_gemm.py b/examples/python/CuTeDSL/blackwell/dense_gemm.py new file mode 100644 index 00000000..89696c8a --- /dev/null +++ b/examples/python/CuTeDSL/blackwell/dense_gemm.py @@ -0,0 +1,1922 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +from typing import Optional, Type, Tuple, Union +import cuda.bindings.driver as cuda + +import torch + +import cutlass +import cutlass.cute as cute +import cutlass.utils as utils +from cutlass.cute.nvgpu import cpasync, tcgen05 +import cutlass.torch as cutlass_torch +import cutlass.utils.blackwell_helpers as sm100_utils +from cutlass.cute.runtime import from_dlpack + +""" +A high-performance batched dense GEMM (C = A * B) example for the NVIDIA Blackwell SM100 architecture +using CUTE DSL. 
+- Matrix A is MxKxL, L is batch dimension, A can be row-major("K") or column-major("M") +- Matrix B is NxKxL, L is batch dimension, B can be row-major("N") or column-major("K") +- Matrix C is MxNxL, L is batch dimension, C can be row-major("N") or column-major("M") + +This GEMM kernel supports the following features: + - Utilizes Tensor Memory Access (TMA) for efficient memory operations + - Utilizes Blackwell's tcgen05.mma for matrix multiply-accumulate (MMA) operations (including 2cta mma instructions) + - Implements TMA multicast with cluster to reduce L2 memory traffic + - Supports multi-stage pipeline to overlap computation and memory access + +This GEMM works as follows: +1. Load A and B matrices from global memory (GMEM) to shared memory (SMEM) using TMA operations. +2. Perform matrix multiply-accumulate (MMA) operations using tcgen05.mma instruction. +3. Load completed accumulator from tensor memory (TMEM) to registers (RMEM) using tcgen05.ld. +4. Type convert C matrix to output type. +5. Optionally store C matrix from registers (RMEM) to shared memory (SMEM) to global memory (GMEM) with TMA operations, + or directly store C matrix from registers (RMEM) to global memory (GMEM) without TMA operations. +6. Optionally accept an elementwise lambda function epilogue_op to apply to the output tensor: + e.g., relu can set epilogue_op = lambda x: cute.where(x > 0, x, cute.full_like(x, 0)) + +SM100 tcgen05.mma instructions operate as follows: +- Read matrix A from SMEM +- Read matrix B from SMEM +- Write accumulator to TMEM +The accumulator in TMEM must then be loaded to registers before writing back to GMEM. + +To run this example: + +.. 
code-block:: bash + + python examples/blackwell/dense_gemm.py \ + --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32 \ + --mma_tiler_mn 256,128 --cluster_shape_mn 2,1 \ + --mnkl 8192,8192,8192,1 \ + --use_tma_store --use_2cta_instrs + +The above example command compute batched gemm with M=8192, N=8192, K=8192, +batch_count=1. The Blackwell tcgen05 MMA tile shape used 2 cta with 256x128 +MMA tile and the cluster shape is (2,1). The input, mma accumulator and output +data type are set as fp16, fp32 and fp16, respectively. + +To collect performance with NCU profiler: + +.. code-block:: bash + + ncu python examples/blackwell/dense_gemm.py \ + --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32 \ + --mma_tiler_mn 256,128 --cluster_shape_mn 2,1 \ + --mnkl 8192,8192,8192,1 \ + --use_tma_store --use_2cta_instrs + +Constraints: +* Supported input data types: fp16, bf16, tf32, int8, uint8, fp8 (e4m3fn, e5m2), + see detailed valid dtype combinations in below DenseGemmKernel class documentation +* A/B tensor must have the same data type +* Mma tiler M must be 64/128 (use_2cta_instrs=False) or 128/256 (use_2cta_instrs=True) +* Mma tiler N must be 32-256, step 32 +* Cluster shape M/N must be positive and power of 2, total cluster size <= 16 +* Cluster shape M must be multiple of 2 if use_2cta_instrs=True +* The contiguous dimension of A/B/C tensors must be at least 16 bytes aligned, + i.e, number of elements is a multiple of 4, 8, and 16 for TFloat32, + Float16/BFloat16, and Int8/Uint8/Float8, respectively. +* OOB tiles are not allowed when TMA store is disabled +""" + + +class DenseGemmKernel: + """ + This class implements batched matrix multiplication (C = A x B) with support for various data types + and architectural features specific to Blackwell GPUs. 
+ + :param acc_dtype: Data type for accumulation during computation + :type acc_dtype: type[cutlass.Numeric] + :param use_2cta_instrs: Whether to use CTA group 2 for advanced thread cooperation + :type use_2cta_instrs: bool + :param mma_tiler_mn: Shape of the Matrix Multiply-Accumulate (MMA) tiler (M,N) + :type mma_tiler_mn: Tuple[int, int] + :param cluster_shape_mn: Cluster dimensions (M,N) for parallel processing + :type cluster_shape_mn: Tuple[int, int] + :param use_tma_store: Whether to use Tensor Memory Access (TMA) for storing results + :type use_tma_store: bool + + :note: In current version, A and B tensor must have the same data type + - i.e., Float8E4M3FN for A and Float8E5M2 for B is not supported + + :note: Supported A/B data types: + - TFloat32 + - Float16/BFloat16 + - Int8/Uint8 + - Float8E4M3FN/Float8E5M2 + + :note: Supported accumulator data types: + - Float32 (for all floating point A/B data types) + - Float16 (only for fp16 and fp8 A/B data types) + - Int32 (only for uint8/int8 A/B data types) + + :note: Supported C data types: + - Float32 (for float32 and int32 accumulator data types) + - Int32 (for float32 and int32 accumulator data types) + - Float16/BFloat16 (for fp16 and fp8 accumulator data types) + - Int8/Uint8 (for uint8/int8 accumulator data types) + - Float8E4M3FN/Float8E5M2 (for float32 accumulator data types) + + :note: Constraints: + - MMA tiler M must be 64/128 (use_2cta_instrs=False) or 128/256 (use_2cta_instrs=True) + - MMA tiler N must be 32-256, step 32 + - Cluster shape M must be multiple of 2 if use_2cta_instrs=True + - Cluster shape M/N must be positive and power of 2, total cluster size <= 16 + + Example: + >>> gemm = DenseGemmKernel( + ... acc_dtype=cutlass.Float32, + ... use_2cta_instrs=True, + ... mma_tiler_mn=(128, 128), + ... cluster_shape_mn=(2, 2) + ... 
) + >>> gemm(a_tensor, b_tensor, c_tensor, stream) + """ + + def __init__( + self, + acc_dtype: Type[cutlass.Numeric], + use_2cta_instrs: bool, + mma_tiler_mn: Tuple[int, int], + cluster_shape_mn: Tuple[int, int], + use_tma_store: bool, + ): + """Initializes the configuration for a Blackwell dense GEMM kernel. + + This configuration includes several key aspects: + + 1. MMA Instruction Settings (tcgen05): + - acc_dtype: Data types for MMA accumulator. + - mma_tiler_mn: The (M, N) shape of the MMA instruction tiler. + - use_2cta_instrs: Boolean indicating if the tcgen05 MMA variant + with cta_group=2 should be used. + + 2. Cluster Shape: + - cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster. + + 3. Output C tensor store mode: + - use_tma_store: Boolean indicating whether to use Tensor Memory Access (TMA) for storing results. + + :param acc_dtype: Data type of the accumulator. + :type acc_dtype: type[cutlass.Numeric] + :param mma_tiler_mn: Tuple (M, N) shape of the MMA instruction. + :type mma_tiler_mn: Tuple[int, int] + :param use_2cta_instrs: Boolean, True to use cta_group=2 MMA variant. + :type use_2cta_instrs: bool + :param cluster_shape_mn: Tuple (ClusterM, ClusterN) shape of the cluster. + :type cluster_shape_mn: Tuple[int, int] + :param use_tma_store: Use Tensor Memory Access (TMA) or normal store for output C tensor. 
+ :type use_tma_store: bool + """ + + self.acc_dtype: Type[cutlass.Numeric] = acc_dtype + self.use_2cta_instrs = use_2cta_instrs + self.cluster_shape_mn = cluster_shape_mn + # K dimension is deferred in _setup_attributes + self.mma_tiler = (*mma_tiler_mn, 1) + self.use_tma_store = use_tma_store + + self.cta_group = ( + tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE + ) + + self.occupancy = 1 + self.threads_per_cta = 128 + self.num_smem_capacity = sm100_utils.SMEM_CAPACITY["sm100"] + + def _setup_attributes(self): + """Set up configurations that are dependent on GEMM inputs + + This method configures various attributes based on the input tensor properties + (data types, leading dimensions) and kernel settings: + - Configuring tiled MMA + - Computing MMA/cluster/tile shapes + - Computing cluster layout + - Computing multicast CTAs for A/B + - Computing epilogue subtile + - Setting up A/B/C stage counts in shared memory + - Computing A/B/C shared memory layout + - Computing tensor memory allocation columns + """ + # Configure tiled mma + tiled_mma = sm100_utils.make_trivial_tiled_mma( + self.a_dtype, + self.a_major_mode, + self.b_major_mode, + self.acc_dtype, + self.cta_group, + self.mma_tiler[:2], + ) + + # Compute mma/cluster/tile shapes + mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2]) + mma_inst_tile_k = 4 + self.mma_tiler = ( + self.mma_tiler[0], + self.mma_tiler[1], + mma_inst_shape_k * mma_inst_tile_k, + ) + self.cta_tile_shape_mnk = ( + self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape), + self.mma_tiler[1], + self.mma_tiler[2], + ) + + # Compute cluster layout + self.cluster_layout_vmnk = cute.tiled_divide( + cute.make_layout((*self.cluster_shape_mn, 1)), + (tiled_mma.thr_id.shape,), + ) + + # Compute number of multicast CTAs for A/B + self.num_mcast_ctas_a = cute.size(self.cluster_layout_vmnk.shape[2]) + self.num_mcast_ctas_b = cute.size(self.cluster_layout_vmnk.shape[1]) + self.is_a_mcast = self.num_mcast_ctas_a > 1 + 
self.is_b_mcast = self.num_mcast_ctas_b > 1 + + # Compute epilogue subtile + if cutlass.const_expr(self.use_tma_store): + self.epi_tile = sm100_utils.compute_epilogue_tile_shape( + self.cta_tile_shape_mnk, + self.use_2cta_instrs, + self.c_layout, + self.c_dtype, + ) + else: + self.epi_tile = self.cta_tile_shape_mnk[:2] + + # Setup A/B/C stage count in shared memory + self.num_acc_stage, self.num_ab_stage, self.num_c_stage = self._compute_stages( + tiled_mma, + self.mma_tiler, + self.a_dtype, + self.b_dtype, + self.epi_tile, + self.c_dtype, + self.c_layout, + self.num_smem_capacity, + self.occupancy, + self.use_tma_store, + ) + + # Compute A/B/C shared memory layout + self.a_smem_layout_staged = sm100_utils.make_smem_layout_a( + tiled_mma, + self.mma_tiler, + self.a_dtype, + self.num_ab_stage, + ) + self.b_smem_layout_staged = sm100_utils.make_smem_layout_b( + tiled_mma, + self.mma_tiler, + self.b_dtype, + self.num_ab_stage, + ) + self.c_smem_layout_staged = ( + sm100_utils.make_smem_layout_epi( + self.c_dtype, + self.c_layout, + self.epi_tile, + self.num_c_stage, + ) + if cutlass.const_expr(self.use_tma_store) + else None + ) + + # Compute the number of tensor memory allocation columns + self.num_tmem_alloc_cols = self._compute_num_tmem_alloc_cols( + tiled_mma, self.mma_tiler + ) + + @cute.jit + def __call__( + self, + a: cute.Tensor, + b: cute.Tensor, + c: cute.Tensor, + stream: cuda.CUstream, + epilogue_op: cutlass.Constexpr = lambda x: x, + ): + """Execute the GEMM operation in steps: + - Setup static attributes + - Setup TMA load/store atoms and tensors + - Compute grid size + - Define shared storage for kernel + - Launch the kernel synchronously + + :param a: Input tensor A + :type a: cute.Tensor + :param b: Input tensor B + :type b: cute.Tensor + :param c: Output tensor C + :type c: cute.Tensor + :param stream: CUDA stream for asynchronous execution + :type stream: cuda.CUstream + :param epilogue_op: Optional elementwise lambda function to apply to the output 
tensor + :type epilogue_op: cutlass.Constexpr + :raises TypeError: If input data types are incompatible with the MMA instruction. + :raises AssertionError: If OOB (Out-Of-Bounds) tiles are present when TMA store is disabled. + """ + # Setup static attributes before smem/grid/tma computation + self.a_dtype: Type[cutlass.Numeric] = a.element_type + self.b_dtype: Type[cutlass.Numeric] = b.element_type + self.c_dtype: Type[cutlass.Numeric] = c.element_type + self.a_major_mode = utils.LayoutEnum.from_tensor(a).mma_major_mode() + self.b_major_mode = utils.LayoutEnum.from_tensor(b).mma_major_mode() + self.c_layout = utils.LayoutEnum.from_tensor(c) + + # Check if input data types are compatible with MMA instruction + if cutlass.const_expr(self.a_dtype != self.b_dtype): + raise TypeError(f"Type must match: {self.a_dtype} != {self.b_dtype}") + + # Setup attributes that dependent on gemm inputs + self._setup_attributes() + + tiled_mma = sm100_utils.make_trivial_tiled_mma( + self.a_dtype, + self.a_major_mode, + self.b_major_mode, + self.acc_dtype, + self.cta_group, + self.mma_tiler[:2], + ) + atom_thr_size = cute.size(tiled_mma.thr_id.shape) + + # Setup TMA load for A + a_op = self._get_tma_atom_kind(atom_thr_size, self.is_a_mcast) + a_smem_layout = cute.slice_(self.a_smem_layout_staged, (None, None, None, 0)) + tma_atom_a, tma_tensor_a = cute.nvgpu.make_tma_tile_atom_A( + a_op, + a, + a_smem_layout, + self.mma_tiler, + tiled_mma, + self.cluster_layout_vmnk.shape, + internal_type=( + cutlass.TFloat32 if a.element_type is cutlass.Float32 else None + ), + ) + + # Setup TMA load for B + b_op = self._get_tma_atom_kind(atom_thr_size, self.is_b_mcast) + b_smem_layout = cute.slice_(self.b_smem_layout_staged, (None, None, None, 0)) + tma_atom_b, tma_tensor_b = cute.nvgpu.make_tma_tile_atom_B( + b_op, + b, + b_smem_layout, + self.mma_tiler, + tiled_mma, + self.cluster_layout_vmnk.shape, + internal_type=( + cutlass.TFloat32 if b.element_type is cutlass.Float32 else None + ), + ) + + 
a_copy_size = cute.size_in_bytes(self.a_dtype, a_smem_layout) + b_copy_size = cute.size_in_bytes(self.b_dtype, b_smem_layout) + self.num_tma_load_bytes = (a_copy_size + b_copy_size) * atom_thr_size + + # Setup store for C + tma_atom_c = None + tma_tensor_c = None + if cutlass.const_expr(self.use_tma_store): + c_cta_v_layout = cute.composition( + cute.make_identity_layout(c.shape), self.epi_tile + ) + epi_smem_layout = cute.slice_(self.c_smem_layout_staged, (None, None, 0)) + tma_atom_c, tma_tensor_c = cpasync.make_tma_tile_atom( + cpasync.CopyBulkTensorTileS2GOp(), + c, + epi_smem_layout, + c_cta_v_layout, + ) + + # Compute grid size + grid = self._compute_grid(c, self.cta_tile_shape_mnk, self.cluster_shape_mn) + + self.buffer_align_bytes = 1024 + + c_smem_size = ( + cute.cosize(self.c_smem_layout_staged.outer) + if cutlass.const_expr(self.use_tma_store) + else 0 + ) + + # Define shared storage for kernel + @cute.struct + class SharedStorage: + ab_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage] + ab_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage] + acc_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage] + tmem_dealloc_mbar_ptr: cutlass.Int64 + tmem_holding_buf: cutlass.Int32 + # (EPI_TILE_M, EPI_TILE_N, STAGE) + sC: cute.struct.Align[ + cute.struct.MemRange[ + self.c_dtype, + c_smem_size, + ], + self.buffer_align_bytes, + ] + # (MMA, MMA_M, MMA_K, STAGE) + sA: cute.struct.Align[ + cute.struct.MemRange[ + self.a_dtype, cute.cosize(self.a_smem_layout_staged.outer) + ], + self.buffer_align_bytes, + ] + # (MMA, MMA_N, MMA_K, STAGE) + sB: cute.struct.Align[ + cute.struct.MemRange[ + self.b_dtype, cute.cosize(self.b_smem_layout_staged.outer) + ], + self.buffer_align_bytes, + ] + + self.shared_storage = SharedStorage + + # Launch the kernel synchronously + self.kernel( + tiled_mma, + tma_atom_a, + tma_tensor_a, + tma_atom_b, + tma_tensor_b, + tma_atom_c, + tma_tensor_c if 
cutlass.const_expr(self.use_tma_store) else c, + self.cluster_layout_vmnk, + self.a_smem_layout_staged, + self.b_smem_layout_staged, + self.c_smem_layout_staged, + self.epi_tile, + epilogue_op, + ).launch( + grid=grid, + block=[self.threads_per_cta, 1, 1], + cluster=(*self.cluster_shape_mn, 1), + smem=self.shared_storage.size_in_bytes(), + stream=stream, + ) + return + + # GPU device kernel + @cute.kernel + def kernel( + self, + tiled_mma: cute.TiledMma, + tma_atom_a: cute.CopyAtom, + mA_mkl: cute.Tensor, + tma_atom_b: cute.CopyAtom, + mB_nkl: cute.Tensor, + tma_atom_c: Optional[cute.CopyAtom], + mC_mnl: cute.Tensor, + cluster_layout_vmnk: cute.Layout, + a_smem_layout_staged: cute.ComposedLayout, + b_smem_layout_staged: cute.ComposedLayout, + c_smem_layout_staged: Union[cute.Layout, cute.ComposedLayout, None], + epi_tile: cute.Tile, + epilogue_op: cutlass.Constexpr, + ): + """ + GPU device kernel performing the batched GEMM computation. + """ + warp_idx = cute.arch.warp_idx() + warp_idx = cute.arch.make_warp_uniform(warp_idx) + + # + # Prefetch tma descriptor + # + if warp_idx == 0: + cpasync.prefetch_descriptor(tma_atom_a) + cpasync.prefetch_descriptor(tma_atom_b) + if cutlass.const_expr(self.use_tma_store): + cpasync.prefetch_descriptor(tma_atom_c) + + use_2cta_instrs = cute.size(tiled_mma.thr_id.shape) == 2 + + # + # Setup cta/thread coordinates + # + # Coords inside cluster + bidx, bidy, bidz = cute.arch.block_idx() + mma_tile_coord_v = bidx % cute.size(tiled_mma.thr_id.shape) + is_leader_cta = mma_tile_coord_v == 0 + cta_rank_in_cluster = cute.arch.make_warp_uniform( + cute.arch.block_idx_in_cluster() + ) + block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord( + cta_rank_in_cluster + ) + # Coords outside cluster + cta_coord = (bidx, bidy, bidz) + mma_tile_coord_mnl = ( + cta_coord[0] // cute.size(tiled_mma.thr_id.shape), + cta_coord[1], + cta_coord[2], + ) + # Coords inside cta + tidx, _, _ = cute.arch.thread_idx() + + # + # Alloc and init: a+b 
full/empty, accumulator full, tensor memory dealloc barrier + # + smem = utils.SmemAllocator() + storage = smem.allocate(self.shared_storage) + + tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr + tmem_holding_buf = storage.tmem_holding_buf + + # Initialize mainloop ab_pipeline (barrier) and states + ab_pipeline_producer_group = utils.CooperativeGroup(utils.Agent.Thread) + num_tma_producer = self.num_mcast_ctas_a + self.num_mcast_ctas_b - 1 + ab_pipeline_consumer_group = utils.CooperativeGroup( + utils.Agent.Thread, num_tma_producer + ) + ab_pipeline = utils.PipelineTmaUmma.create( + barrier_storage=storage.ab_full_mbar_ptr.data_ptr(), + num_stages=self.num_ab_stage, + producer_group=ab_pipeline_producer_group, + consumer_group=ab_pipeline_consumer_group, + tx_count=self.num_tma_load_bytes, + cta_layout_vmnk=cluster_layout_vmnk, + ) + ab_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.num_ab_stage + ) + ab_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.num_ab_stage + ) + + # Initialize acc_pipeline (barrier) and states + acc_pipeline_producer_group = utils.CooperativeGroup(utils.Agent.Thread) + acc_pipeline_consumer_group = utils.CooperativeGroup( + utils.Agent.Thread, self.threads_per_cta, self.threads_per_cta + ) + acc_pipeline = utils.PipelineUmmaAsync.create( + barrier_storage=storage.acc_full_mbar_ptr.data_ptr(), + num_stages=self.num_acc_stage, + producer_group=acc_pipeline_producer_group, + consumer_group=acc_pipeline_consumer_group, + cta_layout_vmnk=cluster_layout_vmnk, + ) + acc_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.num_acc_stage + ) + acc_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.num_acc_stage + ) + + # Tensor memory dealloc barrier init + if use_2cta_instrs: + if warp_idx == 0: + num_tmem_dealloc_threads = 32 + with cute.arch.elect_one(): + cute.arch.mbarrier_init_arrive_cnt( + 
tmem_dealloc_mbar_ptr, num_tmem_dealloc_threads + ) + cute.arch.mbarrier_init_fence() + + # Cluster arrive after barrier init + if cute.size(self.cluster_shape_mn) > 1: + cute.arch.cluster_arrive_relaxed() + + # + # Setup smem tensor A/B/C + # + # (EPI_TILE_M, EPI_TILE_N, STAGE) + sC = ( + storage.sC.get_tensor( + c_smem_layout_staged.outer, swizzle=c_smem_layout_staged.inner + ) + if cutlass.const_expr(self.use_tma_store) + else None + ) + # (MMA, MMA_M, MMA_K, STAGE) + sA = storage.sA.get_tensor( + a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner + ) + # (MMA, MMA_N, MMA_K, STAGE) + sB = storage.sB.get_tensor( + b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner + ) + + # + # Compute multicast mask for A/B buffer full + # + a_full_mcast_mask = None + b_full_mcast_mask = None + if self.is_a_mcast or self.is_b_mcast or use_2cta_instrs: + a_full_mcast_mask = cpasync.create_tma_multicast_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk, mcast_mode=2 + ) + b_full_mcast_mask = cpasync.create_tma_multicast_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk, mcast_mode=1 + ) + + # + # Local_tile partition global tensors + # + # (bM, bK, loopM, loopK, loopL) + gA_mkl = cute.local_tile( + mA_mkl, cute.slice_(self.mma_tiler, (None, 0, None)), (None, None, None) + ) + # (bN, bK, loopN, loopK, loopL) + gB_nkl = cute.local_tile( + mB_nkl, cute.slice_(self.mma_tiler, (0, None, None)), (None, None, None) + ) + # (bM, bN, loopM, loopN, loopL) + gC_mnl = cute.local_tile( + mC_mnl, cute.slice_(self.mma_tiler, (None, None, 0)), (None, None, None) + ) + k_block_cnt = cute.size(gA_mkl, mode=[3]) + + # + # Partition global tensor for TiledMMA_A/B/C + # + thr_mma = tiled_mma.get_slice(mma_tile_coord_v) + # (MMA, MMA_M, MMA_K, loopM, loopK, loopL) + tCgA = thr_mma.partition_A(gA_mkl) + # (MMA, MMA_N, MMA_K, loopN, loopK, loopL) + tCgB = thr_mma.partition_B(gB_nkl) + # (MMA, MMA_M, MMA_N, loopM, loopN, loopL) + tCgC = thr_mma.partition_C(gC_mnl) + + 
# + # Partition global/shared tensor for TMA load A/B + # + # TMA load A partition_S/D + a_cta_layout = cute.make_layout( + cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape + ) + # ((atom_v, rest_v), STAGE) + # ((atom_v, rest_v), loopM, loopK, loopL) + tAsA, tAgA = cpasync.tma_partition( + tma_atom_a, + block_in_cluster_coord_vmnk[2], + a_cta_layout, + cute.group_modes(sA, 0, 3), + cute.group_modes(tCgA, 0, 3), + ) + # TMA load B partition_S/D + b_cta_layout = cute.make_layout( + cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape + ) + # ((atom_v, rest_v), STAGE) + # ((atom_v, rest_v), loopN, loopK, loopL) + tBsB, tBgB = cpasync.tma_partition( + tma_atom_b, + block_in_cluster_coord_vmnk[1], + b_cta_layout, + cute.group_modes(sB, 0, 3), + cute.group_modes(tCgB, 0, 3), + ) + + # + # Partition shared/tensor memory tensor for TiledMMA_A/B/C + # + # (MMA, MMA_M, MMA_K, STAGE) + tCrA = tiled_mma.make_fragment_A(sA) + # (MMA, MMA_N, MMA_K, STAGE) + tCrB = tiled_mma.make_fragment_B(sB) + # (MMA, MMA_M, MMA_N) + acc_shape = tiled_mma.partition_shape_C(self.mma_tiler[:2]) + # (MMA, MMA_M, MMA_N) + tCtAcc_fake = tiled_mma.make_fragment_C(acc_shape) + + # + # Cluster wait before tensor memory alloc + # + if cute.size(self.cluster_shape_mn) > 1: + cute.arch.cluster_wait() + + # + # Alloc tensor memory buffer + # + if warp_idx == 0: + cute.arch.alloc_tmem( + self.num_tmem_alloc_cols, tmem_holding_buf, is_two_cta=use_2cta_instrs + ) + + # + # Bar sync for retrieve tensor memory ptr from shared memory + # + cute.arch.barrier() + + # + # Retrieving tensor memory ptr and make accumulator tensor + # + tmem_ptr = cute.arch.retrieve_tmem_ptr( + self.acc_dtype, alignment=16, ptr_to_buffer_holding_addr=tmem_holding_buf + ) + # (MMA, MMA_M, MMA_N) + tCtAcc = cute.make_tensor(tmem_ptr, tCtAcc_fake.layout) + + # + # Partition for epilogue + # + tiled_copy_t2r, tTR_tAcc, tTR_rAcc = self.epilog_tmem_copy_and_partition( + tidx, tCtAcc, tCgC, epi_tile, use_2cta_instrs + ) + + tTR_rC 
= None + tiled_copy_r2s = None + simt_atom = None + tRS_rC = None + tRS_sC = None + bSG_sC = None + bSG_gC = None + tTR_gC = None + if cutlass.const_expr(self.use_tma_store): + tTR_rC = cute.make_fragment(tTR_rAcc.shape, self.c_dtype) + tiled_copy_r2s, tRS_rC, tRS_sC = self.epilog_smem_copy_and_partition( + tiled_copy_t2r, tTR_rC, tidx, sC + ) + tma_atom_c, bSG_sC, bSG_gC = self.epilog_gmem_copy_and_partition( + tidx, tma_atom_c, tCgC, epi_tile, sC + ) + else: + simt_atom, tTR_rC, tTR_gC = self.epilog_gmem_copy_and_partition( + tidx, tiled_copy_t2r, tCgC, epi_tile, sC + ) + + # + # Slice to per mma tile index + # + # ((atom_v, rest_v), loopK) + tAgA = tAgA[(None, mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2])] + # ((atom_v, rest_v), loopK) + tBgB = tBgB[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])] + if cutlass.const_expr(self.use_tma_store): + # ((ATOM_V, REST_V), EPI_M, EPI_N) + bSG_gC = bSG_gC[(None, None, None, *mma_tile_coord_mnl)] + else: + # (T2R, T2R_M, T2R_N, EPI_M, EPI_N) + tTR_gC = tTR_gC[(None, None, None, None, None, *mma_tile_coord_mnl)] + + # + # Pipelining TMA load A/B and MMA mainloop + # + prefetch_k_block_cnt = cutlass.min(self.num_ab_stage - 2, k_block_cnt) + + if warp_idx == 0: + # Peek (try_wait) AB buffer empty for k_block = prefetch_k_block_cnt + peek_ab_empty_status = cutlass.Boolean(1) + if ab_producer_state.count < k_block_cnt: + peek_ab_empty_status = ab_pipeline.producer_try_acquire( + ab_producer_state + ) + # + # Prefetch TMA load A/B + # + for prefetch_idx in cutlass.range_dynamic(prefetch_k_block_cnt, unroll=1): + # Conditionally wait for AB buffer empty + ab_pipeline.producer_acquire(ab_producer_state, peek_ab_empty_status) + + # TMA load A/B + cute.copy( + tma_atom_a, + tAgA[(None, ab_producer_state.count)], + tAsA[(None, ab_producer_state.index)], + tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state), + mcast_mask=a_full_mcast_mask, + ) + cute.copy( + tma_atom_b, + tBgB[(None, 
ab_producer_state.count)], + tBsB[(None, ab_producer_state.index)], + tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state), + mcast_mask=b_full_mcast_mask, + ) + + # Peek (try_wait) AB buffer empty for k_block = prefetch_k_block_cnt + k_block + 1 + ab_producer_state.advance() + peek_ab_empty_status = cutlass.Boolean(1) + if ab_producer_state.count < k_block_cnt: + peek_ab_empty_status = ab_pipeline.producer_try_acquire( + ab_producer_state + ) + + # Peek (try_wait) AB buffer full for k_block = 0 + peek_ab_full_status = cutlass.Boolean(1) + if ab_consumer_state.count < k_block_cnt and is_leader_cta: + peek_ab_full_status = ab_pipeline.consumer_try_wait(ab_consumer_state) + + # + # MMA mainloop + # + for k_block in cutlass.range_dynamic(0, k_block_cnt, 1, unroll=1): + # Conditionally wait for AB buffer empty + ab_pipeline.producer_acquire(ab_producer_state, peek_ab_empty_status) + + if ab_producer_state.count < k_block_cnt: + # TMA load A/B + cute.copy( + tma_atom_a, + tAgA[(None, ab_producer_state.count)], + tAsA[(None, ab_producer_state.index)], + tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state), + mcast_mask=a_full_mcast_mask, + ) + cute.copy( + tma_atom_b, + tBgB[(None, ab_producer_state.count)], + tBsB[(None, ab_producer_state.index)], + tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state), + mcast_mask=b_full_mcast_mask, + ) + + if is_leader_cta: + # Conditionally wait for AB buffer full + ab_pipeline.consumer_wait(ab_consumer_state, peek_ab_full_status) + + # tCtAcc += tCrA * tCrB + num_kphases = cute.size(tCrA, mode=[2]) + for kphase_idx in range(num_kphases): + kphase_coord = (None, None, kphase_idx, ab_consumer_state.index) + + cute.gemm( + tiled_mma, + tCtAcc, + tCrA[kphase_coord], + tCrB[kphase_coord], + tCtAcc, + ) + # Enable accumulate on tCtAcc after first kphase + tiled_mma.set(tcgen05.Field.ACCUMULATE, True) + + # Async arrive AB buffer empty + ab_pipeline.consumer_release(ab_consumer_state) + + # Peek (try_wait) 
AB buffer empty for k_block = prefetch_k_block_cnt + k_block + 1 + ab_producer_state.advance() + peek_ab_empty_status = cutlass.Boolean(1) + if ab_producer_state.count < k_block_cnt: + peek_ab_empty_status = ab_pipeline.producer_try_acquire( + ab_producer_state + ) + + # Peek (try_wait) AB buffer full for k_block = k_block + 1 + ab_consumer_state.advance() + peek_ab_full_status = cutlass.Boolean(1) + if ab_consumer_state.count < k_block_cnt: + if is_leader_cta: + peek_ab_full_status = ab_pipeline.consumer_try_wait( + ab_consumer_state + ) + + # Async arrive accumulator buffer full + if is_leader_cta: + acc_pipeline.producer_commit(acc_producer_state) + + # + # Epilogue + # + + # Release tensor memory allocation lock + if warp_idx == 0: + cute.arch.relinquish_tmem_alloc_permit(is_two_cta=use_2cta_instrs) + + # Wait for accumulator buffer full + acc_pipeline.consumer_wait(acc_consumer_state) + + tTR_tAcc = cute.group_modes(tTR_tAcc, 3, cute.rank(tTR_tAcc)) + if cutlass.const_expr(self.use_tma_store): + bSG_gC = cute.group_modes(bSG_gC, 1, cute.rank(bSG_gC)) + else: + tTR_gC = cute.group_modes(tTR_gC, 3, cute.rank(tTR_gC)) + + c_pipeline = None + if cutlass.const_expr(self.use_tma_store): + # Initialize tma store c_pipeline + c_producer_group = utils.CooperativeGroup( + utils.Agent.Thread, self.threads_per_cta, self.threads_per_cta + ) + c_pipeline = utils.PipelineTmaStore.create( + num_stages=self.num_c_stage, + producer_group=c_producer_group, + ) + + # + # Store accumulator to global memory in subtiles + # + subtile_cnt = cute.size(tTR_tAcc.shape, mode=[3]) + for subtile_idx in cutlass.range_dynamic(subtile_cnt): + # + # Load accumulator from tensor memory buffer to register + # + tTR_tAcc_mn = tTR_tAcc[(None, None, None, subtile_idx)] + cute.copy(tiled_copy_t2r, tTR_tAcc_mn, tTR_rAcc) + + if cutlass.const_expr(self.use_tma_store): + # + # Perform epilogue op on accumulator and convert to C type + # + acc_vec = tiled_copy_r2s.retile(tTR_rAcc).load() + acc_vec = 
epilogue_op(acc_vec.to(self.c_dtype)) + tRS_rC.store(acc_vec) + + # + # Store C to shared memory + # + c_buffer = subtile_idx % self.num_c_stage + cute.copy(tiled_copy_r2s, tRS_rC, tRS_sC[(None, None, None, c_buffer)]) + # Fence and barrier to make sure shared memory store is visible to TMA store + cute.arch.fence_proxy( + cute.arch.ProxyKind.async_shared, + space=cute.arch.SharedSpace.shared_cta, + ) + cute.arch.barrier() + + # + # TMA store C to global memory + # + if warp_idx == 0: + cute.copy( + tma_atom_c, + bSG_sC[(None, c_buffer)], + bSG_gC[(None, subtile_idx)], + ) + # Fence and barrier to make sure TMA store is completed to recollect C buffer + c_pipeline.producer_commit() + c_pipeline.producer_acquire() + cute.arch.barrier() + else: + # + # Perform epilogue op on accumulator and convert to C type + # + acc_vec = tTR_rAcc.load() + acc_vec = epilogue_op(acc_vec.to(self.c_dtype)) + tTR_rC.store(acc_vec) + + # + # Store C to global memory + # + cute.copy(simt_atom, tTR_rC, tTR_gC[(None, None, None, subtile_idx)]) + + # + # Dealloc the tensor memory buffer + # + cute.arch.barrier() + if warp_idx == 0: + if use_2cta_instrs: + cute.arch.mbarrier_arrive( + tmem_dealloc_mbar_ptr, cta_rank_in_cluster ^ 1 + ) + cute.arch.mbarrier_wait(tmem_dealloc_mbar_ptr, 0) + cute.arch.dealloc_tmem( + tmem_ptr, self.num_tmem_alloc_cols, is_two_cta=use_2cta_instrs + ) + + # + # Wait for C store complete + # + if cutlass.const_expr(self.use_tma_store): + c_pipeline.producer_tail() + + # + # Wait A/B buffer empty + # + if warp_idx == 0: + # Reverse prefetch_k_block_cnt times to next available buffer + for i in cutlass.range_dynamic(prefetch_k_block_cnt): + ab_producer_state.reverse() + ab_pipeline.producer_tail(ab_producer_state) + return + + def epilog_tmem_copy_and_partition( + self, + tidx: cutlass.Int32, + tAcc: cute.Tensor, + gC_mnl: cute.Tensor, + epi_tile: cute.Tile, + use_2cta_instrs: Union[cutlass.Boolean, bool], + ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]: + """ 
+ Make tiledCopy for tensor memory load, then use it to partition tensor memory (source) and register array (destination). + + :param tidx: The thread index in epilogue warp groups + :type tidx: cutlass.Int32 + :param tAcc: The accumulator tensor to be copied and partitioned + :type tAcc: cute.Tensor + :param gC_mnl: The global tensor C + :type gC_mnl: cute.Tensor + :param epi_tile: The epilogue tiler + :type epi_tile: cute.Tile + :param use_2cta_instrs: Whether use_2cta_instrs is enabled + :type use_2cta_instrs: bool + + :return: A tuple containing (tiled_copy_t2r, tTR_tAcc, tTR_rAcc) where: + - tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r) + - tTR_tAcc: The partitioned accumulator tensor + - tTR_rAcc: The accumulated tensor in register used to hold t2r results + :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor] + """ + # Make tiledCopy for tensor memory load + copy_atom_t2r = sm100_utils.get_tmem_load_op( + self.cta_tile_shape_mnk, + self.c_layout, + self.c_dtype, + self.acc_dtype, + epi_tile, + use_2cta_instrs, + ) + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N) + tAcc_epi = cute.flat_divide( + tAcc[((None, None), 0, 0)], + epi_tile, + ) + # (EPI_TILE_M, EPI_TILE_N) + tiled_copy_t2r = tcgen05.make_tmem_copy( + copy_atom_t2r, tAcc_epi[(None, None, 0, 0)] + ) + + thr_copy_t2r = tiled_copy_t2r.get_slice(tidx) + # (T2R, T2R_M, T2R_N, EPI_M, EPI_M) + tTR_tAcc = thr_copy_t2r.partition_S(tAcc_epi) + + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, loopM, loopN, loopL) + gC_mnl_epi = cute.flat_divide( + gC_mnl[((None, None), 0, 0, None, None, None)], epi_tile + ) + # (T2R, T2R_M, T2R_N, EPI_M, EPI_N, loopM, loopN, loopL) + tTR_gC = thr_copy_t2r.partition_D(gC_mnl_epi) + # (T2R, T2R_M, T2R_N) + tTR_rAcc = cute.make_fragment( + tTR_gC[(None, None, None, 0, 0, 0, 0, 0)].shape, self.acc_dtype + ) + return tiled_copy_t2r, tTR_tAcc, tTR_rAcc + + def epilog_smem_copy_and_partition( + self, + tiled_copy_t2r: cute.TiledCopy, + tTR_rC: cute.Tensor, + tidx: 
cutlass.Int32, + sC: cute.Tensor, + ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]: + """ + Make tiledCopy for shared memory store, then use it to partition register array (source) and shared memory (destination). + + :param tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r) + :type tiled_copy_t2r: cute.TiledCopy + :param tTR_rC: The partitioned accumulator tensor + :type tTR_rC: cute.Tensor + :param tidx: The thread index in epilogue warp groups + :type tidx: cutlass.Int32 + :param sC: The shared memory tensor to be copied and partitioned + :type sC: cute.Tensor + + :return: A tuple containing (tiled_copy_r2s, tRS_rC, tRS_sC) where: + - tiled_copy_r2s: The tiled copy operation for register to smem copy(r2s) + - tRS_rC: The partitioned tensor C (register source) + - tRS_sC: The partitioned tensor C (smem destination) + :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor] + """ + copy_atom_r2s = sm100_utils.get_smem_store_op( + self.c_layout, self.c_dtype, self.acc_dtype, tiled_copy_t2r + ) + tiled_copy_r2s = cute.make_tiled_copy( + copy_atom_r2s, + layout_tv=tiled_copy_t2r.layout_dst_tv_tiled, + tiler_mn=tiled_copy_t2r.tiler_mn, + ) + # (R2S, R2S_M, R2S_N, PIPE_D) + thr_copy_r2s = tiled_copy_r2s.get_slice(tidx) + tRS_sC = thr_copy_r2s.partition_D(sC) + # (R2S, R2S_M, R2S_N) + tRS_rC = tiled_copy_r2s.retile(tTR_rC) + return tiled_copy_r2s, tRS_rC, tRS_sC + + def epilog_gmem_copy_and_partition( + self, + tidx: cutlass.Int32, + atom: Union[cute.CopyAtom, cute.TiledCopy], + gC_mnl: cute.Tensor, + epi_tile: cute.Tile, + sC: cute.Tensor, + ) -> Tuple[cute.CopyAtom, cute.Tensor, cute.Tensor]: + """Make tiledCopy for global memory store, then use it to: + - partition register array (source) and global memory (destination) for none TMA store version; + - partition shared memory (source) and global memory (destination) for TMA store version. 
+ + :param tidx: The thread index in epilogue warp groups + :type tidx: cutlass.Int32 + :param atom: The copy_atom_c to be used for TMA store version, or tiled_copy_t2r for none TMA store version + :type atom: cute.CopyAtom or cute.TiledCopy + :param gC_mnl: The global tensor C + :type gC_mnl: cute.Tensor + :param epi_tile: The epilogue tiler + :type epi_tile: cute.Tile + :param sC: The shared memory tensor to be copied and partitioned + :type sC: cute.Tensor + + :return: A tuple containing either: + - For TMA store: (tma_atom_c, bSG_sC, bSG_gC) where: + - tma_atom_c: The TMA copy atom + - bSG_sC: The partitioned shared memory tensor C + - bSG_gC: The partitioned global tensor C + - For non-TMA store: (simt_atom, tTR_rC, tTR_gC) where: + - simt_atom: The SIMT copy atom + - tTR_rC: The register tensor C + - tTR_gC: The partitioned global tensor C + :rtype: Tuple[cute.CopyAtom, cute.Tensor, cute.Tensor] + """ + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, loopM, loopN, loopL) + gC_epi = cute.flat_divide( + gC_mnl[((None, None), 0, 0, None, None, None)], epi_tile + ) + if cutlass.const_expr(self.use_tma_store): + tma_atom_c = atom + sC_for_tma_partition = cute.group_modes(sC, 0, 2) + gC_for_tma_partition = cute.group_modes(gC_epi, 0, 2) + # ((ATOM_V, REST_V), EPI_M, EPI_N) + # ((ATOM_V, REST_V), EPI_M, EPI_N, loopM, loopN, loopL) + bSG_sC, bSG_gC = cpasync.tma_partition( + tma_atom_c, + 0, + cute.make_layout(1), + sC_for_tma_partition, + gC_for_tma_partition, + ) + return tma_atom_c, bSG_sC, bSG_gC + else: + tiled_copy_t2r = atom + # (T2R, T2R_M, T2R_N, EPI_M, EPI_N, loopM, loopN, loopL) + thr_copy_t2r = tiled_copy_t2r.get_slice(tidx) + tTR_gC = thr_copy_t2r.partition_D(gC_epi) + # (T2R, T2R_M, T2R_N) + tTR_rC = cute.make_fragment( + tTR_gC[(None, None, None, 0, 0, 0, 0, 0)].shape, self.c_dtype + ) + simt_atom = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), self.c_dtype) + return simt_atom, tTR_rC, tTR_gC + + @staticmethod + def _compute_stages( + tiled_mma: 
cute.TiledMma, + mma_tiler_mnk: Tuple[int, int, int], + a_dtype: Type[cutlass.Numeric], + b_dtype: Type[cutlass.Numeric], + epi_tile: cute.Tile, + c_dtype: Type[cutlass.Numeric], + c_layout: utils.LayoutEnum, + num_smem_capacity: int, + occupancy: int, + use_tma_store: bool, + ) -> Tuple[int, int, int]: + """Computes the number of stages for A/B/C operands based on heuristics. + + :param tiled_mma: The tiled MMA object defining the core computation. + :type tiled_mma: cute.TiledMma + :param mma_tiler_mnk: The shape (M, N, K) of the MMA tile. + :type mma_tiler_mnk: tuple[int, int, int] + :param a_dtype: Data type of operand A. + :type a_dtype: type[cutlass.Numeric] + :param b_dtype: Data type of operand B. + :type b_dtype: type[cutlass.Numeric] + :param epi_tile: The epilogue tile shape. + :type epi_tile: cute.Tile + :param c_dtype: Data type of operand C (output). + :type c_dtype: type[cutlass.Numeric] + :param c_layout: Layout enum of operand C in global memory. + :type c_layout: utils.LayoutEnum + :param num_smem_capacity: Total available shared memory capacity in bytes. + :type num_smem_capacity: int + :param occupancy: Target number of CTAs per SM (occupancy). + :type occupancy: int + :param use_tma_store: Whether TMA store is enabled. 
+ :type use_tma_store: bool + + :return: A tuple containing the computed number of stages for: + (ACC stages, A/B operand stages, epilogue stages) + :rtype: tuple[int, int, int] + """ + # Default ACC stages + num_acc_stage = 1 + # Default C stages + num_c_stage = 2 if use_tma_store else 0 + + # Calculate smem layout and size for one stage of A, B, and C + a_smem_layout_stage_one = sm100_utils.make_smem_layout_a( + tiled_mma, + mma_tiler_mnk, + a_dtype, + 1, # a tmp 1 stage is provided + ) + b_smem_layout_staged_one = sm100_utils.make_smem_layout_b( + tiled_mma, + mma_tiler_mnk, + b_dtype, + 1, # a tmp 1 stage is provided + ) + c_smem_layout_staged_one = ( + sm100_utils.make_smem_layout_epi( + c_dtype, + c_layout, + epi_tile, + 1, + ) + if use_tma_store + else None + ) + ab_bytes_per_stage = cute.size_in_bytes( + a_dtype, a_smem_layout_stage_one + ) + cute.size_in_bytes(b_dtype, b_smem_layout_staged_one) + mbar_helpers_bytes = 1024 + c_bytes_per_stage = ( + cute.size_in_bytes(c_dtype, c_smem_layout_staged_one) + if use_tma_store + else 0 + ) + c_bytes = c_bytes_per_stage * num_c_stage + + # Calculate A/B stages: + # Start with total smem per CTA (capacity / occupancy) + # Subtract reserved bytes and initial C stages bytes + # Divide remaining by bytes needed per A/B stage + num_ab_stage = ( + num_smem_capacity - (occupancy + 1) * (mbar_helpers_bytes + c_bytes) + ) // ab_bytes_per_stage + + # Refine epilogue stages: + # Calculate remaining smem after allocating for A/B stages and reserved bytes + # Add remaining unused smem to epilogue + if use_tma_store: + num_c_stage += ( + num_smem_capacity + - ab_bytes_per_stage * num_ab_stage + - (occupancy + 1) * (mbar_helpers_bytes + c_bytes) + ) // ((occupancy + 1) * c_bytes_per_stage) + return num_acc_stage, num_ab_stage, num_c_stage + + @staticmethod + def _compute_grid( + c: cute.Tensor, + cta_tile_shape_mnk: Tuple[int, int, int], + cluster_shape_mn: Tuple[int, int], + ) -> Tuple[int, int, int]: + """Compute grid shape for 
the output tensor C. + + :param c: The output tensor C + :type c: cute.Tensor + :param cta_tile_shape_mnk: The shape (M, N, K) of the CTA tile. + :type cta_tile_shape_mnk: tuple[int, int, int] + :param cluster_shape_mn: Shape of each cluster in M, N dimensions. + :type cluster_shape_mn: tuple[int, int] + + :return: Grid shape for kernel launch. + :rtype: tuple[int, int, int] + """ + + cluster_shape_mnl = (*cluster_shape_mn, 1) + + grid = cute.round_up( + ( + cute.ceil_div(c.layout.shape[0], cta_tile_shape_mnk[0]), + cute.ceil_div(c.layout.shape[1], cta_tile_shape_mnk[1]), + c.layout.shape[2], + ), + cluster_shape_mnl, + ) + + return grid + + @staticmethod + def _get_tma_atom_kind( + atom_sm_cnt: cutlass.Int32, mcast: cutlass.Boolean + ) -> Union[ + cpasync.CopyBulkTensorTileG2SMulticastOp, cpasync.CopyBulkTensorTileG2SOp + ]: + """ + Select the appropriate TMA copy atom based on the number of SMs and the multicast flag. + + :param atom_sm_cnt: The number of SMs + :type atom_sm_cnt: cutlass.Int32 + :param mcast: The multicast flag + :type mcast: cutlass.Boolean + + :return: The appropriate TMA copy atom kind + :rtype: cpasync.CopyBulkTensorTileG2SMulticastOp or cpasync.CopyBulkTensorTileG2SOp + + :raise ValueError: If the atom_sm_cnt is invalid + """ + if atom_sm_cnt == 2 and mcast: + return cpasync.CopyBulkTensorTileG2SMulticastOp(tcgen05.CtaGroup.TWO) + elif atom_sm_cnt == 2 and not mcast: + return cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.TWO) + elif atom_sm_cnt == 1 and mcast: + return cpasync.CopyBulkTensorTileG2SMulticastOp(tcgen05.CtaGroup.ONE) + elif atom_sm_cnt == 1 and not mcast: + return cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE) + + raise ValueError(f"Invalid atom_sm_cnt: {atom_sm_cnt} and {mcast}") + + @staticmethod + def _compute_num_tmem_alloc_cols( + tiled_mma: cute.TiledMma, mma_tiler: Tuple[int, int, int] + ) -> int: + """ + Compute the number of tensor memory allocation columns. 
+ + :param tiled_mma: The tiled MMA object defining the core computation. + :type tiled_mma: cute.TiledMma + :param mma_tiler: The shape (M, N, K) of the MMA tile. + :type mma_tiler: tuple[int, int, int] + + :return: The number of tensor memory allocation columns. + :rtype: int + """ + acc_shape = tiled_mma.partition_shape_C(mma_tiler[:2]) + tCtAcc_fake = tiled_mma.make_fragment_C(acc_shape) + return sm100_utils.get_num_tmem_alloc_cols(tCtAcc_fake) + + @staticmethod + def is_valid_dtypes( + ab_dtype: Type[cutlass.Numeric], + acc_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + ) -> bool: + """ + Check if the dtypes are valid + + :param ab_dtype: The data type of the A and B operands + :type ab_dtype: Type[cutlass.Numeric] + :param acc_dtype: The data type of the accumulator + :type acc_dtype: Type[cutlass.Numeric] + :param c_dtype: The data type of the output tensor + :type c_dtype: Type[cutlass.Numeric] + + :return: True if the dtypes are valid, False otherwise + :rtype: bool + """ + is_valid = True + if ab_dtype not in { + cutlass.Float16, + cutlass.BFloat16, + cutlass.TFloat32, + cutlass.Uint8, + cutlass.Int8, + cutlass.Float8E4M3FN, + cutlass.Float8E5M2, + }: + is_valid = False + if ( + acc_dtype not in {cutlass.Float32, cutlass.Float16, cutlass.Int32} + or acc_dtype == cutlass.Float16 + and ab_dtype + not in {cutlass.Float16, cutlass.Float8E4M3FN, cutlass.Float8E5M2} + or acc_dtype == cutlass.Int32 + and ab_dtype not in {cutlass.Uint8, cutlass.Int8} + ): + is_valid = False + if ( + acc_dtype == cutlass.Float32 + and c_dtype + not in { + cutlass.Float32, + cutlass.Float16, + cutlass.BFloat16, + cutlass.Float8E4M3FN, + cutlass.Float8E5M2, + cutlass.Int32, + cutlass.Int8, + cutlass.Uint8, + } + or acc_dtype == cutlass.Float16 + and c_dtype + not in { + cutlass.BFloat16, + cutlass.Float16, + } + or acc_dtype == cutlass.Int32 + and c_dtype + not in { + cutlass.BFloat16, + cutlass.Float16, + cutlass.Float32, + cutlass.Int32, + cutlass.Int8, + 
cutlass.Uint8, + } + ): + is_valid = False + return is_valid + + @staticmethod + def is_valid_mma_tiler_and_cluster_shape( + use_2cta_instrs: bool, + mma_tiler_mn: Tuple[int, int], + cluster_shape_mn: Tuple[int, int], + ) -> bool: + """ + Check if the mma tiler and cluster shape are valid + + :param use_2cta_instrs: Whether to use 2 CTA groups + :type use_2cta_instrs: bool + :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler + :type mma_tiler_mn: Tuple[int, int] + :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster + :type cluster_shape_mn: Tuple[int, int] + + :return: True if the mma tiler and cluster shape are valid, False otherwise + :rtype: bool + """ + is_valid = True + # Skip invalid mma tile shape + if not ( + (not use_2cta_instrs and mma_tiler_mn[0] in [64, 128]) + or (use_2cta_instrs and mma_tiler_mn[0] in [128, 256]) + ): + is_valid = False + if mma_tiler_mn[1] not in range(32, 257, 32): + is_valid = False + # Skip illegal cluster shape + if cluster_shape_mn[0] % (2 if use_2cta_instrs else 1) != 0: + is_valid = False + # Skip invalid cluster shape + is_power_of_2 = lambda x: x > 0 and (x & (x - 1)) == 0 + if ( + cluster_shape_mn[0] * cluster_shape_mn[1] > 16 + or cluster_shape_mn[0] <= 0 + or cluster_shape_mn[1] <= 0 + or not is_power_of_2(cluster_shape_mn[0]) + or not is_power_of_2(cluster_shape_mn[1]) + ): + is_valid = False + return is_valid + + @staticmethod + def is_valid_tensor_alignment( + m: int, + n: int, + k: int, + l: int, + ab_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + a_major: str, + b_major: str, + c_major: str, + ) -> bool: + """ + Check if the tensor alignment is valid + + :param m: The number of rows in the A tensor + :type m: int + :param n: The number of columns in the B tensor + :type n: int + :param k: The number of columns in the A tensor + :type k: int + :param l: The batch dimension (number of batched GEMMs) + :type l: int + :param ab_dtype: The data type of the A and B operands
+ :type ab_dtype: Type[cutlass.Numeric] + :param c_dtype: The data type of the output tensor + :type c_dtype: Type[cutlass.Numeric] + :param a_major: The major axis of the A tensor + :type a_major: str + :param b_major: The major axis of the B tensor + :type b_major: str + :param c_major: The major axis of the C tensor + :type c_major: str + + :return: True if the problem shape is valid, False otherwise + :rtype: bool + """ + is_valid = True + + def check_contigous_16B_alignment(dtype, is_mode0_major, tensor_shape): + major_mode_idx = 0 if is_mode0_major else 1 + num_major_elements = tensor_shape[major_mode_idx] + num_contiguous_elements = 16 * 8 // dtype.width + return num_major_elements % num_contiguous_elements == 0 + + if ( + not check_contigous_16B_alignment(ab_dtype, a_major == "m", (m, k, l)) + or not check_contigous_16B_alignment(ab_dtype, b_major == "n", (n, k, l)) + or not check_contigous_16B_alignment(c_dtype, c_major == "m", (m, n, l)) + ): + is_valid = False + return is_valid + + @staticmethod + def is_valid_epilog_store_option( + use_2cta_instrs: bool, + use_tma_store: bool, + m: int, + n: int, + mma_tiler_mn: Tuple[int, int], + ) -> bool: + """ + Check if the epilogue store option is valid + + :param use_2cta_instrs: Whether to use 2 CTA groups + :type use_2cta_instrs: bool + :param use_tma_store: Whether to use TMA store + :type use_tma_store: bool + :param m: The number of rows in the A tensor + :type m: int + :param n: The number of columns in the B tensor + :type n: int + :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler + :type mma_tiler_mn: Tuple[int, int] + + :return: True if the epilogue store option is valid, False otherwise + :rtype: bool + """ + + is_valid = True + # None TMA store version does not have predication, can not support OOB tiles + cta_tile_shape_mn = ( + mma_tiler_mn[0] // (2 if use_2cta_instrs else 1), + mma_tiler_mn[1], + ) + if not use_tma_store: + if not (m % cta_tile_shape_mn[0] == 0 and n % 
cta_tile_shape_mn[1] == 0): + is_valid = False + return is_valid + + @staticmethod + def can_implement( + ab_dtype: Type[cutlass.Numeric], + acc_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + use_2cta_instrs: bool, + mma_tiler_mn: Tuple[int, int], + cluster_shape_mn: Tuple[int, int], + use_tma_store: bool, + m: int, + n: int, + k: int, + l: int, + a_major: str, + b_major: str, + c_major: str, + ) -> bool: + """ + Check if the gemm can be implemented + + :param ab_dtype: The data type of the A and B operands + :type ab_dtype: Type[cutlass.Numeric] + :param acc_dtype: The data type of the accumulator + :type acc_dtype: Type[cutlass.Numeric] + :param c_dtype: The data type of the output tensor + :type c_dtype: Type[cutlass.Numeric] + :param use_2cta_instrs: Whether to use 2 CTA groups + :type use_2cta_instrs: bool + :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler + :type mma_tiler_mn: Tuple[int, int] + :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster + :type cluster_shape_mn: Tuple[int, int] + :param use_tma_store: Whether to use TMA store + :type use_tma_store: bool + :param m: The number of rows in the A tensor + :type m: int + :param n: The number of columns in the B tensor + :type n: int + :param k: The number of columns in the A tensor + :type k: int + :param l: The batch dimension (number of batched GEMMs) + :type l: int + :param a_major: The major axis of the A tensor + :type a_major: str + :param b_major: The major axis of the B tensor + :type b_major: str + :param c_major: The major axis of the C tensor + :type c_major: str + + :return: True if the gemm can be implemented, False otherwise + :rtype: bool + """ + can_implement = True + # Skip unsupported types + if not DenseGemmKernel.is_valid_dtypes(ab_dtype, acc_dtype, c_dtype): + can_implement = False + # Skip invalid mma tile shape and cluster shape + if not DenseGemmKernel.is_valid_mma_tiler_and_cluster_shape( + use_2cta_instrs, mma_tiler_mn,
cluster_shape_mn + ): + can_implement = False + # Skip illegal problem shape for load/store alignment + if not DenseGemmKernel.is_valid_tensor_alignment( + m, n, k, l, ab_dtype, c_dtype, a_major, b_major, c_major + ): + can_implement = False + # Skip invalid epilogue store option + if not DenseGemmKernel.is_valid_epilog_store_option( + use_2cta_instrs, use_tma_store, m, n, mma_tiler_mn + ): + can_implement = False + return can_implement + + +def run_dense_gemm( + mnkl: Tuple[int, int, int, int], + ab_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + acc_dtype: Type[cutlass.Numeric], + a_major: str, + b_major: str, + c_major: str, + mma_tiler_mn: Tuple[int, int], + cluster_shape_mn: Tuple[int, int], + use_2cta_instrs: bool, + use_tma_store: bool, + tolerance: float, + warmup_iterations: int = 0, + iterations: int = 1, + skip_ref_check: bool = False, + measure_launch_overhead=False, +): + """ + Prepare A/B/C tensors, launch GPU kernel, and reference checking. + """ + print(f"Running B100 Dense GEMM test with:") + print(f"mnkl: {mnkl}") + print(f"AB dtype: {ab_dtype}, C dtype: {c_dtype}, Acc dtype: {acc_dtype}") + print(f"Matrix majors - A: {a_major}, B: {b_major}, C: {c_major}") + print(f"Mma Tiler (M, N): {mma_tiler_mn}, Cluster Shape (M, N): {cluster_shape_mn}") + print(f"2CTA MMA instructions: {'True' if use_2cta_instrs else 'False'}") + print(f"Use TMA Store: {'True' if use_tma_store else 'False'}") + print(f"Tolerance: {tolerance}") + print(f"Warmup iterations: {warmup_iterations}") + print(f"Iterations: {iterations}") + print(f"Skip reference checking: {skip_ref_check}") + + # Unpack parameters + m, n, k, l = mnkl + + # Skip unsupported testcase + if not DenseGemmKernel.can_implement( + ab_dtype, + acc_dtype, + c_dtype, + use_2cta_instrs, + mma_tiler_mn, + cluster_shape_mn, + use_tma_store, + m, + n, + k, + l, + a_major, + b_major, + c_major, + ): + raise TypeError( + f"Unsupported testcase {ab_dtype}, {acc_dtype}, {c_dtype}, {use_2cta_instrs}, 
{mma_tiler_mn}, {cluster_shape_mn}, {use_tma_store}, {m}, {n}, {k}, {l}, {a_major}, {b_major}, {c_major}" + ) + + if not torch.cuda.is_available(): + raise RuntimeError("GPU is required to run this example!") + + torch.manual_seed(1111) + + # Create and permute tensor A/B/C + def create_and_permute_tensor( + l, mode0, mode1, is_mode0_major, dtype, is_dynamic_layout=True + ): + # is_mode0_major: (l, mode1, mode0) -> (mode0, mode1, l) + # else: (l, mode0, mode1) -> (mode0, mode1, l) + shape = (l, mode1, mode0) if is_mode0_major else (l, mode0, mode1) + permute_order = (2, 1, 0) if is_mode0_major else (1, 2, 0) + is_unsigned = dtype in {cutlass.Uint8} + # Temporarily use uint8 as torch does not support fp8 type + torch_dtype = ( + cutlass_torch.dtype(dtype) + if dtype not in {cutlass.Float8E5M2, cutlass.Float8E4M3FN} + else torch.uint8 + ) + + # Create dtype torch tensor (cpu) + torch_tensor_cpu = cutlass_torch.create_and_permute_torch_tensor( + shape, + torch_dtype, + permute_order=permute_order, + init_type=cutlass_torch.TensorInitType.RANDOM, + init_config=cutlass_torch.RandomInitConfig( + min_val=0 if is_unsigned else -2, max_val=4 if is_unsigned else 2 + ), + ) + # Create dtype torch tensor (gpu) + torch_tensor = torch_tensor_cpu.cuda() + + # Create f32 torch tensor (cpu) + f32_torch_tensor = torch_tensor_cpu.to(dtype=torch.float32) + + # Create dtype cute tensor (gpu) + cute_tensor = from_dlpack(torch_tensor, assumed_align=16) + cute_tensor.element_type = dtype + if is_dynamic_layout: + cute_tensor = cute_tensor.mark_layout_dynamic( + leading_dim=(0 if is_mode0_major else 1) + ) + cute_tensor = cutlass_torch.convert_cute_tensor( + f32_torch_tensor, + cute_tensor, + dtype, + is_dynamic_layout=is_dynamic_layout, + ) + + return f32_torch_tensor, cute_tensor, torch_tensor + + a_ref, a_tensor, a_torch = create_and_permute_tensor( + l, m, k, a_major == "m", ab_dtype, is_dynamic_layout=True + ) + b_ref, b_tensor, b_torch = create_and_permute_tensor( + l, n, k, b_major 
== "n", ab_dtype, is_dynamic_layout=True + ) + c_ref, c_tensor, c_torch = create_and_permute_tensor( + l, m, n, c_major == "m", c_dtype, is_dynamic_layout=True + ) + + # Configure gemm kernel + gemm = DenseGemmKernel( + acc_dtype, + use_2cta_instrs, + mma_tiler_mn, + cluster_shape_mn, + use_tma_store, + ) + + torch_stream = torch.cuda.Stream() + stream = cuda.CUstream(torch_stream.cuda_stream) + # Compile gemm kernel + compiled_gemm = cute.compile(gemm, a_tensor, b_tensor, c_tensor, stream) + + # Launch GPU kernel + # Warm up + for i in range(warmup_iterations): + compiled_gemm(a_tensor, b_tensor, c_tensor, stream) + # Execution + for i in range(iterations): + compiled_gemm(a_tensor, b_tensor, c_tensor, stream) + + # Compute reference result + if not skip_ref_check: + if ab_dtype in { + cutlass.Int8, + cutlass.Uint8, + cutlass.Float8E4M3FN, + cutlass.Float8E5M2, + }: + ref = torch.einsum("mkl,nkl->mnl", a_ref.cpu(), b_ref.cpu()) + else: + ref = (torch.einsum("mkl,nkl->mnl", a_ref, b_ref)).cpu() + + # Copy gpu result back + gpu_c = c_torch.cpu() + + # Convert ref to c_type + if c_dtype == cutlass.Float32: + ref_c = ref + elif c_dtype in {cutlass.Float8E5M2, cutlass.Float8E4M3FN}: + # m major: (l, n, m) -> (m, n, l) + # k major: (l, m, n) -> (m, n, l) + permute_order = (1, 2, 0) if c_major == "n" else (2, 1, 0) + shape = (l, m, n) if c_major == "n" else (l, n, m) + f8_torch_tensor = cutlass_torch.create_and_permute_torch_tensor( + shape, + torch.uint8, + permute_order=permute_order, + init_type=cutlass_torch.TensorInitType.SKIP, + ).cuda() + # Create dtype cute tensor (gpu) + ref_c_tensor = from_dlpack( + f8_torch_tensor, assumed_align=16 + ).mark_layout_dynamic(leading_dim=(1 if c_major == "n" else 0)) + ref_c_tensor.element_type = c_dtype + ref_c_tensor = cutlass_torch.convert_cute_tensor( + ref, + ref_c_tensor, + c_dtype, + is_dynamic_layout=True, + ) + + ref_c = f8_torch_tensor.cpu() + else: + ref_c = ref.to(cutlass_torch.dtype(c_dtype)) + + # Reference checking 
ref_c and gpu_c + torch.testing.assert_close( + gpu_c, + ref_c, + atol=tolerance, + rtol=1e-05, + ) + + +if __name__ == "__main__": + + def parse_comma_separated_ints(s: str) -> Tuple[int, ...]: + try: + return tuple(int(x.strip()) for x in s.split(",")) + # or: return tuple([int(x.strip()) for x in s.split(",")]) + except ValueError: + raise argparse.ArgumentTypeError( + "Invalid format. Expected comma-separated integers." + ) + + parser = argparse.ArgumentParser( + description="Example of MxNxKxL GEMM on Blackwell." + ) + + parser.add_argument( + "--mnkl", + type=parse_comma_separated_ints, + default=(256, 256, 512, 1), + help="mnkl dimensions (comma-separated)", + ) + parser.add_argument( + "--mma_tiler_mn", + type=parse_comma_separated_ints, + default=(128, 128), + help="Mma tiler (comma-separated)", + ) + parser.add_argument( + "--cluster_shape_mn", + type=parse_comma_separated_ints, + default=(1, 1), + help="Cluster shape (comma-separated)", + ) + parser.add_argument("--ab_dtype", type=cutlass.dtype, default=cutlass.TFloat32) + parser.add_argument("--c_dtype", type=cutlass.dtype, default=cutlass.Float32) + parser.add_argument("--acc_dtype", type=cutlass.dtype, default=cutlass.Float32) + parser.add_argument( + "--use_2cta_instrs", + action="store_true", + help="Enable 2CTA MMA instructions feature", + ) + parser.add_argument("--a_major", choices=["k", "m"], type=str, default="k") + parser.add_argument("--b_major", choices=["k", "n"], type=str, default="k") + parser.add_argument("--c_major", choices=["n", "m"], type=str, default="n") + parser.add_argument( + "--use_tma_store", action="store_true", help="Use tma store or not" + ) + parser.add_argument( + "--tolerance", type=float, default=1e-01, help="Tolerance for validation" + ) + parser.add_argument( + "--warmup_iterations", type=int, default=0, help="Warmup iterations" + ) + parser.add_argument("--iterations", type=int, default=1, help="Iterations") + parser.add_argument( + "--skip_ref_check", 
action="store_true", help="Skip reference checking" + ) + + args = parser.parse_args() + + if len(args.mnkl) != 4: + parser.error("--mnkl must contain exactly 4 values") + + if len(args.mma_tiler_mn) != 2: + parser.error("--mma_tiler_mn must contain exactly 2 values") + + if len(args.cluster_shape_mn) != 2: + parser.error("--cluster_shape_mn must contain exactly 2 values") + + run_dense_gemm( + args.mnkl, + args.ab_dtype, + args.c_dtype, + args.acc_dtype, + args.a_major, + args.b_major, + args.c_major, + args.mma_tiler_mn, + args.cluster_shape_mn, + args.use_2cta_instrs, + args.use_tma_store, + args.tolerance, + args.warmup_iterations, + args.iterations, + args.skip_ref_check, + ) + print("PASS") diff --git a/examples/python/CuTeDSL/blackwell/dense_gemm_persistent.py b/examples/python/CuTeDSL/blackwell/dense_gemm_persistent.py new file mode 100644 index 00000000..abc2597d --- /dev/null +++ b/examples/python/CuTeDSL/blackwell/dense_gemm_persistent.py @@ -0,0 +1,2144 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +from typing import Optional, Type, Tuple, Union + +import cuda.bindings.driver as cuda +import torch + +import cutlass +import cutlass.cute as cute +from cutlass.cute.nvgpu import cpasync, tcgen05 +import cutlass.torch as cutlass_torch +import cutlass.utils as utils +import cutlass.utils.blackwell_helpers as sm100_utils +from cutlass.cute.runtime import from_dlpack + + +""" +A high-performance persistent batched dense GEMM example for the NVIDIA Blackwell SM100 architecture +using CUTE DSL. 
+ +- Matrix A is MxKxL, L is batch dimension, A can be row-major("K") or column-major("M") +- Matrix B is NxKxL, L is batch dimension, B can be row-major("N") or column-major("K") +- Matrix C is MxNxL, L is batch dimension, C can be row-major("N") or column-major("M") + +This GEMM kernel supports the following features: + - Utilizes Tensor Memory Access (TMA) for efficient memory operations + - Utilizes Blackwell's tcgen05.mma for matrix multiply-accumulate (MMA) operations (including 2cta mma instructions) + - Implements TMA multicast with cluster to reduce L2 memory traffic + - Supports persistent tile scheduling to better overlap memory load/store with mma between tiles + - Supports warp specialization to avoid explicit pipelining between mainloop load and mma + +This GEMM works as follows: +1. DMA warp: Load A and B matrices from global memory (GMEM) to shared memory (SMEM) using TMA operations. +2. MMA warp: Perform matrix multiply-accumulate (MMA) operations using tcgen05.mma instruction. +3. EPILOGUE warp: + - Load completed accumulator from tensor memory (TMEM) to registers (RMEM) using tcgen05.ld. + - Type convert C matrix to output type. + - Optionally store C matrix from registers (RMEM) to shared memory (SMEM) to global memory (GMEM) with TMA operations, + or directly store C matrix from registers (RMEM) to global memory (GMEM) without TMA operations. + - Optionally accept an elementwise lambda function epilogue_op to apply to the output tensor: + e.g., relu can set epilogue_op = lambda x: cute.where(x > 0, x, cute.full_like(x, 0)) + +SM100 tcgen05.mma instructions operate as follows: +- Read matrix A from SMEM +- Read matrix B from SMEM +- Write accumulator to TMEM +The accumulator in TMEM must then be loaded to registers before writing back to GMEM. + +Input arguments to this example are the same as those of dense_gemm.py. + +..
code-block:: bash + + python examples/blackwell/dense_gemm_persistent.py \ + --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32 \ + --mma_tiler_mn 256,128 --cluster_shape_mn 2,1 \ + --mnkl 8192,8192,8192,1 \ + --use_tma_store --use_2cta_instrs + +To collect performance with NCU profiler: + +.. code-block:: bash + + ncu python examples/blackwell/dense_gemm_persistent.py \ + --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32 \ + --mma_tiler_mn 256,128 --cluster_shape_mn 2,1 \ + --mnkl 8192,8192,8192,1 \ + --use_tma_store --use_2cta_instrs \ + --warmup_iterations 1 --iterations 10 --skip_ref_check + + +Constraints are same as dense_gemm.py: +* Supported input data types: fp16, bf16, tf32, int8, uint8, fp8 (e4m3fn, e5m2), + see detailed valid dtype combinations in below PersistentDenseGemmKernel class documentation +* A/B tensor must have the same data type +* Mma tiler M must be 64/128 (use_2cta_instrs=False) or 128/256 (use_2cta_instrs=True) +* Mma tiler N must be 32-256, step 32 +* Cluster shape M/N must be positive and power of 2, total cluster size <= 16 +* Cluster shape M must be multiple of 2 if use_2cta_instrs=True +* The contiguous dimension of A/B/C tensors must be at least 16 bytes aligned, + i.e, number of elements is a multiple of 4, 8, and 16 for TFloat32, + Float16/BFloat16, and Int8/Uint8/Float8, respectively. +* OOB tiles are not allowed when TMA store is disabled +""" + + +class PersistentDenseGemmKernel: + """This class implements batched matrix multiplication (C = A x B) with support for various data types + and architectural features specific to Blackwell GPUs with persistent tile scheduling and warp specialization. 
+ + :param acc_dtype: Data type for accumulation during computation + :type acc_dtype: type[cutlass.Numeric] + :param use_2cta_instrs: Whether to use CTA group 2 for advanced thread cooperation + :type use_2cta_instrs: bool + :param mma_tiler_mn: Shape of the Matrix Multiply-Accumulate (MMA) tile (M,N) + :type mma_tiler_mn: Tuple[int, int] + :param cluster_shape_mn: Cluster dimensions (M,N) for parallel processing + :type cluster_shape_mn: Tuple[int, int] + :param use_tma_store: Whether to use Tensor Memory Access (TMA) for storing results + :type use_tma_store: bool + + :note: In current version, A and B tensor must have the same data type + - i.e., Float8E4M3FN for A and Float8E5M2 for B is not supported + + :note: Supported A/B data types: + - TFloat32 + - Float16/BFloat16 + - Int8/Uint8 + - Float8E4M3FN/Float8E5M2 + + :note: Supported accumulator data types: + - Float32 (for all floating point A/B data types) + - Float16 (only for fp16 and fp8 A/B data types) + - Int32 (only for uint8/int8 A/B data types) + + :note: Supported C data types: + - Float32 (for float32 and int32 accumulator data types) + - Int32 (for float32 and int32 accumulator data types) + - Float16/BFloat16 (for fp16 and fp8 accumulator data types) + - Int8/Uint8 (for uint8/int8 accumulator data types) + - Float8E4M3FN/Float8E5M2 (for float32 accumulator data types) + + :note: Constraints: + - MMA tiler M must be 64/128 (use_2cta_instrs=False) or 128/256 (use_2cta_instrs=True) + - MMA tiler N must be 32-256, step 32 + - Cluster shape M must be multiple of 2 if use_2cta_instrs=True + - Cluster shape M/N must be positive and power of 2, total cluster size <= 16 + + Example: + >>> gemm = PersistentDenseGemmKernel( + ... acc_dtype=cutlass.Float32, + ... use_2cta_instrs=True, + ... mma_tiler_mn=(128, 128), + ... cluster_shape_mn=(2, 2) + ... 
) + >>> gemm(a_tensor, b_tensor, c_tensor, max_active_clusters, stream) + """ + + def __init__( + self, + acc_dtype: Type[cutlass.Numeric], + use_2cta_instrs: bool, + mma_tiler_mn: Tuple[int, int], + cluster_shape_mn: Tuple[int, int], + use_tma_store: bool, + ): + """Initializes the configuration for a Blackwell dense GEMM kernel. + + This configuration includes several key aspects: + + 1. MMA Instruction Settings (tcgen05): + - acc_dtype: Data types for MMA accumulator. + - mma_tiler_mn: The (M, N) shape of the MMA instruction tiler. + - use_2cta_instrs: Boolean indicating if the tcgen05 MMA variant + with cta_group=2 should be used. + + 2. Cluster Shape: + - cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster. + + 3. Output C tensor store mode: + - use_tma_store: Boolean indicating whether to use Tensor Memory Access (TMA) for storing results. + + :param acc_dtype: Data type of the accumulator. + :type acc_dtype: type[cutlass.Numeric] + :param mma_tiler_mn: Tuple (M, N) shape of the MMA instruction. + :type mma_tiler_mn: Tuple[int, int] + :param use_2cta_instrs: Boolean, True to use cta_group=2 MMA variant. + :type use_2cta_instrs: bool + :param cluster_shape_mn: Tuple (ClusterM, ClusterN) shape of the cluster. + :type cluster_shape_mn: Tuple[int, int] + :param use_tma_store: Use Tensor Memory Access (TMA) or normal store for output C tensor. 
+ :type use_tma_store: bool + """ + + self.acc_dtype: Type[cutlass.Numeric] = acc_dtype + self.use_2cta_instrs = use_2cta_instrs + self.cluster_shape_mn = cluster_shape_mn + # K dimension is deferred in _setup_attributes + self.mma_tiler = (*mma_tiler_mn, 1) + self.use_tma_store = use_tma_store + + self.cta_group = ( + tcgen05.CtaGroup.TWO if use_2cta_instrs else tcgen05.CtaGroup.ONE + ) + + self.occupancy = 1 + # Set specialized warp ids + self.epilog_warp_id = ( + 0, + 1, + 2, + 3, + ) + self.mma_warp_id = 4 + self.tma_warp_id = 5 + self.threads_per_cta = 32 * len( + (self.mma_warp_id, self.tma_warp_id, *self.epilog_warp_id) + ) + # Set barrier id for cta sync, epilogue sync and tmem ptr sync + self.cta_sync_bar_id = 0 + self.epilog_sync_bar_id = 1 + self.tmem_ptr_sync_bar_id = 2 + self.num_smem_capacity = sm100_utils.SMEM_CAPACITY["sm100"] + + def _setup_attributes(self): + """Set up configurations that are dependent on GEMM inputs + + This method configures various attributes based on the input tensor properties + (data types, leading dimensions) and kernel settings: + - Configuring tiled MMA + - Computing MMA/cluster/tile shapes + - Computing cluster layout + - Computing multicast CTAs for A/B + - Computing epilogue subtile + - Setting up A/B/C stage counts in shared memory + - Computing A/B/C shared memory layout + - Computing tensor memory allocation columns + """ + # Configure tiled mma + tiled_mma = sm100_utils.make_trivial_tiled_mma( + self.a_dtype, + self.a_major_mode, + self.b_major_mode, + self.acc_dtype, + self.cta_group, + self.mma_tiler[:2], + ) + + # Compute mma/cluster/tile shapes + mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2]) + mma_inst_tile_k = 4 + self.mma_tiler = ( + self.mma_tiler[0], + self.mma_tiler[1], + mma_inst_shape_k * mma_inst_tile_k, + ) + self.cta_tile_shape_mnk = ( + self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape), + self.mma_tiler[1], + self.mma_tiler[2], + ) + + # Compute cluster layout + 
self.cluster_layout_vmnk = cute.tiled_divide( + cute.make_layout((*self.cluster_shape_mn, 1)), + (tiled_mma.thr_id.shape,), + ) + + # Compute number of multicast CTAs for A/B + self.num_mcast_ctas_a = cute.size(self.cluster_layout_vmnk.shape[2]) + self.num_mcast_ctas_b = cute.size(self.cluster_layout_vmnk.shape[1]) + self.is_a_mcast = self.num_mcast_ctas_a > 1 + self.is_b_mcast = self.num_mcast_ctas_b > 1 + + # Compute epilogue subtile + if cutlass.const_expr(self.use_tma_store): + self.epi_tile = sm100_utils.compute_epilogue_tile_shape( + self.cta_tile_shape_mnk, + self.use_2cta_instrs, + self.c_layout, + self.c_dtype, + ) + else: + self.epi_tile = self.cta_tile_shape_mnk[:2] + + # Setup A/B/C stage count in shared memory and ACC stage count in tensor memory + self.num_acc_stage, self.num_ab_stage, self.num_c_stage = self._compute_stages( + tiled_mma, + self.mma_tiler, + self.a_dtype, + self.b_dtype, + self.epi_tile, + self.c_dtype, + self.c_layout, + self.num_smem_capacity, + self.occupancy, + self.use_tma_store, + ) + + # Compute A/B/C shared memory layout + self.a_smem_layout_staged = sm100_utils.make_smem_layout_a( + tiled_mma, + self.mma_tiler, + self.a_dtype, + self.num_ab_stage, + ) + self.b_smem_layout_staged = sm100_utils.make_smem_layout_b( + tiled_mma, + self.mma_tiler, + self.b_dtype, + self.num_ab_stage, + ) + self.c_smem_layout_staged = ( + sm100_utils.make_smem_layout_epi( + self.c_dtype, + self.c_layout, + self.epi_tile, + self.num_c_stage, + ) + if cutlass.const_expr(self.use_tma_store) + else None + ) + + # Compute the number of tensor memory allocation columns + self.num_tmem_alloc_cols = self._compute_num_tmem_alloc_cols( + tiled_mma, self.mma_tiler, self.num_acc_stage + ) + + @cute.jit + def __call__( + self, + a: cute.Tensor, + b: cute.Tensor, + c: cute.Tensor, + max_active_clusters: cutlass.Constexpr, + stream: cuda.CUstream, + epilogue_op: cutlass.Constexpr = lambda x: x, + ): + """Execute the GEMM operation in steps: + - Setup static 
attributes before smem/grid/tma computation + - Setup TMA load/store atoms and tensors + - Compute grid size with regard to hardware constraints + - Define shared storage for kernel + - Launch the kernel synchronously + + :param a: Input tensor A + :type a: cute.Tensor + :param b: Input tensor B + :type b: cute.Tensor + :param c: Output tensor C + :type c: cute.Tensor + :param max_active_clusters: Maximum number of active clusters + :type max_active_clusters: cutlass.Constexpr + :param stream: CUDA stream for asynchronous execution + :type stream: cuda.CUstream + :param epilogue_op: Optional elementwise lambda function to apply to the output tensor + :type epilogue_op: cutlass.Constexpr + :raises TypeError: If input data types are incompatible with the MMA instruction. + :raises AssertionError: If OOB (Out-Of-Bounds) tiles are present when TMA store is disabled. + """ + # Setup static attributes before smem/grid/tma computation + self.a_dtype: Type[cutlass.Numeric] = a.element_type + self.b_dtype: Type[cutlass.Numeric] = b.element_type + self.c_dtype: Type[cutlass.Numeric] = c.element_type + self.a_major_mode = utils.LayoutEnum.from_tensor(a).mma_major_mode() + self.b_major_mode = utils.LayoutEnum.from_tensor(b).mma_major_mode() + self.c_layout = utils.LayoutEnum.from_tensor(c) + + # Check if input data types are compatible with MMA instruction + if cutlass.const_expr(self.a_dtype != self.b_dtype): + raise TypeError(f"Type must match: {self.a_dtype} != {self.b_dtype}") + + # Setup attributes that depend on gemm inputs + self._setup_attributes() + + tiled_mma = sm100_utils.make_trivial_tiled_mma( + self.a_dtype, + self.a_major_mode, + self.b_major_mode, + self.acc_dtype, + self.cta_group, + self.mma_tiler[:2], + ) + atom_thr_size = cute.size(tiled_mma.thr_id.shape) + + # Setup TMA load for A + a_op = self._get_tma_atom_kind(atom_thr_size, self.is_a_mcast) + a_smem_layout = cute.slice_(self.a_smem_layout_staged, (None, None, None, 0)) + tma_atom_a, tma_tensor_a =
cute.nvgpu.make_tma_tile_atom_A( + a_op, + a, + a_smem_layout, + self.mma_tiler, + tiled_mma, + self.cluster_layout_vmnk.shape, + internal_type=( + cutlass.TFloat32 if a.element_type is cutlass.Float32 else None + ), + ) + + # Setup TMA load for B + b_op = self._get_tma_atom_kind(atom_thr_size, self.is_b_mcast) + b_smem_layout = cute.slice_(self.b_smem_layout_staged, (None, None, None, 0)) + tma_atom_b, tma_tensor_b = cute.nvgpu.make_tma_tile_atom_B( + b_op, + b, + b_smem_layout, + self.mma_tiler, + tiled_mma, + self.cluster_layout_vmnk.shape, + internal_type=( + cutlass.TFloat32 if b.element_type is cutlass.Float32 else None + ), + ) + + a_copy_size = cute.size_in_bytes(self.a_dtype, a_smem_layout) + b_copy_size = cute.size_in_bytes(self.b_dtype, b_smem_layout) + self.num_tma_load_bytes = (a_copy_size + b_copy_size) * atom_thr_size + + # Setup TMA store for C + tma_atom_c = None + tma_tensor_c = None + if cutlass.const_expr(self.use_tma_store): + c_cta_v_layout = cute.composition( + cute.make_identity_layout(c.shape), self.epi_tile + ) + epi_smem_layout = cute.slice_(self.c_smem_layout_staged, (None, None, 0)) + tma_atom_c, tma_tensor_c = cpasync.make_tma_tile_atom( + cpasync.CopyBulkTensorTileS2GOp(), + c, + epi_smem_layout, + c_cta_v_layout, + ) + + # Compute grid size + self.tile_sched_params, grid = self._compute_grid( + c, self.cta_tile_shape_mnk, self.cluster_shape_mn, max_active_clusters + ) + + self.buffer_align_bytes = 1024 + + c_smem_size = ( + cute.cosize(self.c_smem_layout_staged.outer) + if cutlass.const_expr(self.use_tma_store) + else 0 + ) + + # Define shared storage for kernel + @cute.struct + class SharedStorage: + ab_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage] + ab_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage] + acc_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage] + acc_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage] + tmem_dealloc_mbar_ptr: 
cutlass.Int64 + tmem_holding_buf: cutlass.Int32 + # (EPI_TILE_M, EPI_TILE_N, STAGE) + sC: cute.struct.Align[ + cute.struct.MemRange[ + self.c_dtype, + c_smem_size, + ], + self.buffer_align_bytes, + ] + # (MMA, MMA_M, MMA_K, STAGE) + sA: cute.struct.Align[ + cute.struct.MemRange[ + self.a_dtype, cute.cosize(self.a_smem_layout_staged.outer) + ], + self.buffer_align_bytes, + ] + # (MMA, MMA_N, MMA_K, STAGE) + sB: cute.struct.Align[ + cute.struct.MemRange[ + self.b_dtype, cute.cosize(self.b_smem_layout_staged.outer) + ], + self.buffer_align_bytes, + ] + + self.shared_storage = SharedStorage + + # Launch the kernel synchronously + self.kernel( + tiled_mma, + tma_atom_a, + tma_tensor_a, + tma_atom_b, + tma_tensor_b, + tma_atom_c, + tma_tensor_c if cutlass.const_expr(self.use_tma_store) else c, + self.cluster_layout_vmnk, + self.a_smem_layout_staged, + self.b_smem_layout_staged, + self.c_smem_layout_staged, + self.epi_tile, + self.tile_sched_params, + epilogue_op, + ).launch( + grid=grid, + block=[self.threads_per_cta, 1, 1], + cluster=(*self.cluster_shape_mn, 1), + smem=self.shared_storage.size_in_bytes(), + stream=stream, + ) + return + + # GPU device kernel + @cute.kernel + def kernel( + self, + tiled_mma: cute.TiledMma, + tma_atom_a: cute.CopyAtom, + mA_mkl: cute.Tensor, + tma_atom_b: cute.CopyAtom, + mB_nkl: cute.Tensor, + tma_atom_c: Optional[cute.CopyAtom], + mC_mnl: cute.Tensor, + cluster_layout_vmnk: cute.Layout, + a_smem_layout_staged: cute.ComposedLayout, + b_smem_layout_staged: cute.ComposedLayout, + c_smem_layout_staged: Union[cute.Layout, cute.ComposedLayout, None], + epi_tile: cute.Tile, + tile_sched_params: utils.PersistentTileSchedulerParams, + epilogue_op: cutlass.Constexpr, + ): + """ + GPU device kernel performing the Persistent batched GEMM computation. 
+ """ + warp_idx = cute.arch.warp_idx() + warp_idx = cute.arch.make_warp_uniform(warp_idx) + + # + # Prefetch tma desc + # + if warp_idx == self.tma_warp_id: + cpasync.prefetch_descriptor(tma_atom_a) + cpasync.prefetch_descriptor(tma_atom_b) + if cutlass.const_expr(self.use_tma_store): + cpasync.prefetch_descriptor(tma_atom_c) + + use_2cta_instrs = cute.size(tiled_mma.thr_id.shape) == 2 + + # + # Setup cta/thread coordinates + # + # Coords inside cluster + bidx, bidy, bidz = cute.arch.block_idx() + mma_tile_coord_v = bidx % cute.size(tiled_mma.thr_id.shape) + is_leader_cta = mma_tile_coord_v == 0 + cta_rank_in_cluster = cute.arch.make_warp_uniform( + cute.arch.block_idx_in_cluster() + ) + block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord( + cta_rank_in_cluster + ) + # Coord inside cta + tidx, _, _ = cute.arch.thread_idx() + + # + # Alloc and init: a+b full/empty, accumulator full/empty, tensor memory dealloc barrier + # + smem = utils.SmemAllocator() + storage = smem.allocate(self.shared_storage) + + tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr + tmem_holding_buf = storage.tmem_holding_buf + + # Initialize mainloop ab_pipeline (barrier) and states + ab_pipeline_producer_group = utils.CooperativeGroup(utils.Agent.Thread) + num_tma_producer = self.num_mcast_ctas_a + self.num_mcast_ctas_b - 1 + ab_pipeline_consumer_group = utils.CooperativeGroup( + utils.Agent.Thread, num_tma_producer + ) + ab_pipeline = utils.PipelineTmaUmma.create( + barrier_storage=storage.ab_full_mbar_ptr.data_ptr(), + num_stages=self.num_ab_stage, + producer_group=ab_pipeline_producer_group, + consumer_group=ab_pipeline_consumer_group, + tx_count=self.num_tma_load_bytes, + cta_layout_vmnk=cluster_layout_vmnk, + ) + + # Initialize acc_pipeline (barrier) and states + acc_pipeline_producer_group = utils.CooperativeGroup(utils.Agent.Thread) + num_acc_consumer_threads = len(self.epilog_warp_id) * ( + 2 if use_2cta_instrs else 1 + ) + acc_pipeline_consumer_group = 
utils.CooperativeGroup( + utils.Agent.Thread, num_acc_consumer_threads + ) + acc_pipeline = utils.PipelineUmmaAsync.create( + barrier_storage=storage.acc_full_mbar_ptr.data_ptr(), + num_stages=self.num_acc_stage, + producer_group=acc_pipeline_producer_group, + consumer_group=acc_pipeline_consumer_group, + cta_layout_vmnk=cluster_layout_vmnk, + ) + + # Tensor memory dealloc barrier init + if use_2cta_instrs: + if warp_idx == self.tma_warp_id: + num_tmem_dealloc_threads = 32 + with cute.arch.elect_one(): + cute.arch.mbarrier_init_arrive_cnt( + tmem_dealloc_mbar_ptr, num_tmem_dealloc_threads + ) + cute.arch.mbarrier_init_fence() + + # Cluster arrive after barrier init + if cute.size(self.cluster_shape_mn) > 1: + cute.arch.cluster_arrive_relaxed() + + # + # Setup smem tensor A/B/C + # + # (EPI_TILE_M, EPI_TILE_N, STAGE) + sC = ( + storage.sC.get_tensor( + c_smem_layout_staged.outer, swizzle=c_smem_layout_staged.inner + ) + if cutlass.const_expr(self.use_tma_store) + else None + ) + # (MMA, MMA_M, MMA_K, STAGE) + sA = storage.sA.get_tensor( + a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner + ) + # (MMA, MMA_N, MMA_K, STAGE) + sB = storage.sB.get_tensor( + b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner + ) + + # + # Compute multicast mask for A/B buffer full + # + a_full_mcast_mask = None + b_full_mcast_mask = None + if self.is_a_mcast or self.is_b_mcast or use_2cta_instrs: + a_full_mcast_mask = cpasync.create_tma_multicast_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk, mcast_mode=2 + ) + b_full_mcast_mask = cpasync.create_tma_multicast_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk, mcast_mode=1 + ) + + # + # Local_tile partition global tensors + # + # (bM, bK, loopM, loopK, loopL) + gA_mkl = cute.local_tile( + mA_mkl, cute.slice_(self.mma_tiler, (None, 0, None)), (None, None, None) + ) + # (bN, bK, loopN, loopK, loopL) + gB_nkl = cute.local_tile( + mB_nkl, cute.slice_(self.mma_tiler, (0, None, None)), (None, None, 
None) + ) + # (bM, bN, loopM, loopN, loopL) + gC_mnl = cute.local_tile( + mC_mnl, cute.slice_(self.mma_tiler, (None, None, 0)), (None, None, None) + ) + k_block_cnt = cute.size(gA_mkl, mode=[3]) + + # + # Partition global tensor for TiledMMA_A/B/C + # + thr_mma = tiled_mma.get_slice(mma_tile_coord_v) + # (MMA, MMA_M, MMA_K, loopM, loopK, loopL) + tCgA = thr_mma.partition_A(gA_mkl) + # (MMA, MMA_N, MMA_K, loopN, loopK, loopL) + tCgB = thr_mma.partition_B(gB_nkl) + # (MMA, MMA_M, MMA_N, loopM, loopN, loopL) + tCgC = thr_mma.partition_C(gC_mnl) + + # + # Partition global/shared tensor for TMA load A/B + # + # TMA load A partition_S/D + a_cta_layout = cute.make_layout( + cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape + ) + # ((atom_v, rest_v), STAGE) + # ((atom_v, rest_v), loopM, loopK, loopL) + tAsA, tAgA = cpasync.tma_partition( + tma_atom_a, + block_in_cluster_coord_vmnk[2], + a_cta_layout, + cute.group_modes(sA, 0, 3), + cute.group_modes(tCgA, 0, 3), + ) + # TMA load B partition_S/D + b_cta_layout = cute.make_layout( + cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape + ) + # ((atom_v, rest_v), STAGE) + # ((atom_v, rest_v), loopM, loopK, loopL) + tBsB, tBgB = cpasync.tma_partition( + tma_atom_b, + block_in_cluster_coord_vmnk[1], + b_cta_layout, + cute.group_modes(sB, 0, 3), + cute.group_modes(tCgB, 0, 3), + ) + + # + # Partition shared/tensor memory tensor for TiledMMA_A/B/C + # + # (MMA, MMA_M, MMA_K, STAGE) + tCrA = tiled_mma.make_fragment_A(sA) + # (MMA, MMA_N, MMA_K, STAGE) + tCrB = tiled_mma.make_fragment_B(sB) + # (MMA, MMA_M, MMA_N) + acc_shape = tiled_mma.partition_shape_C(self.mma_tiler[:2]) + # (MMA, MMA_M, MMA_N, STAGE) + tCtAcc_fake = tiled_mma.make_fragment_C( + cute.append(acc_shape, self.num_acc_stage) + ) + + # + # Cluster wait before tensor memory alloc + # + if cute.size(self.cluster_shape_mn) > 1: + cute.arch.cluster_wait() + else: + cute.arch.barrier( + barrier_id=self.cta_sync_bar_id, number_of_threads=self.threads_per_cta + ) + 
+ # + # Specialized TMA load warp + # + + if warp_idx == self.tma_warp_id: + # + # Persistent tile scheduling loop + # + tile_sched = utils.StaticPersistentTileScheduler.create( + tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim() + ) + work_tile = tile_sched.initial_work_tile_info() + + ab_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.num_ab_stage + ) + + while work_tile.is_valid_tile: + + # Get tile coord from tile scheduler + cur_tile_coord = work_tile.tile_idx + mma_tile_coord_mnl = ( + cur_tile_coord[0] // cute.size(tiled_mma.thr_id.shape), + cur_tile_coord[1], + cur_tile_coord[2], + ) + + # + # Slice to per mma tile index + # + # ((atom_v, rest_v), loopK) + tAgA_slice = tAgA[ + (None, mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2]) + ] + # ((atom_v, rest_v), loopK) + tBgB_slice = tBgB[ + (None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2]) + ] + + # Peek (try_wait) AB buffer empty for k_block = prefetch_k_block_cnt + ab_producer_state.reset_count() + peek_ab_empty_status = cutlass.Boolean(1) + if ab_producer_state.count < k_block_cnt: + peek_ab_empty_status = ab_pipeline.producer_try_acquire( + ab_producer_state + ) + # + # Tma load loop + # + for k_block in cutlass.range_dynamic(0, k_block_cnt, 1, unroll=1): + # Conditionally wait for AB buffer empty + ab_pipeline.producer_acquire( + ab_producer_state, peek_ab_empty_status + ) + + # TMA load A/B + cute.copy( + tma_atom_a, + tAgA_slice[(None, ab_producer_state.count)], + tAsA[(None, ab_producer_state.index)], + tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state), + mcast_mask=a_full_mcast_mask, + ) + cute.copy( + tma_atom_b, + tBgB_slice[(None, ab_producer_state.count)], + tBsB[(None, ab_producer_state.index)], + tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state), + mcast_mask=b_full_mcast_mask, + ) + + # Peek (try_wait) AB buffer empty for k_block = prefetch_k_block_cnt + k_block + 1 + ab_producer_state.advance() + 
peek_ab_empty_status = cutlass.Boolean(1) + if ab_producer_state.count < k_block_cnt: + peek_ab_empty_status = ab_pipeline.producer_try_acquire( + ab_producer_state + ) + + # + # Advance to next tile + # + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + + # + # Wait A/B buffer empty + # + ab_pipeline.producer_tail(ab_producer_state) + + # + # Specialized MMA warp + # + if warp_idx == self.mma_warp_id: + # + # Bar sync for retrieve tensor memory ptr from shared mem + # + tmem_ptr_read_threads = 32 * len((self.mma_warp_id, *self.epilog_warp_id)) + cute.arch.barrier( + barrier_id=self.tmem_ptr_sync_bar_id, + number_of_threads=tmem_ptr_read_threads, + ) + + # + # Retrieving tensor memory ptr and make accumulator tensor + # + tmem_ptr = cute.arch.retrieve_tmem_ptr( + self.acc_dtype, + alignment=16, + ptr_to_buffer_holding_addr=tmem_holding_buf, + ) + # (MMA, MMA_M, MMA_N, STAGE) + tCtAcc_base = cute.make_tensor(tmem_ptr, tCtAcc_fake.layout) + + # + # Persistent tile scheduling loop + # + tile_sched = utils.StaticPersistentTileScheduler.create( + tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim() + ) + work_tile = tile_sched.initial_work_tile_info() + + ab_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.num_ab_stage + ) + acc_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.num_acc_stage + ) + + while work_tile.is_valid_tile: + + # Get tile coord from tile scheduler + cur_tile_coord = work_tile.tile_idx + mma_tile_coord_mnl = ( + cur_tile_coord[0] // cute.size(tiled_mma.thr_id.shape), + cur_tile_coord[1], + cur_tile_coord[2], + ) + + # Set tensor memory buffer for current tile + # (MMA, MMA_M, MMA_N) + tCtAcc = tCtAcc_base[(None, None, None, acc_producer_state.index)] + + # Peek (try_wait) AB buffer full for k_block = 0 + ab_consumer_state.reset_count() + peek_ab_full_status = cutlass.Boolean(1) + if ab_consumer_state.count < k_block_cnt and is_leader_cta: + 
peek_ab_full_status = ab_pipeline.consumer_try_wait( + ab_consumer_state + ) + + # + # Wait for accumulator buffer empty + # + if is_leader_cta: + acc_pipeline.producer_acquire(acc_producer_state) + + # + # Reset the ACCUMULATE field for each tile + # + tiled_mma.set(tcgen05.Field.ACCUMULATE, False) + + # + # Mma mainloop + # + for k_block in cutlass.range_dynamic(0, k_block_cnt, 1, unroll=1): + if is_leader_cta: + # Conditionally wait for AB buffer full + ab_pipeline.consumer_wait( + ab_consumer_state, peek_ab_full_status + ) + + # tCtAcc += tCrA * tCrB + num_kphases = cute.size(tCrA, mode=[2]) + for kphase_idx in range(num_kphases): + kphase_coord = ( + None, + None, + kphase_idx, + ab_consumer_state.index, + ) + + cute.gemm( + tiled_mma, + tCtAcc, + tCrA[kphase_coord], + tCrB[kphase_coord], + tCtAcc, + ) + # Enable accumulate on tCtAcc after first kphase + tiled_mma.set(tcgen05.Field.ACCUMULATE, True) + + # Async arrive AB buffer empty + ab_pipeline.consumer_release(ab_consumer_state) + + # Peek (try_wait) AB buffer full for k_block = k_block + 1 + ab_consumer_state.advance() + peek_ab_full_status = cutlass.Boolean(1) + if ab_consumer_state.count < k_block_cnt: + if is_leader_cta: + peek_ab_full_status = ab_pipeline.consumer_try_wait( + ab_consumer_state + ) + + # + # Async arrive accumulator buffer full + # + if is_leader_cta: + acc_pipeline.producer_commit(acc_producer_state) + acc_producer_state.advance() + + # + # Advance to next tile + # + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + + # + # Wait for accumulator buffer empty + # + acc_pipeline.producer_tail(acc_producer_state) + # + # Specialized epilogue warps + # + if warp_idx < self.mma_warp_id: + # + # Alloc tensor memory buffer + # + if warp_idx == self.epilog_warp_id[0]: + cute.arch.alloc_tmem( + self.num_tmem_alloc_cols, + tmem_holding_buf, + is_two_cta=use_2cta_instrs, + ) + + # + # Bar sync for retrieve tensor memory ptr from shared memory + # + 
tmem_ptr_read_threads = 32 * len((self.mma_warp_id, *self.epilog_warp_id)) + cute.arch.barrier( + barrier_id=self.tmem_ptr_sync_bar_id, + number_of_threads=tmem_ptr_read_threads, + ) + + # + # Retrieving tensor memory ptr and make accumulator tensor + # + tmem_ptr = cute.arch.retrieve_tmem_ptr( + self.acc_dtype, + alignment=16, + ptr_to_buffer_holding_addr=tmem_holding_buf, + ) + # (MMA, MMA_M, MMA_N, STAGE) + tCtAcc_base = cute.make_tensor(tmem_ptr, tCtAcc_fake.layout) + + # + # Partition for epilogue + # + epi_tidx = tidx + tiled_copy_t2r, tTR_tAcc_base, tTR_rAcc = ( + self.epilog_tmem_copy_and_partition( + epi_tidx, tCtAcc_base, tCgC, epi_tile, use_2cta_instrs + ) + ) + + tTR_rC = None + tiled_copy_r2s = None + simt_atom = None + tRS_rC = None + tRS_sC = None + bSG_sC = None + bSG_gC_partitioned = None + tTR_gC_partitioned = None + if cutlass.const_expr(self.use_tma_store): + tTR_rC = cute.make_fragment(tTR_rAcc.shape, self.c_dtype) + tiled_copy_r2s, tRS_rC, tRS_sC = self.epilog_smem_copy_and_partition( + tiled_copy_t2r, tTR_rC, epi_tidx, sC + ) + tma_atom_c, bSG_sC, bSG_gC_partitioned = ( + self.epilog_gmem_copy_and_partition( + epi_tidx, tma_atom_c, tCgC, epi_tile, sC + ) + ) + else: + simt_atom, tTR_rC, tTR_gC_partitioned = ( + self.epilog_gmem_copy_and_partition( + epi_tidx, tiled_copy_t2r, tCgC, epi_tile, sC + ) + ) + + # + # Persistent tile scheduling loop + # + tile_sched = utils.StaticPersistentTileScheduler.create( + tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim() + ) + work_tile = tile_sched.initial_work_tile_info() + + acc_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.num_acc_stage + ) + + c_pipeline = None + if cutlass.const_expr(self.use_tma_store): + # Threads/warps participating in tma store pipeline + c_producer_group = utils.CooperativeGroup( + utils.Agent.Thread, + 32 * len(self.epilog_warp_id), + 32 * len(self.epilog_warp_id), + ) + c_pipeline = utils.PipelineTmaStore.create( + 
num_stages=self.num_c_stage, + producer_group=c_producer_group, + ) + + while work_tile.is_valid_tile: + + # Get tile coord from tile scheduler + cur_tile_coord = work_tile.tile_idx + mma_tile_coord_mnl = ( + cur_tile_coord[0] // cute.size(tiled_mma.thr_id.shape), + cur_tile_coord[1], + cur_tile_coord[2], + ) + + # + # Slice to per mma tile index + # + bSG_gC = None + tTR_gC = None + if cutlass.const_expr(self.use_tma_store): + # ((ATOM_V, REST_V), EPI_M, EPI_N) + bSG_gC = bSG_gC_partitioned[ + ( + None, + None, + None, + *mma_tile_coord_mnl, + ) + ] + else: + # (T2R, T2R_M, T2R_N, EPI_M, EPI_N) + tTR_gC = tTR_gC_partitioned[ + ( + None, + None, + None, + None, + None, + *mma_tile_coord_mnl, + ) + ] + + # Set tensor memory buffer for current tile + # (T2R, T2R_M, T2R_N, EPI_M, EPI_M) + tTR_tAcc = tTR_tAcc_base[ + (None, None, None, None, None, acc_consumer_state.index) + ] + + # + # Wait for accumulator buffer full + # + acc_pipeline.consumer_wait(acc_consumer_state) + + tTR_tAcc = cute.group_modes(tTR_tAcc, 3, cute.rank(tTR_tAcc)) + if cutlass.const_expr(self.use_tma_store): + bSG_gC = cute.group_modes(bSG_gC, 1, cute.rank(bSG_gC)) + else: + tTR_gC = cute.group_modes(tTR_gC, 3, cute.rank(tTR_gC)) + + # + # Store accumulator to global memory in subtiles + # + subtile_cnt = cute.size(tTR_tAcc.shape, mode=[3]) + num_prev_subtiles = tile_sched.num_tiles_executed * subtile_cnt + for subtile_idx in cutlass.range_dynamic(subtile_cnt): + # + # Load accumulator from tensor memory buffer to register + # + tTR_tAcc_mn = tTR_tAcc[(None, None, None, subtile_idx)] + cute.copy(tiled_copy_t2r, tTR_tAcc_mn, tTR_rAcc) + + if cutlass.const_expr(self.use_tma_store): + # + # Convert to C type + # + acc_vec = tiled_copy_r2s.retile(tTR_rAcc).load() + acc_vec = epilogue_op(acc_vec.to(self.c_dtype)) + tRS_rC.store(acc_vec) + + # + # Store C to shared memory + # + c_buffer = (num_prev_subtiles + subtile_idx) % self.num_c_stage + cute.copy( + tiled_copy_r2s, + tRS_rC, + tRS_sC[(None, None, 
None, c_buffer)], + ) + # Fence and barrier to make sure shared memory store is visible to TMA store + cute.arch.fence_proxy( + cute.arch.ProxyKind.async_shared, + space=cute.arch.SharedSpace.shared_cta, + ) + epilog_threads = 32 * len(self.epilog_warp_id) + cute.arch.barrier( + barrier_id=self.epilog_sync_bar_id, + number_of_threads=epilog_threads, + ) + + # + # TMA store C to global memory + # + if warp_idx == self.epilog_warp_id[0]: + cute.copy( + tma_atom_c, + bSG_sC[(None, c_buffer)], + bSG_gC[(None, subtile_idx)], + ) + # Fence and barrier to make sure shared memory store is visible to TMA store + c_pipeline.producer_commit() + c_pipeline.producer_acquire() + cute.arch.barrier( + barrier_id=self.epilog_sync_bar_id, + number_of_threads=epilog_threads, + ) + else: + # + # Convert to C type + # + acc_vec = tTR_rAcc.load() + acc_vec = epilogue_op(acc_vec.to(self.c_dtype)) + tTR_rC.store(acc_vec) + + # + # Store C to global memory + # + cute.copy( + simt_atom, tTR_rC, tTR_gC[(None, None, None, subtile_idx)] + ) + + # + # Async arrive accumulator buffer empty + # + with cute.arch.elect_one(): + acc_pipeline.consumer_release(acc_consumer_state) + acc_consumer_state.advance() + + # + # Advance to next tile + # + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + + # + # Dealloc the tensor memory buffer + # + if warp_idx == self.epilog_warp_id[0]: + cute.arch.relinquish_tmem_alloc_permit(is_two_cta=use_2cta_instrs) + epilog_threads = 32 * len(self.epilog_warp_id) + cute.arch.barrier( + barrier_id=self.epilog_sync_bar_id, number_of_threads=epilog_threads + ) + if warp_idx == self.epilog_warp_id[0]: + if use_2cta_instrs: + cute.arch.mbarrier_arrive( + tmem_dealloc_mbar_ptr, cta_rank_in_cluster ^ 1 + ) + cute.arch.mbarrier_wait(tmem_dealloc_mbar_ptr, 0) + cute.arch.dealloc_tmem( + tmem_ptr, self.num_tmem_alloc_cols, is_two_cta=use_2cta_instrs + ) + # + # Wait for C store complete + # + if cutlass.const_expr(self.use_tma_store): + 
c_pipeline.producer_tail() + + def epilog_tmem_copy_and_partition( + self, + tidx: cutlass.Int32, + tAcc: cute.Tensor, + gC_mnl: cute.Tensor, + epi_tile: cute.Tile, + use_2cta_instrs: Union[cutlass.Boolean, bool], + ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]: + """ + Make tiledCopy for tensor memory load, then use it to partition tensor memory (source) and register array (destination). + + :param tidx: The thread index in epilogue warp groups + :type tidx: cutlass.Int32 + :param tAcc: The accumulator tensor to be copied and partitioned + :type tAcc: cute.Tensor + :param gC_mnl: The global tensor C + :type gC_mnl: cute.Tensor + :param epi_tile: The epilogue tiler + :type epi_tile: cute.Tile + :param use_2cta_instrs: Whether use_2cta_instrs is enabled + :type use_2cta_instrs: bool + + :return: A tuple containing (tiled_copy_t2r, tTR_tAcc, tTR_rAcc) where: + - tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r) + - tTR_tAcc: The partitioned accumulator tensor + - tTR_rAcc: The accumulated tensor in register used to hold t2r results + :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor] + """ + # Make tiledCopy for tensor memory load + copy_atom_t2r = sm100_utils.get_tmem_load_op( + self.cta_tile_shape_mnk, + self.c_layout, + self.c_dtype, + self.acc_dtype, + epi_tile, + use_2cta_instrs, + ) + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, STAGE) + tAcc_epi = cute.flat_divide( + tAcc[((None, None), 0, 0, None)], + epi_tile, + ) + # (EPI_TILE_M, EPI_TILE_N) + tiled_copy_t2r = tcgen05.make_tmem_copy( + copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)] + ) + + thr_copy_t2r = tiled_copy_t2r.get_slice(tidx) + # (T2R, T2R_M, T2R_N, EPI_M, EPI_M, STAGE) + tTR_tAcc = thr_copy_t2r.partition_S(tAcc_epi) + + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, loopM, loopN, loopL) + gC_mnl_epi = cute.flat_divide( + gC_mnl[((None, None), 0, 0, None, None, None)], epi_tile + ) + # (T2R, T2R_M, T2R_N, EPI_M, EPI_N, loopM, loopN, loopL) + tTR_gC = 
thr_copy_t2r.partition_D(gC_mnl_epi) + # (T2R, T2R_M, T2R_N) + tTR_rAcc = cute.make_fragment( + tTR_gC[(None, None, None, 0, 0, 0, 0, 0)].shape, self.acc_dtype + ) + return tiled_copy_t2r, tTR_tAcc, tTR_rAcc + + def epilog_smem_copy_and_partition( + self, + tiled_copy_t2r: cute.TiledCopy, + tTR_rC: cute.Tensor, + tidx: cutlass.Int32, + sC: cute.Tensor, + ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]: + """ + Make tiledCopy for shared memory store, then use it to partition register array (source) and shared memory (destination). + + :param tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r) + :type tiled_copy_t2r: cute.TiledCopy + :param tTR_rC: The partitioned accumulator tensor + :type tTR_rC: cute.Tensor + :param tidx: The thread index in epilogue warp groups + :type tidx: cutlass.Int32 + :param sC: The shared memory tensor to be copied and partitioned + :type sC: cute.Tensor + :type sepi: cute.Tensor + + :return: A tuple containing (tiled_copy_r2s, tRS_rC, tRS_sC) where: + - tiled_copy_r2s: The tiled copy operation for register to smem copy(r2s) + - tRS_rC: The partitioned tensor C (register source) + - tRS_sC: The partitioned tensor C (smem destination) + :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor] + """ + copy_atom_r2s = sm100_utils.get_smem_store_op( + self.c_layout, self.c_dtype, self.acc_dtype, tiled_copy_t2r + ) + tiled_copy_r2s = cute.make_tiled_copy( + copy_atom_r2s, + layout_tv=tiled_copy_t2r.layout_dst_tv_tiled, + tiler_mn=tiled_copy_t2r.tiler_mn, + ) + # (R2S, R2S_M, R2S_N, PIPE_D) + thr_copy_r2s = tiled_copy_r2s.get_slice(tidx) + tRS_sC = thr_copy_r2s.partition_D(sC) + # (R2S, R2S_M, R2S_N) + tRS_rC = tiled_copy_r2s.retile(tTR_rC) + return tiled_copy_r2s, tRS_rC, tRS_sC + + def epilog_gmem_copy_and_partition( + self, + tidx: cutlass.Int32, + atom: Union[cute.CopyAtom, cute.TiledCopy], + gC_mnl: cute.Tensor, + epi_tile: cute.Tile, + sC: cute.Tensor, + ) -> Tuple[cute.CopyAtom, cute.Tensor, cute.Tensor]: + """Make 
tiledCopy for global memory store, then use it to: + - partition register array (source) and global memory (destination) for none TMA store version; + - partition shared memory (source) and global memory (destination) for TMA store version. + + :param tidx: The thread index in epilogue warp groups + :type tidx: cutlass.Int32 + :param atom: The copy_atom_c to be used for TMA store version, or tiled_copy_t2r for none TMA store version + :type atom: cute.CopyAtom or cute.TiledCopy + :param gC_mnl: The global tensor C + :type gC_mnl: cute.Tensor + :param epi_tile: The epilogue tiler + :type epi_tile: cute.Tile + :param sC: The shared memory tensor to be copied and partitioned + :type sC: cute.Tensor + + :return: A tuple containing either: + - For TMA store: (tma_atom_c, bSG_sC, bSG_gC) where: + - tma_atom_c: The TMA copy atom + - bSG_sC: The partitioned shared memory tensor C + - bSG_gC: The partitioned global tensor C + - For non-TMA store: (simt_atom, tTR_rC, tTR_gC) where: + - simt_atom: The SIMT copy atom + - tTR_rC: The register tensor C + - tTR_gC: The partitioned global tensor C + :rtype: Tuple[cute.CopyAtom, cute.Tensor, cute.Tensor] + """ + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, loopM, loopN, loopL) + gC_epi = cute.flat_divide( + gC_mnl[((None, None), 0, 0, None, None, None)], epi_tile + ) + if cutlass.const_expr(self.use_tma_store): + tma_atom_c = atom + sC_for_tma_partition = cute.group_modes(sC, 0, 2) + gC_for_tma_partition = cute.group_modes(gC_epi, 0, 2) + # ((ATOM_V, REST_V), EPI_M, EPI_N) + # ((ATOM_V, REST_V), EPI_M, EPI_N, loopM, loopN, loopL) + bSG_sC, bSG_gC = cpasync.tma_partition( + tma_atom_c, + 0, + cute.make_layout(1), + sC_for_tma_partition, + gC_for_tma_partition, + ) + return tma_atom_c, bSG_sC, bSG_gC + else: + tiled_copy_t2r = atom + # (T2R, T2R_M, T2R_N, EPI_M, EPI_N, loopM, loopN, loopL) + thr_copy_t2r = tiled_copy_t2r.get_slice(tidx) + tTR_gC = thr_copy_t2r.partition_D(gC_epi) + # (T2R, T2R_M, T2R_N) + tTR_rC = cute.make_fragment( + 
tTR_gC[(None, None, None, 0, 0, 0, 0, 0)].shape, self.c_dtype + ) + simt_atom = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), self.c_dtype) + return simt_atom, tTR_rC, tTR_gC + + @staticmethod + def _compute_stages( + tiled_mma: cute.TiledMma, + mma_tiler_mnk: Tuple[int, int, int], + a_dtype: Type[cutlass.Numeric], + b_dtype: Type[cutlass.Numeric], + epi_tile: cute.Tile, + c_dtype: Type[cutlass.Numeric], + c_layout: utils.LayoutEnum, + num_smem_capacity: int, + occupancy: int, + use_tma_store: bool, + ) -> Tuple[int, int, int]: + """Computes the number of stages for A/B/C operands based on heuristics. + + :param tiled_mma: The tiled MMA object defining the core computation. + :type tiled_mma: cute.TiledMma + :param mma_tiler_mnk: The shape (M, N, K) of the MMA tiler. + :type mma_tiler_mnk: tuple[int, int, int] + :param a_dtype: Data type of operand A. + :type a_dtype: type[cutlass.Numeric] + :param b_dtype: Data type of operand B. + :type b_dtype: type[cutlass.Numeric] + :param epi_tile: The epilogue tile shape. + :type epi_tile: cute.Tile + :param c_dtype: Data type of operand C (output). + :type c_dtype: type[cutlass.Numeric] + :param c_layout: Layout enum of operand C. + :type c_layout: utils.LayoutEnum + :param num_smem_capacity: Total available shared memory capacity in bytes. + :type num_smem_capacity: int + :param occupancy: Target number of CTAs per SM (occupancy). + :type occupancy: int + :param use_tma_store: Whether TMA store is enabled. 
+ :type use_tma_store: bool + + :return: A tuple containing the computed number of stages for: + (ACC stages, A/B operand stages, C stages) + :rtype: tuple[int, int, int] + """ + # Default ACC stages + num_acc_stage = 2 + + # Default C stages + num_c_stage = 2 if use_tma_store else 0 + + # Calculate smem layout and size for one stage of A, B, and C + a_smem_layout_stage_one = sm100_utils.make_smem_layout_a( + tiled_mma, + mma_tiler_mnk, + a_dtype, + 1, # a tmp 1 stage is provided + ) + b_smem_layout_staged_one = sm100_utils.make_smem_layout_b( + tiled_mma, + mma_tiler_mnk, + b_dtype, + 1, # a tmp 1 stage is provided + ) + c_smem_layout_staged_one = ( + sm100_utils.make_smem_layout_epi( + c_dtype, + c_layout, + epi_tile, + 1, + ) + if use_tma_store + else None + ) + ab_bytes_per_stage = cute.size_in_bytes( + a_dtype, a_smem_layout_stage_one + ) + cute.size_in_bytes(b_dtype, b_smem_layout_staged_one) + mbar_helpers_bytes = 1024 + c_bytes_per_stage = ( + cute.size_in_bytes(c_dtype, c_smem_layout_staged_one) + if use_tma_store + else 0 + ) + c_bytes = c_bytes_per_stage * num_c_stage + + # Calculate A/B stages: + # Start with total smem per CTA (capacity / occupancy) + # Subtract reserved bytes and initial C stages bytes + # Divide remaining by bytes needed per A/B stage + num_ab_stage = ( + num_smem_capacity // occupancy - (mbar_helpers_bytes + c_bytes) + ) // ab_bytes_per_stage + + # Refine epilogue stages: + # Calculate remaining smem after allocating for A/B stages and reserved bytes + # Add remaining unused smem to epilogue + if use_tma_store: + num_c_stage += ( + num_smem_capacity + - occupancy * ab_bytes_per_stage * num_ab_stage + - occupancy * (mbar_helpers_bytes + c_bytes) + ) // (occupancy * c_bytes_per_stage) + return num_acc_stage, num_ab_stage, num_c_stage + + @staticmethod + def _compute_grid( + c: cute.Tensor, + cta_tile_shape_mnk: Tuple[int, int, int], + cluster_shape_mn: Tuple[int, int], + max_active_clusters: cutlass.Constexpr, + ) -> 
Tuple[utils.PersistentTileSchedulerParams, Tuple[int, int, int]]: + """Use persistent tile scheduler to compute the grid size for the output tensor C. + + :param c: The output tensor C + :type c: cute.Tensor + :param cta_tile_shape_mnk: The shape (M, N, K) of the CTA tile. + :type cta_tile_shape_mnk: tuple[int, int, int] + :param cluster_shape_mn: Shape of each cluster in M, N dimensions. + :type cluster_shape_mn: tuple[int, int] + :param max_active_clusters: Maximum number of active clusters. + :type max_active_clusters: cutlass.Constexpr + + :return: A tuple containing: + - tile_sched_params: Parameters for the persistent tile scheduler. + - grid: Grid shape for kernel launch. + :rtype: Tuple[utils.PersistentTileSchedulerParams, tuple[int, int, int]] + """ + c_shape = cute.slice_(cta_tile_shape_mnk, (None, None, 0)) + gc = cute.zipped_divide(c, tiler=c_shape) + num_ctas_mnl = gc[(0, (None, None, None))].shape + cluster_shape_mnl = (*cluster_shape_mn, 1) + + tile_sched_params = utils.PersistentTileSchedulerParams( + num_ctas_mnl, cluster_shape_mnl + ) + grid = utils.StaticPersistentTileScheduler.get_grid_shape( + tile_sched_params, max_active_clusters + ) + + return tile_sched_params, grid + + @staticmethod + def _get_tma_atom_kind( + atom_sm_cnt: cutlass.Int32, mcast: cutlass.Boolean + ) -> Union[ + cpasync.CopyBulkTensorTileG2SMulticastOp, cpasync.CopyBulkTensorTileG2SOp + ]: + """ + Select the appropriate TMA copy atom based on the number of SMs and the multicast flag. 
+ + :param atom_sm_cnt: The number of SMs + :type atom_sm_cnt: cutlass.Int32 + :param mcast: The multicast flag + :type mcast: cutlass.Boolean + + :return: The appropriate TMA copy atom kind + :rtype: cpasync.CopyBulkTensorTileG2SMulticastOp or cpasync.CopyBulkTensorTileG2SOp + + :raise ValueError: If the atom_sm_cnt is invalid + """ + if atom_sm_cnt == 2 and mcast: + return cpasync.CopyBulkTensorTileG2SMulticastOp(tcgen05.CtaGroup.TWO) + elif atom_sm_cnt == 2 and not mcast: + return cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.TWO) + elif atom_sm_cnt == 1 and mcast: + return cpasync.CopyBulkTensorTileG2SMulticastOp(tcgen05.CtaGroup.ONE) + elif atom_sm_cnt == 1 and not mcast: + return cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE) + + raise ValueError(f"Invalid atom_sm_cnt: {atom_sm_cnt} and {mcast}") + + @staticmethod + def _compute_num_tmem_alloc_cols( + tiled_mma: cute.TiledMma, + mma_tiler: Tuple[int, int, int], + num_acc_stage: int, + ) -> int: + """ + Compute the number of tensor memory allocation columns. + + :param tiled_mma: The tiled MMA object defining the core computation. + :type tiled_mma: cute.TiledMma + :param mma_tiler: The shape (M, N, K) of the MMA tile. + :type mma_tiler: tuple[int, int, int] + :param num_acc_stage: The stage of the accumulator tensor. + :type num_acc_stage: int + + :return: The number of tensor memory allocation columns. 
+ :rtype: int + """ + acc_shape = tiled_mma.partition_shape_C(mma_tiler[:2]) + tCtAcc_fake = tiled_mma.make_fragment_C(cute.append(acc_shape, num_acc_stage)) + num_tmem_alloc_cols = utils.get_num_tmem_alloc_cols(tCtAcc_fake) + + return num_tmem_alloc_cols + + @staticmethod + def is_valid_dtypes( + ab_dtype: Type[cutlass.Numeric], + acc_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + ) -> bool: + """ + Check if the dtypes are valid + + :param ab_dtype: The data type of the A and B operands + :type ab_dtype: Type[cutlass.Numeric] + :param acc_dtype: The data type of the accumulator + :type acc_dtype: Type[cutlass.Numeric] + :param c_dtype: The data type of the output tensor + :type c_dtype: Type[cutlass.Numeric] + + :return: True if the dtypes are valid, False otherwise + :rtype: bool + """ + is_valid = True + if ab_dtype not in { + cutlass.Float16, + cutlass.BFloat16, + cutlass.TFloat32, + cutlass.Uint8, + cutlass.Int8, + cutlass.Float8E4M3FN, + cutlass.Float8E5M2, + }: + is_valid = False + if ( + acc_dtype not in {cutlass.Float32, cutlass.Float16, cutlass.Int32} + or acc_dtype == cutlass.Float16 + and ab_dtype + not in {cutlass.Float16, cutlass.Float8E4M3FN, cutlass.Float8E5M2} + or acc_dtype == cutlass.Int32 + and ab_dtype not in {cutlass.Uint8, cutlass.Int8} + ): + is_valid = False + if ( + acc_dtype == cutlass.Float32 + and c_dtype + not in { + cutlass.Float32, + cutlass.Float16, + cutlass.BFloat16, + cutlass.Float8E4M3FN, + cutlass.Float8E5M2, + cutlass.Int32, + cutlass.Int8, + cutlass.Uint8, + } + or acc_dtype == cutlass.Float16 + and c_dtype + not in { + cutlass.BFloat16, + cutlass.Float16, + } + or acc_dtype == cutlass.Int32 + and c_dtype + not in { + cutlass.BFloat16, + cutlass.Float16, + cutlass.Float32, + cutlass.Int32, + cutlass.Int8, + cutlass.Uint8, + } + ): + is_valid = False + return is_valid + + @staticmethod + def is_valid_mma_tiler_and_cluster_shape( + use_2cta_instrs: bool, + mma_tiler_mn: Tuple[int, int], + cluster_shape_mn: 
Tuple[int, int], + ) -> bool: + """ + Check if the mma tiler and cluster shape are valid + + :param use_2cta_instrs: Whether to use 2 CTA groups + :type use_2cta_instrs: bool + :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler + :type mma_tiler_mn: Tuple[int, int] + :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster + :type cluster_shape_mn: Tuple[int, int] + + :return: True if the mma tiler and cluster shape are valid, False otherwise + :rtype: bool + """ + is_valid = True + # Skip invalid mma tile shape + if not ( + (not use_2cta_instrs and mma_tiler_mn[0] in [64, 128]) + or (use_2cta_instrs and mma_tiler_mn[0] in [128, 256]) + ): + is_valid = False + if mma_tiler_mn[1] not in range(32, 257, 32): + is_valid = False + # Skip illegal cluster shape + if cluster_shape_mn[0] % (2 if use_2cta_instrs else 1) != 0: + is_valid = False + # Skip invalid cluster shape + is_power_of_2 = lambda x: x > 0 and (x & (x - 1)) == 0 + if ( + cluster_shape_mn[0] * cluster_shape_mn[1] > 16 + or cluster_shape_mn[0] <= 0 + or cluster_shape_mn[1] <= 0 + or not is_power_of_2(cluster_shape_mn[0]) + or not is_power_of_2(cluster_shape_mn[1]) + ): + is_valid = False + return is_valid + + @staticmethod + def is_valid_tensor_alignment( + m: int, + n: int, + k: int, + l: int, + ab_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + a_major: str, + b_major: str, + c_major: str, + ) -> bool: + """ + Check if the tensor alignment is valid + + :param m: The number of rows in the A tensor + :type m: int + :param n: The number of columns in the B tensor + :type n: int + :param k: The number of columns in the A tensor + :type k: int + :param l: The number of columns in the C tensor + :type l: int + :param ab_dtype: The data type of the A and B operands + :type ab_dtype: Type[cutlass.Numeric] + :param c_dtype: The data type of the output tensor + :type c_dtype: Type[cutlass.Numeric] + :param a_major: The major axis of the A tensor + :type a_major: 
str + :param b_major: The major axis of the B tensor + :type b_major: str + :param c_major: The major axis of the C tensor + :type c_major: str + + :return: True if the problem shape is valid, False otherwise + :rtype: bool + """ + is_valid = True + + def check_contigous_16B_alignment(dtype, is_mode0_major, tensor_shape): + major_mode_idx = 0 if is_mode0_major else 1 + num_major_elements = tensor_shape[major_mode_idx] + num_contiguous_elements = 16 * 8 // dtype.width + return num_major_elements % num_contiguous_elements == 0 + + if ( + not check_contigous_16B_alignment(ab_dtype, a_major == "m", (m, k, l)) + or not check_contigous_16B_alignment(ab_dtype, b_major == "n", (n, k, l)) + or not check_contigous_16B_alignment(c_dtype, c_major == "m", (m, n, l)) + ): + is_valid = False + return is_valid + + @staticmethod + def is_valid_epilog_store_option( + use_2cta_instrs: bool, + use_tma_store: bool, + m: int, + n: int, + mma_tiler_mn: Tuple[int, int], + ) -> bool: + """ + Check if the epilogue store option is valid + + :param use_2cta_instrs: Whether to use 2 CTA groups + :type use_2cta_instrs: bool + :param use_tma_store: Whether to use TMA store + :type use_tma_store: bool + :param m: The number of rows in the A tensor + :type m: int + :param n: The number of columns in the B tensor + :type n: int + :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler + :type mma_tiler_mn: Tuple[int, int] + + :return: True if the epilogue store option is valid, False otherwise + :rtype: bool + """ + + is_valid = True + # None TMA store version does not have predication, can not support OOB tiles + cta_tile_shape_mn = ( + mma_tiler_mn[0] // (2 if use_2cta_instrs else 1), + mma_tiler_mn[1], + ) + if not use_tma_store: + if not (m % cta_tile_shape_mn[0] == 0 and n % cta_tile_shape_mn[1] == 0): + is_valid = False + return is_valid + + @staticmethod + def can_implement( + ab_dtype: Type[cutlass.Numeric], + acc_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + 
use_2cta_instrs: bool, + mma_tiler_mn: Tuple[int, int], + cluster_shape_mn: Tuple[int, int], + use_tma_store: bool, + m: int, + n: int, + k: int, + l: int, + a_major: str, + b_major: str, + c_major: str, + ) -> bool: + """ + Check if the gemm can be implemented + + :param ab_dtype: The data type of the A and B operands + :type ab_dtype: Type[cutlass.Numeric] + :param acc_dtype: The data type of the accumulator + :type acc_dtype: Type[cutlass.Numeric] + :param c_dtype: The data type of the output tensor + :type c_dtype: Type[cutlass.Numeric] + :param use_2cta_instrs: Whether to use 2 CTA groups + :type use_2cta_instrs: bool + :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler + :type mma_tiler_mn: Tuple[int, int] + :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster + :type cluster_shape_mn: Tuple[int, int] + :param use_tma_store: Whether to use TMA store + :type use_tma_store: bool + :param m: The number of rows in the A tensor + :type m: int + :param n: The number of columns in the B tensor + :type n: int + :param k: The number of columns in the A tensor + :type k: int + :param l: The number of columns in the C tensor + :type l: int + :param a_major: The major axis of the A tensor + :type a_major: str + :param b_major: The major axis of the B tensor + :type b_major: str + :param c_major: The major axis of the C tensor + :type c_major: str + + :return: True if the gemm can be implemented, False otherwise + :rtype: bool + """ + can_implement = True + # Skip unsupported types + if not PersistentDenseGemmKernel.is_valid_dtypes(ab_dtype, acc_dtype, c_dtype): + can_implement = False + # Skip invalid mma tile shape and cluster shape + if not PersistentDenseGemmKernel.is_valid_mma_tiler_and_cluster_shape( + use_2cta_instrs, mma_tiler_mn, cluster_shape_mn + ): + can_implement = False + # Skip illegal problem shape for load/store alignment + if not PersistentDenseGemmKernel.is_valid_tensor_alignment( + m, n, k, l, ab_dtype, c_dtype, 
a_major, b_major, c_major + ): + can_implement = False + # Skip invalid epilogue store option + if not PersistentDenseGemmKernel.is_valid_epilog_store_option( + use_2cta_instrs, use_tma_store, m, n, mma_tiler_mn + ): + can_implement = False + return can_implement + + +def run_dense_gemm( + mnkl: Tuple[int, int, int, int], + ab_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + acc_dtype: Type[cutlass.Numeric], + a_major: str, + b_major: str, + c_major: str, + mma_tiler_mn: Tuple[int, int], + cluster_shape_mn: Tuple[int, int], + use_2cta_instrs: bool, + use_tma_store: bool, + tolerance: float, + warmup_iterations: int = 0, + iterations: int = 1, + skip_ref_check: bool = False, +): + """ + Prepare A/B/C tensors, launch GPU kernel, and reference checking. + """ + print(f"Running Blackwell Persistent Dense GEMM test with:") + print(f"mnkl: {mnkl}") + print(f"AB dtype: {ab_dtype}, C dtype: {c_dtype}, Acc dtype: {acc_dtype}") + print(f"Matrix majors - A: {a_major}, B: {b_major}, C: {c_major}") + print(f"Mma Tiler (M, N): {mma_tiler_mn}, Cluster Shape (M, N): {cluster_shape_mn}") + print(f"2CTA MMA instructions: {'True' if use_2cta_instrs else 'False'}") + print(f"Use TMA Store: {'True' if use_tma_store else 'False'}") + print(f"Tolerance: {tolerance}") + print(f"Warmup iterations: {warmup_iterations}") + print(f"Iterations: {iterations}") + print(f"Skip reference checking: {skip_ref_check}") + + # Unpack parameters + m, n, k, l = mnkl + + # Skip unsupported testcase + if not PersistentDenseGemmKernel.can_implement( + ab_dtype, + acc_dtype, + c_dtype, + use_2cta_instrs, + mma_tiler_mn, + cluster_shape_mn, + use_tma_store, + m, + n, + k, + l, + a_major, + b_major, + c_major, + ): + raise TypeError( + f"Unsupported testcase {ab_dtype}, {acc_dtype}, {c_dtype}, {use_2cta_instrs}, {mma_tiler_mn}, {cluster_shape_mn}, {use_tma_store}, {m}, {n}, {k}, {l}, {a_major}, {b_major}, {c_major}" + ) + + if not torch.cuda.is_available(): + raise RuntimeError("GPU is 
required to run this example!") + + torch.manual_seed(1111) + + # Create and permute tensor A/B/C + def create_and_permute_tensor( + l, mode0, mode1, is_mode0_major, dtype, is_dynamic_layout=True + ): + # is_mode0_major: (l, mode1, mode0) -> (mode0, mode1, l) + # else: (l, mode0, mode1) -> (mode0, mode1, l) + shape = (l, mode1, mode0) if is_mode0_major else (l, mode0, mode1) + permute_order = (2, 1, 0) if is_mode0_major else (1, 2, 0) + is_unsigned = dtype in {cutlass.Uint8} + # Temporarily use uint8 as torch does not support fp8 type + torch_dtype = ( + cutlass_torch.dtype(dtype) + if dtype not in {cutlass.Float8E5M2, cutlass.Float8E4M3FN} + else torch.uint8 + ) + + # Create dtype torch tensor (cpu) + torch_tensor_cpu = cutlass_torch.create_and_permute_torch_tensor( + shape, + torch_dtype, + permute_order=permute_order, + init_type=cutlass_torch.TensorInitType.RANDOM, + init_config=cutlass_torch.RandomInitConfig( + min_val=0 if is_unsigned else -2, max_val=4 if is_unsigned else 2 + ), + ) + # Create dtype torch tensor (gpu) + torch_tensor = torch_tensor_cpu.cuda() + + # Create f32 torch tensor (cpu) + f32_torch_tensor = torch_tensor_cpu.to(dtype=torch.float32) + + # Create dtype cute tensor (gpu) + cute_tensor = from_dlpack(torch_tensor, assumed_align=16) + cute_tensor.element_type = dtype + if is_dynamic_layout: + cute_tensor = cute_tensor.mark_layout_dynamic( + leading_dim=(0 if is_mode0_major else 1) + ) + cute_tensor = cutlass_torch.convert_cute_tensor( + f32_torch_tensor, + cute_tensor, + dtype, + is_dynamic_layout=is_dynamic_layout, + ) + + return f32_torch_tensor, cute_tensor, torch_tensor + + a_ref, a_tensor, a_torch = create_and_permute_tensor( + l, m, k, a_major == "m", ab_dtype, is_dynamic_layout=True + ) + b_ref, b_tensor, b_torch = create_and_permute_tensor( + l, n, k, b_major == "n", ab_dtype, is_dynamic_layout=True + ) + c_ref, c_tensor, c_torch = create_and_permute_tensor( + l, m, n, c_major == "m", c_dtype, is_dynamic_layout=True + ) + + # 
Configure gemm kernel + gemm = PersistentDenseGemmKernel( + acc_dtype, + use_2cta_instrs, + mma_tiler_mn, + cluster_shape_mn, + use_tma_store, + ) + + # Compute max active clusters on current device + hardware_info = cutlass.utils.HardwareInfo() + max_active_clusters = hardware_info.get_max_active_clusters( + cluster_shape_mn[0] * cluster_shape_mn[1] + ) + + # Get current CUDA stream from PyTorch + torch_stream = torch.cuda.current_stream() + # Get the raw stream pointer as a CUstream + current_stream = cuda.CUstream(torch_stream.cuda_stream) + # Compile gemm kernel + compiled_gemm = cute.compile( + gemm, a_tensor, b_tensor, c_tensor, max_active_clusters, current_stream + ) + + # Launch GPU kernel + # Warm up + for i in range(warmup_iterations): + compiled_gemm(a_tensor, b_tensor, c_tensor, current_stream) + # Execution + for i in range(iterations): + compiled_gemm(a_tensor, b_tensor, c_tensor, current_stream) + + # Compute reference result + if not skip_ref_check: + if ab_dtype in { + cutlass.Int8, + cutlass.Uint8, + cutlass.Float8E4M3FN, + cutlass.Float8E5M2, + }: + ref = torch.einsum("mkl,nkl->mnl", a_ref.cpu(), b_ref.cpu()) + else: + ref = (torch.einsum("mkl,nkl->mnl", a_ref, b_ref)).cpu() + + # Copy gpu result back + gpu_c = c_torch.cpu() + + # Convert ref to c_type + if c_dtype == cutlass.Float32: + ref_c = ref + elif c_dtype in {cutlass.Float8E5M2, cutlass.Float8E4M3FN}: + # m major: (l, n, m) -> (m, n, l) + # n major: (l, m, n) -> (m, n, l) + permute_order = (1, 2, 0) if c_major == "n" else (2, 1, 0) + shape = (l, m, n) if c_major == "n" else (l, n, m) + f8_torch_tensor = cutlass_torch.create_and_permute_torch_tensor( + shape, + torch.uint8, + permute_order=permute_order, + init_type=cutlass_torch.TensorInitType.SKIP, + ).cuda() + # Create dtype cute tensor (gpu) + ref_c_tensor = from_dlpack( + f8_torch_tensor, assumed_align=16 + ).mark_layout_dynamic(leading_dim=(1 if c_major == "n" else 0)) + ref_c_tensor.element_type = c_dtype + ref_c_tensor = 
cutlass_torch.convert_cute_tensor( + ref, + ref_c_tensor, + c_dtype, + is_dynamic_layout=True, + ) + + ref_c = f8_torch_tensor.cpu() + else: + ref_c = ref.to(cutlass_torch.dtype(c_dtype)) + + # Reference checking ref_c and gpu_c + torch.testing.assert_close( + gpu_c, + ref_c, + atol=tolerance, + rtol=1e-05, + ) + + +if __name__ == "__main__": + + def parse_comma_separated_ints(s: str) -> Tuple[int, ...]: + try: + return tuple(int(x.strip()) for x in s.split(",")) + except ValueError: + raise argparse.ArgumentTypeError( + "Invalid format. Expected comma-separated integers." + ) + + parser = argparse.ArgumentParser( + description="Example of Dense Persistent GEMM on Blackwell." + ) + + parser.add_argument( + "--mnkl", + type=parse_comma_separated_ints, + default=(256, 256, 512, 1), + help="mnkl dimensions (comma-separated)", + ) + parser.add_argument( + "--mma_tiler_mn", + type=parse_comma_separated_ints, + default=(128, 128), + help="Mma tile shape (comma-separated)", + ) + parser.add_argument( + "--cluster_shape_mn", + type=parse_comma_separated_ints, + default=(1, 1), + help="Cluster shape (comma-separated)", + ) + parser.add_argument("--ab_dtype", type=cutlass.dtype, default=cutlass.TFloat32) + parser.add_argument("--c_dtype", type=cutlass.dtype, default=cutlass.Float32) + parser.add_argument("--acc_dtype", type=cutlass.dtype, default=cutlass.Float32) + parser.add_argument( + "--use_2cta_instrs", + action="store_true", + help="Enable 2CTA MMA instructions feature", + ) + parser.add_argument("--a_major", choices=["k", "m"], type=str, default="k") + parser.add_argument("--b_major", choices=["k", "n"], type=str, default="k") + parser.add_argument("--c_major", choices=["n", "m"], type=str, default="n") + parser.add_argument( + "--use_tma_store", action="store_true", help="Use tma store or not" + ) + parser.add_argument( + "--tolerance", type=float, default=1e-01, help="Tolerance for validation" + ) + parser.add_argument( + "--warmup_iterations", type=int, default=0, 
help="Warmup iterations" + ) + parser.add_argument( + "--iterations", + type=int, + default=1, + help="Number of iterations to run the kernel", + ) + parser.add_argument( + "--skip_ref_check", action="store_true", help="Skip reference checking" + ) + + args = parser.parse_args() + + if len(args.mnkl) != 4: + parser.error("--mnkl must contain exactly 4 values") + + if len(args.mma_tiler_mn) != 2: + parser.error("--mma_tiler_mn must contain exactly 2 values") + + if len(args.cluster_shape_mn) != 2: + parser.error("--cluster_shape_mn must contain exactly 2 values") + + run_dense_gemm( + args.mnkl, + args.ab_dtype, + args.c_dtype, + args.acc_dtype, + args.a_major, + args.b_major, + args.c_major, + args.mma_tiler_mn, + args.cluster_shape_mn, + args.use_2cta_instrs, + args.use_tma_store, + args.tolerance, + args.warmup_iterations, + args.iterations, + args.skip_ref_check, + ) + print("PASS") diff --git a/examples/python/CuTeDSL/blackwell/fmha.py b/examples/python/CuTeDSL/blackwell/fmha.py new file mode 100644 index 00000000..144ba01b --- /dev/null +++ b/examples/python/CuTeDSL/blackwell/fmha.py @@ -0,0 +1,2984 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import enum +import math +import time +from typing import Type, Tuple + +import torch +import torch.nn.functional as F +import cuda.bindings.driver as cuda + +import cutlass +import cutlass.cute as cute +import cutlass.cute.nvgpu.tcgen05 as tcgen05 +import cutlass.utils as utils +import cutlass.torch as cutlass_torch +import cutlass.utils.blackwell_helpers as sm100_utils +from cutlass.cute.runtime import from_dlpack + +""" +A fused multi-head attention (FMHA) example for the NVIDIA Blackwell SM100 architecture using CUTE DSL + +This example demonstrates an implementation of fused multi-head attention using a TMA + Blackwell SM100 +TensorCore warp-specialized persistent kernel. The implementation integrates the Q*K^T matrix multiplication, +softmax normalization, and softmax(Q*K^T)*V into a single kernel, avoiding intermediate data movement between +global memory and shared memory, thus improving computational efficiency. 
+ +The kernel implements key optimizations including: +- Warp specialization for different computation phases (load, MMA, softmax, correction, epilogue) +- Pipeline stages between different warps for overlapping computation and memory access +- Support for different precision data types +- Optional causal masking for autoregressive models + +To run this example: + +.. code-block:: bash + + python examples/blackwell/fmha.py \ + --qk_acc_dtype Float32 --pv_acc_dtype Float32 \ + --mma_tiler_mn 128,128 \ + --q_shape 4,1024,8,64 --k_shape 4,1024,8,64 \ + --is_persistent + +The above example runs FMHA with batch size 4, sequence length 1024, 8 attention heads, and head +dimension 64. The Blackwell tcgen05 MMA tile shape is (128, 128), and the kernel uses fp16 for input/output +with fp32 for accumulation. + +To collect performance with NCU profiler: + +.. code-block:: bash + + ncu python examples/blackwell/fmha.py \ + --qk_acc_dtype Float32 --pv_acc_dtype Float32 \ + --mma_tiler_mn 128,128 \ + --q_shape 4,1024,8,64 --k_shape 4,1024,8,64 \ + --is_persistent --warmup_iterations 10 \ + --iterations 10 --skip_ref_check + +Constraints for this example: +* Supported head dimensions: 32, 64, and 128 +* Number of heads in Q must be divisible by number of heads in K +* mma_tiler_mn must be 128,128 +* Batch size must be the same for Q, K, and V tensors +* For causal masking, use --has_casual_mask (note: specify without =True/False) +* For persistent scheduling, use --is_persistent (note: specify without =True/False) +""" + +class FmhaStaticTileSchedulerParams: + def __init__( + self, + is_persistent: bool, + problem_shape_mbh: cute.Shape, + *, + loc=None, + ip=None, + ): + self.is_persistent = is_persistent + self.problem_shape_mbh = problem_shape_mbh + self._loc = loc + self._ip = ip + + def __extract_mlir_values__(self): + values, self._values_pos = [], [] + for obj in [self.is_persistent, self.problem_shape_mbh]: + obj_values = cutlass.extract_mlir_values(obj) + values += 
obj_values + self._values_pos.append(len(obj_values)) + return values + + def __new_from_mlir_values__(self, values): + obj_list = [] + for obj, n_items in zip( + [self.is_persistent, self.problem_shape_mbh], self._values_pos + ): + obj_list.append(cutlass.new_from_mlir_values(obj, values[:n_items])) + values = values[n_items:] + return FmhaStaticTileSchedulerParams(*(tuple(obj_list)), loc=self._loc) + + +def create_fmha_static_tile_scheduler_params( + is_persistent: bool, + problem_shape_mbh: cute.Shape, +) -> FmhaStaticTileSchedulerParams: + return FmhaStaticTileSchedulerParams(is_persistent, problem_shape_mbh) + + +class FmhaStaticTileScheduler: + + def __init__( + self, + params: FmhaStaticTileSchedulerParams, + current_work_linear_idx: cutlass.Int32, + blk_coord: cute.Coord, + grid_shape: cute.Shape, + *, + loc=None, + ip=None, + ): + self._params = params + self._blk_coord = blk_coord + self._grid_shape = grid_shape + self._is_persistent = params.is_persistent + self._current_work_linear_idx = current_work_linear_idx + self._problem_shape_mbh = cute.make_layout( + params.problem_shape_mbh, loc=loc, ip=ip + ) + self._num_blocks = cute.size(self._problem_shape_mbh, loc=loc, ip=ip) + self._is_first_block = True + self.num_persistent_sm = cute.size(grid_shape, loc=loc, ip=ip) + self._loc = loc + self._ip = ip + + # called by host + @staticmethod + def get_grid_shape( + params: FmhaStaticTileSchedulerParams, + *, + loc=None, + ip=None, + ) -> cute.Shape: + if params.is_persistent: + hardware_info = cutlass.utils.HardwareInfo() + sm_count = hardware_info.get_device_multiprocessor_count() + return ( + cutlass.min( + sm_count, cute.size(params.problem_shape_mbh, loc=loc, ip=ip) + ), + 1, + 1, + ) + else: + return params.problem_shape_mbh + + def get_current_work(self, *, loc=None, ip=None) -> utils.WorkTileInfo: + is_valid = ( + self._current_work_linear_idx < self._num_blocks + if self._is_persistent + else self._is_first_block + ) + + blk_coord = (0, 0, 0) + if 
self._is_persistent: + blk_coord = self._problem_shape_mbh.get_hier_coord( + self._current_work_linear_idx, loc=loc, ip=ip + ) + else: + blk_coord = self._blk_coord + + # cur_tile_coord is (mid, 0, (bid, hid)) + cur_tile_coord = ( + blk_coord[0], + 0, + (blk_coord[1], blk_coord[2]), + ) + + return utils.WorkTileInfo(cur_tile_coord, is_valid) + + def initial_work_tile_info(self, *, loc=None, ip=None): + return self.get_current_work(loc=loc, ip=ip) + + def advance_to_next_work(self, *, advance_count=1, loc=None, ip=None): + if self._is_persistent: + self._current_work_linear_idx += advance_count * self.num_persistent_sm + self._is_first_block = False + + def __extract_mlir_values__(self): + values = cutlass.extract_mlir_values(self._params) + values.extend(cutlass.extract_mlir_values(self._current_work_linear_idx)) + values.extend(cutlass.extract_mlir_values(self._blk_coord)) + values.extend(cutlass.extract_mlir_values(self._grid_shape)) + return values + + def __new_from_mlir_values__(self, values): + assert len(values) == 10 + new_params = cutlass.new_from_mlir_values(self._params, values[0:3]) + new_current_work_linear_idx = cutlass.new_from_mlir_values( + self._current_work_linear_idx, [values[3]] + ) + new_blk_coord = cutlass.new_from_mlir_values(self._blk_coord, values[4:7]) + new_grid_shape = cutlass.new_from_mlir_values(self._grid_shape, values[7:]) + return FmhaStaticTileScheduler( + new_params, new_current_work_linear_idx, new_blk_coord, new_grid_shape + ) + + +def create_fmha_static_tile_scheduler( + params: FmhaStaticTileSchedulerParams, + blk_coord: cute.Coord, + grid_shape: cute.Shape, +) -> FmhaStaticTileScheduler: + return FmhaStaticTileScheduler(params, blk_coord[0], blk_coord, grid_shape) + + +class MaskType(enum.Enum): + NO_MASK = enum.auto() + RESIDUAL_MASK = enum.auto() + CAUSAL_MASK = enum.auto() + + +class FusedMask: + def __init__( + self, + mask_type: MaskType, + seq_len_k: cutlass.Int32, + *, + loc=None, + ip=None, + ): + self._mask_type = 
mask_type + self._seq_len_k = seq_len_k + self._loc = loc + self._ip = ip + + def get_trip_count( + self, + blk_coord: cute.Coord, + tile_shape: cute.Shape, + ) -> cutlass.Int32: + result = 0 + if ( + self._mask_type == MaskType.NO_MASK + or self._mask_type == MaskType.RESIDUAL_MASK + ): + result = cute.ceil_div(self._seq_len_k, tile_shape[1]) + elif self._mask_type == MaskType.CAUSAL_MASK: + max_blocks_k = cute.ceil_div(self._seq_len_k, tile_shape[1]) + max_blocks_q = cute.ceil_div( + (blk_coord[0] + 1) * tile_shape[0], tile_shape[1] + ) + result = cutlass.min(max_blocks_k, max_blocks_q) + return result + + @cute.jit + def get_masked_trip_count( + self, + blk_coord: cute.Coord, + tile_shape: cute.Shape, + ) -> cutlass.Int32: + result = 0 + if self._mask_type == MaskType.NO_MASK: + result = 0 + elif self._mask_type == MaskType.RESIDUAL_MASK: + if self._seq_len_k % tile_shape[1] != 0: + result = 1 + else: + result = 0 + elif self._mask_type == MaskType.CAUSAL_MASK: + result = cute.ceil_div(tile_shape[0], tile_shape[1]) + return result + + @cute.jit + def get_unmasked_trip_count( + self, + blk_coord: cute.Coord, + tile_shape: cute.Shape, + ) -> cutlass.Int32: + result = 0 + if self._mask_type == MaskType.NO_MASK: + result = self.get_trip_count(blk_coord, tile_shape) + elif self._mask_type == MaskType.RESIDUAL_MASK: + if self._seq_len_k % tile_shape[1] != 0: + result = self.get_trip_count(blk_coord, tile_shape) - 1 + else: + result = self.get_trip_count(blk_coord, tile_shape) + elif self._mask_type == MaskType.CAUSAL_MASK: + result = self.get_trip_count( + blk_coord, tile_shape + ) - self.get_masked_trip_count(blk_coord, tile_shape) + return result + + @cute.jit + def apply_mask( + self, + acc_qk: cute.Tensor, + index_qk: cute.Tensor, + ): + if self._mask_type == MaskType.RESIDUAL_MASK: + for i in range(cute.size(acc_qk)): + pos = index_qk[i] + if pos[1] >= self._seq_len_k: + acc_qk[i] = -cutlass.Float32.inf + elif self._mask_type == MaskType.CAUSAL_MASK: + for i in 
range(cute.size(acc_qk)): + pos = index_qk[i] + if pos[0] < pos[1] or pos[1] >= self._seq_len_k: + acc_qk[i] = -cutlass.Float32.inf + + def __extract_mlir_values__(self): + values, self._values_pos = [], [] + for obj in [self._mask_type, self._seq_len_k]: + obj_values = cutlass.extract_mlir_values(obj) + values += obj_values + self._values_pos.append(len(obj_values)) + return values + + def __new_from_mlir_values__(self, values): + obj_list = [] + for obj, n_items in zip([self._mask_type, self._seq_len_k], self._values_pos): + obj_list.append(cutlass.new_from_mlir_values(obj, values[:n_items])) + values = values[n_items:] + return FusedMask(*(tuple(obj_list)), loc=self._loc) + + +def create_fused_mask( + mask_type: MaskType, + seq_len_k: cutlass.Int32, +) -> FusedMask: + return FusedMask(mask_type, seq_len_k) + + +class BlackwellFusedMultiHeadAttentionForward: + def __init__( + self, + qk_acc_dtype: Type[cutlass.Numeric], + pv_acc_dtype: Type[cutlass.Numeric], + mma_tiler: Tuple[int, int, int], + is_persistent: bool, + mask_type: MaskType, + ): + """Initializes the configuration for a Blackwell Fused Multi-Head Attention (FMHA) kernel. + + This configuration includes several key aspects: + + 1. Data Type Settings: + - qk_acc_dtype: Data type for Q*K^T matrix multiplication accumulator + - pv_acc_dtype: Data type for P*V matrix multiplication accumulator + + 2. MMA Instruction Settings: + - mma_tiler: The (M, N, K) shape of the MMA instruction unit + - qk_mma_tiler: MMA shape for Q*K^T computation + - pv_mma_tiler: MMA shape for P*V computation + + 3. 
Kernel Execution Mode: + - is_persistent: Boolean indicating whether to use persistent kernel mode + - mask_type: Specifies the type of mask to use (no mask, residual mask, or causal mask) + + :param qk_acc_dtype: Data type for Q*K^T matrix multiplication accumulator + :type qk_acc_dtype: Type[cutlass.Numeric] + :param pv_acc_dtype: Data type for P*V matrix multiplication accumulator + :type pv_acc_dtype: Type[cutlass.Numeric] + :param mma_tiler: The (M, N, K) shape of the MMA instruction + :type mma_tiler: Tuple[int, int, int] + :param is_persistent: Whether to use persistent kernel mode + :type is_persistent: bool + :param mask_type: Type of mask to use + :type mask_type: MaskType + """ + + self.qk_acc_dtype = qk_acc_dtype + self.pv_acc_dtype = pv_acc_dtype + self.cta_tiler = ( + 2 * mma_tiler[0], # 2 Q tile per CTA + mma_tiler[1], + mma_tiler[2], + ) + self.qk_mma_tiler = mma_tiler + self.pv_mma_tiler = ( + mma_tiler[0], + mma_tiler[2], + mma_tiler[1], + ) + self.cluster_shape_mn = (1, 1) + self.is_persistent = is_persistent + self.mask_type = mask_type + + self.softmax0_warp_ids = (0, 1, 2, 3) + self.softmax1_warp_ids = (4, 5, 6, 7) + self.correction_warp_ids = (8, 9, 10, 11) + self.mma_warp_id = 12 + self.load_warp_id = 13 + self.epilogue_warp_id = 14 + self.empty_warp_id = 15 + SM100_TMEM_CAPACITY_COLUMNS = 512 + self.tmem_alloc_cols = SM100_TMEM_CAPACITY_COLUMNS + + self.threads_per_warp = 32 + self.threads_per_cta = self.threads_per_warp * len( + ( + *self.softmax0_warp_ids, + *self.softmax1_warp_ids, + *self.correction_warp_ids, + self.mma_warp_id, + self.load_warp_id, + self.epilogue_warp_id, + self.empty_warp_id, + ) + ) + + self.cta_sync_bar_id = 0 + self.tmem_alloc_sync_bar_id = 1 + + self.tmem_s0_offset = 0 + self.tmem_s1_offset = 128 + self.tmem_o0_offset = 256 + self.tmem_o1_offset = 384 + self.tmem_p0_offset = 32 + self.tmem_p1_offset = 160 + + # vec buffer for row_max & row_sum + self.tmem_vec0_offset = 0 + self.tmem_vec1_offset = 128 + + 
self.num_regs_softmax = 192 + self.num_regs_correction = 96 + self.num_regs_other = 32 + self.num_regs_empty = 24 + + self.buffer_align_bytes = 1024 + + num_warps_per_warpgroup = 4 + self.softmax_warpgroup_count = ( + len((*self.softmax0_warp_ids, *self.softmax1_warp_ids)) + // num_warps_per_warpgroup + ) + + def _setup_attributes(self): + """Set up configurations and parameters for the FMHA kernel operation. + + This method initializes and configures various attributes required for the + execution of the fused multi-head attention kernel, mainly about the pipeline stages: + + - Sets up staging parameters for Q, K, V inputs and accumulator data + - Configures pipeline stages for softmax, correction, and epilogue operations + """ + + self.q_stage = 2 + self.kv_stage = 4 if self.q_dtype.width == 8 else 3 + self.acc_stage = 1 + self.softmax_corr_stage = 1 + self.mma_corr_stage = 2 + self.mma_softmax_stage = 1 + self.epi_stage = 2 + + @cute.jit + def __call__( + self, + q: cute.Tensor, + k: cute.Tensor, + v: cute.Tensor, + o: cute.Tensor, + scale_softmax_log2: cutlass.Float32, + scale_output: cutlass.Float32, + stream: cuda.CUstream, + ): + """Execute the Fused Multi-Head Attention operation on the provided tensors. + + This method prepares the input tensors for processing, validates their shapes and types, + configures the computation parameters, and launches the CUDA kernel. + + The method handles: + 1. Tensor layout transformations for specific memory access patterns + 2. Validation of tensor shapes and data types + 3. Initialization of hardware-specific parameters and memory layouts + 4. Configuration of TMA (Tensor Memory Access) operations + 5. Grid and work scheduling computation + 6. 
Kernel launch with appropriate parameters + + :param q: The query tensor with shape [seq_len_q, d_head, h_q, b] + :type q: cute.Tensor + :param k: The key tensor with shape [seq_len_k, d_head, h_k, b] + :type k: cute.Tensor + :param v: The value tensor with shape [seq_len_k, d_head, h_v, b] + :type v: cute.Tensor + :param o: The output tensor with shape [seq_len_q, d_head, h_q, b] + :type o: cute.Tensor + :param scale_softmax_log2: The log2 scale factor for softmax + :type scale_softmax_log2: cutlass.Float32 + :param scale_output: The scale factor for the output + :type scale_output: cutlass.Float32 + :param stream: The CUDA stream to execute the kernel on + :type stream: cuda.CUstream + :raises TypeError: If tensor data types don't match or aren't supported + :raises RuntimeError: If tensor layouts aren't in supported formats + """ + + # setup static attributes before smem/grid/tma computation + self.q_dtype = q.element_type + self.k_dtype = k.element_type + self.v_dtype = v.element_type + self.o_dtype = o.element_type + + # (s, d, 1, h_k, b) -> (s, d, ((h_r, h_k), b)) + k = cute.make_tensor( + k.iterator, + cute.make_layout( + (k.shape[0], k.shape[1], ((q.shape[2], k.shape[3]), k.shape[4])), + stride=( + k.layout.stride[0], + k.layout.stride[1], + ((0, k.layout.stride[3]), k.layout.stride[4]), + ), + ), + ) + # (s, d, 1, h_k, b) -> (d, s, ((h_r, h_k), b)) + v = cute.make_tensor( + v.iterator, + cute.make_layout( + (v.shape[1], v.shape[0], ((q.shape[2], v.shape[3]), v.shape[4])), + stride=( + v.layout.stride[1], + v.layout.stride[0], + ((0, v.layout.stride[3]), v.layout.stride[4]), + ), + ), + ) + + # (s, d, h_r, h_k, b) -> (s, d, ((h_r, h_k), b)) + q = cute.group_modes(cute.group_modes(q, begin=2, end=4), begin=2, end=4) + o = cute.group_modes(cute.group_modes(o, begin=2, end=4), begin=2, end=4) + + self.q_major_mode = utils.LayoutEnum.from_tensor(q).mma_major_mode() + self.k_major_mode = utils.LayoutEnum.from_tensor(k).mma_major_mode() + self.v_major_mode = 
utils.LayoutEnum.from_tensor(v).mma_major_mode() + self.o_layout = utils.LayoutEnum.from_tensor(o) + + if cutlass.const_expr(self.q_major_mode != tcgen05.OperandMajorMode.K): + raise RuntimeError("The layout of q is not supported") + if cutlass.const_expr(self.k_major_mode != tcgen05.OperandMajorMode.K): + raise RuntimeError("The layout of k is not supported") + if cutlass.const_expr(self.v_major_mode != tcgen05.OperandMajorMode.MN): + raise RuntimeError("The layout of v is not supported") + + # check type consistency + if cutlass.const_expr(self.q_dtype != self.k_dtype): + raise TypeError(f"Type mismatch: {self.q_dtype} != {self.k_dtype}") + if cutlass.const_expr(self.q_dtype != self.v_dtype): + raise TypeError(f"Type mismatch: {self.q_dtype} != {self.v_dtype}") + self._setup_attributes() + + cta_group = tcgen05.CtaGroup.ONE + # the intermediate tensor p is from tmem & k-major + p_source = tcgen05.OperandSource.TMEM + p_major_mode = tcgen05.OperandMajorMode.K + qk_tiled_mma = sm100_utils.make_trivial_tiled_mma( + self.q_dtype, + self.q_major_mode, + self.k_major_mode, + self.qk_acc_dtype, + cta_group, + self.qk_mma_tiler[:2], + ) + pv_tiled_mma = sm100_utils.make_trivial_tiled_mma( + self.v_dtype, + p_major_mode, + self.v_major_mode, + self.pv_acc_dtype, + cta_group, + self.pv_mma_tiler[:2], + p_source, + ) + + self.cluster_shape_mnk = (*self.cluster_shape_mn, 1) + self.cluster_layout_vmnk = cute.tiled_divide( + cute.make_layout(self.cluster_shape_mnk), + (qk_tiled_mma.thr_id.shape,), + ) + + self.epi_tile = self.pv_mma_tiler[:2] + + + q_smem_layout_staged = sm100_utils.make_smem_layout_a( + qk_tiled_mma, + self.qk_mma_tiler, + self.q_dtype, + self.q_stage, + ) + k_smem_layout_staged = sm100_utils.make_smem_layout_b( + qk_tiled_mma, + self.qk_mma_tiler, + self.k_dtype, + self.kv_stage, + ) + p_tmem_layout_staged = sm100_utils.make_smem_layout_a( + pv_tiled_mma, + self.pv_mma_tiler, + self.q_dtype, + self.acc_stage, + ) + v_smem_layout_staged = 
sm100_utils.make_smem_layout_b( + pv_tiled_mma, + self.pv_mma_tiler, + self.v_dtype, + self.kv_stage, + ) + o_smem_layout_staged = sm100_utils.make_smem_layout_epi( + self.o_dtype, + self.o_layout, + self.epi_tile, + self.epi_stage, + ) + + # TMA load for Q + tma_load_op = cute.nvgpu.cpasync.CopyBulkTensorTileG2SOp(cta_group) + tma_store_op = cute.nvgpu.cpasync.CopyBulkTensorTileS2GOp() + + q_smem_layout = cute.select(q_smem_layout_staged, mode=[0, 1, 2]) + tma_atom_q, tma_tensor_q = cute.nvgpu.make_tma_tile_atom_A( + tma_load_op, + q, + q_smem_layout, + self.qk_mma_tiler, + qk_tiled_mma, + self.cluster_layout_vmnk.shape, + ) + + # TMA load for K + k_smem_layout = cute.select(k_smem_layout_staged, mode=[0, 1, 2]) + tma_atom_k, tma_tensor_k = cute.nvgpu.make_tma_tile_atom_B( + tma_load_op, + k, + k_smem_layout, + self.qk_mma_tiler, + qk_tiled_mma, + self.cluster_layout_vmnk.shape, + ) + # TMA load for V + v_smem_layout = cute.select(v_smem_layout_staged, mode=[0, 1, 2]) + tma_atom_v, tma_tensor_v = cute.nvgpu.make_tma_tile_atom_B( + tma_load_op, + v, + v_smem_layout, + self.pv_mma_tiler, + pv_tiled_mma, + self.cluster_layout_vmnk.shape, + ) + + o_cta_v_layout = cute.composition( + cute.make_identity_layout(o.shape), self.epi_tile + ) + o_smem_layout = cute.select(o_smem_layout_staged, mode=[0, 1]) + + tma_atom_o, tma_tensor_o = cute.nvgpu.cpasync.make_tma_tile_atom( + tma_store_op, + o, + o_smem_layout, + o_cta_v_layout, + ) + + q_copy_size = cute.size_in_bytes(self.q_dtype, q_smem_layout) + k_copy_size = cute.size_in_bytes(self.k_dtype, k_smem_layout) + self.tma_copy_q_bytes = q_copy_size + self.tma_copy_kv_bytes = k_copy_size + + self.tile_sched_params, grid = self._compute_grid( + o, + self.cta_tiler, + self.is_persistent, + ) + + @cute.struct + class SharedStorage: + # Pipeline barriers + load_q_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.q_stage * 2] + load_kv_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.kv_stage * 2] + mma_s0_mbar_ptr: 
cute.struct.MemRange[ + cutlass.Int64, self.mma_softmax_stage * 2 + ] + mma_s1_mbar_ptr: cute.struct.MemRange[ + cutlass.Int64, self.mma_softmax_stage * 2 + ] + s0_corr_mbar_ptr: cute.struct.MemRange[ + cutlass.Int64, self.softmax_corr_stage * 2 + ] + s1_corr_mbar_ptr: cute.struct.MemRange[ + cutlass.Int64, self.softmax_corr_stage * 2 + ] + s0_s1_sequence_mbar_ptr: cute.struct.MemRange[ + cutlass.Int64, self.softmax_warpgroup_count + ] + corr_epi_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.epi_stage * 2] + mma_corr_mbar_ptr: cute.struct.MemRange[ + cutlass.Int64, self.mma_corr_stage * 2 + ] + max_reg_setting_mbar_ptr: cute.struct.MemRange[cutlass.Int64, 1] + tmem_dealloc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, 1] + # Tmem holding buffer + tmem_holding_buf: cutlass.Int32 + # Smem tensors + sO: cute.struct.Align[ + cute.struct.MemRange[self.o_dtype, cute.cosize(o_smem_layout_staged)], + self.buffer_align_bytes, + ] + sQ: cute.struct.Align[ + cute.struct.MemRange[self.q_dtype, cute.cosize(q_smem_layout_staged)], + self.buffer_align_bytes, + ] + sK: cute.struct.Align[ + cute.struct.MemRange[self.k_dtype, cute.cosize(k_smem_layout_staged)], + self.buffer_align_bytes, + ] + + self.shared_storage = SharedStorage + fused_mask = create_fused_mask(self.mask_type, k.shape[0]) + + # Launch the kernel synchronously + self.kernel( + qk_tiled_mma, + pv_tiled_mma, + tma_atom_q, + tma_tensor_q, + tma_atom_k, + tma_tensor_k, + tma_atom_v, + tma_tensor_v, + tma_atom_o, + tma_tensor_o, + scale_softmax_log2, + scale_output, + q_smem_layout_staged, + k_smem_layout_staged, + p_tmem_layout_staged, + v_smem_layout_staged, + o_smem_layout_staged, + self.tile_sched_params, + fused_mask, + ).launch( + grid=grid, + block=[self.threads_per_cta, 1, 1], + cluster=self.cluster_shape_mnk, + smem=self.shared_storage.size_in_bytes(), + stream=stream, + min_blocks_per_mp=1, + ) + + # GPU device kernel + @cute.kernel + def kernel( + self, + qk_tiled_mma: cute.TiledMma, + pv_tiled_mma: 
cute.TiledMma, + tma_atom_q: cute.CopyAtom, + mQ_qdl: cute.Tensor, + tma_atom_k: cute.CopyAtom, + mK_kdl: cute.Tensor, + tma_atom_v: cute.CopyAtom, + mV_dkl: cute.Tensor, + tma_atom_o: cute.CopyAtom, + mO_qdl: cute.Tensor, + scale_softmax_log2: cutlass.Float32, + scale_output: cutlass.Float32, + q_smem_layout_staged: cute.ComposedLayout, + k_smem_layout_staged: cute.ComposedLayout, + p_tmem_layout_staged: cute.ComposedLayout, + v_smem_layout_staged: cute.ComposedLayout, + o_smem_layout_staged: cute.ComposedLayout, + tile_sched_params: FmhaStaticTileSchedulerParams, + fused_mask: FusedMask, + ): + """The device kernel implementation of the Fused Multi-Head Attention. + + This kernel coordinates multiple specialized warps to perform different phases of the FMHA computation: + 1. Load warp: Loads Q, K, V data from global memory to shared memory using TMA + 2. MMA warp: Performs matrix multiplications (Q*K^T and P*V) + 3. Softmax warps: Compute softmax normalization on attention scores + 4. Correction warps: Apply adjustments to intermediate results + 5. Epilogue warp: Handles final output transformation and storage + + The kernel implements a complex pipeline with overlapping computation and memory operations, + using tensor memory access (TMA) for efficient data loading, warp specialization for different + computation phases, and optional attention masking. 
+ + :param qk_tiled_mma: Tiled MMA for Q*K^T + :type qk_tiled_mma: cute.TiledMma + :param pv_tiled_mma: Tiled MMA for P*V + :type pv_tiled_mma: cute.TiledMma + :param tma_atom_q: TMA copy atom for query tensor + :type tma_atom_q: cute.CopyAtom + :param mQ_qdl: Partitioned query tensor + :type mQ_qdl: cute.Tensor + :param tma_atom_k: TMA copy atom for key tensor + :type tma_atom_k: cute.CopyAtom + :param mK_kdl: Partitioned key tensor + :type mK_kdl: cute.Tensor + :param tma_atom_v: TMA copy atom for value tensor + :type tma_atom_v: cute.CopyAtom + :param mV_dkl: Partitioned value tensor + :type mV_dkl: cute.Tensor + :param tma_atom_o: TMA copy atom for output tensor + :type tma_atom_o: cute.CopyAtom + :param mO_qdl: Partitioned output tensor + :type mO_qdl: cute.Tensor + :param scale_softmax_log2: The log2 scale factor for softmax + :type scale_softmax_log2: cutlass.Float32 + :param scale_output: The scale factor for the output + :type scale_output: cutlass.Float32 + :param q_smem_layout_staged: Shared memory layout for query tensor + :type q_smem_layout_staged: cute.ComposedLayout + :param k_smem_layout_staged: Shared memory layout for key tensor + :type k_smem_layout_staged: cute.ComposedLayout + :param p_tmem_layout_staged: Tensor memory layout for probability matrix + :type p_tmem_layout_staged: cute.ComposedLayout + :param v_smem_layout_staged: Shared memory layout for value tensor + :type v_smem_layout_staged: cute.ComposedLayout + :param o_smem_layout_staged: Shared memory layout for output tensor + :type o_smem_layout_staged: cute.ComposedLayout + :param tile_sched_params: Scheduling parameters for work distribution + :type tile_sched_params: FmhaStaticTileSchedulerParams + :param fused_mask: Masking configuration (causal/residual/none) + :type fused_mask: FusedMask + """ + + warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx()) + # coord inside cta + tidx, _, _ = cute.arch.thread_idx() + + # Alloc + smem = utils.SmemAllocator() + storage = 
smem.allocate(self.shared_storage) + + load_q_pipeline = self.make_and_init_load_q_pipeline( + storage.load_q_mbar_ptr.data_ptr() + ) + load_kv_pipeline = self.make_and_init_load_kv_pipeline( + storage.load_kv_mbar_ptr.data_ptr() + ) + mma_s0_pipeline = self.make_and_init_mma_si_pipeline( + storage.mma_s0_mbar_ptr.data_ptr() + ) + mma_s1_pipeline = self.make_and_init_mma_si_pipeline( + storage.mma_s1_mbar_ptr.data_ptr() + ) + s0_corr_pipeline = self.make_and_init_si_corr_pipeline( + storage.s0_corr_mbar_ptr.data_ptr() + ) + s1_corr_pipeline = self.make_and_init_si_corr_pipeline( + storage.s1_corr_mbar_ptr.data_ptr() + ) + corr_epi_pipeline = self.make_and_init_corr_epi_pipeline( + storage.corr_epi_mbar_ptr.data_ptr() + ) + mma_corr_pipeline = self.make_and_init_mma_corr_pipeline( + storage.mma_corr_mbar_ptr.data_ptr() + ) + s0_s1_sequence_pipeline = self.make_and_init_si_sequence_pipeline( + storage.s0_s1_sequence_mbar_ptr.data_ptr() + ) + max_reg_setting_mbar_ptr = storage.max_reg_setting_mbar_ptr.data_ptr() + tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr.data_ptr() + + # Correction & Epilogue & tmem barrier init + if warp_idx == self.empty_warp_id: + cute.arch.mbarrier_init_arrive_cnt( + max_reg_setting_mbar_ptr, + self.threads_per_warp + * len( + ( + self.empty_warp_id, + self.load_warp_id, + self.mma_warp_id, + self.epilogue_warp_id, + *self.correction_warp_ids, + ) + ), + ) + cute.arch.mbarrier_init_arrive_cnt( + tmem_dealloc_mbar_ptr, + self.threads_per_warp + * len( + ( + *self.softmax0_warp_ids, + *self.softmax1_warp_ids, + *self.correction_warp_ids, + ) + ), + ) + + cute.arch.mbarrier_init_fence() + + # Generate smem tensor Q/K/V/O + # (MMA, MMA_Q, MMA_D, PIPE) + sQ = storage.sQ.get_tensor( + q_smem_layout_staged.outer, swizzle=q_smem_layout_staged.inner + ) + # (MMA, MMA_K, MMA_D, PIPE) + sK = storage.sK.get_tensor( + k_smem_layout_staged.outer, swizzle=k_smem_layout_staged.inner + ) + # (MMA, MMA_K, MMA_D, PIPE) + # Strip swizzle info to reuse 
smem + sV_ptr = cute.recast_ptr(sK.iterator, v_smem_layout_staged.inner) + sV = cute.make_tensor(sV_ptr, v_smem_layout_staged.outer) + + sO = storage.sO.get_tensor( + o_smem_layout_staged.outer, swizzle=o_smem_layout_staged.inner + ) + + # Local tile partition global tensors + # (bM, bK, loopM, loopK, loopL) need to check + gQ_qdl = cute.flat_divide(mQ_qdl, cute.select(self.qk_mma_tiler, mode=[0, 2])) + qk_thr_mma = qk_tiled_mma.get_slice(0) # default 1sm + tSgQ_qdl = qk_thr_mma.partition_A(gQ_qdl) + + tQsQ, tQgQ_qdl = cute.nvgpu.cpasync.tma_partition( + tma_atom_q, + 0, # no multicast + cute.make_layout(1), + cute.group_modes(sQ, 0, 3), + cute.group_modes(tSgQ_qdl, 0, 3), + ) + + gK_kdl = cute.flat_divide(mK_kdl, cute.select(self.qk_mma_tiler, mode=[1, 2])) + tSgK_kdl = qk_thr_mma.partition_B(gK_kdl) + tKsK, tKgK_kdl = cute.nvgpu.cpasync.tma_partition( + tma_atom_k, + 0, # no multicast + cute.make_layout(1), + cute.group_modes(sK, 0, 3), + cute.group_modes(tSgK_kdl, 0, 3), + ) + + # (bM, bN, loopM, loopN, loopL) + gV_dkl = cute.flat_divide(mV_dkl, cute.select(self.pv_mma_tiler, mode=[1, 2])) + + pv_thr_mma = pv_tiled_mma.get_slice(0) # default 1sm + tSgV_dkl = pv_thr_mma.partition_B(gV_dkl) + tVsV, tVgV_dkl = cute.nvgpu.cpasync.tma_partition( + tma_atom_v, + 0, # no multicast + cute.make_layout(1), + cute.group_modes(sV, 0, 3), + cute.group_modes(tSgV_dkl, 0, 3), + ) + + tSrQ = qk_thr_mma.make_fragment_A(sQ) + tSrK = qk_thr_mma.make_fragment_B(sK) + tOrV = pv_thr_mma.make_fragment_B(sV) + + gO_qdl = cute.flat_divide(mO_qdl, cute.select(self.pv_mma_tiler, mode=[0, 1])) + + qk_acc_shape = qk_thr_mma.partition_shape_C( + (self.qk_mma_tiler[0], self.qk_mma_tiler[1]) + ) + tStS = qk_thr_mma.make_fragment_C(qk_acc_shape) + + pv_acc_shape = pv_thr_mma.partition_shape_C( + (self.pv_mma_tiler[0], self.pv_mma_tiler[1]) + ) + tOtO = pv_thr_mma.make_fragment_C(pv_acc_shape) + + tStS0 = cute.make_tensor(tStS.iterator + self.tmem_s0_offset, tStS.layout) + tStS1 = 
cute.make_tensor(tStS.iterator + self.tmem_s1_offset, tStS.layout) + + tOtO0 = cute.make_tensor(tOtO.iterator + self.tmem_o0_offset, tOtO.layout) + tOtO1 = cute.make_tensor(tOtO.iterator + self.tmem_o1_offset, tOtO.layout) + + tP = cute.make_tensor(tStS.iterator, p_tmem_layout_staged.outer) + tOrP = pv_thr_mma.make_fragment_A(tP)[None, None, None, 0] + + tOrP0 = cute.make_tensor( + tOrP.iterator + + self.qk_acc_dtype.width // self.q_dtype.width * self.tmem_p0_offset, + tOrP.layout, + ) + tOrP1 = cute.make_tensor( + tOrP.iterator + + self.qk_acc_dtype.width // self.q_dtype.width * self.tmem_p1_offset, + tOrP.layout, + ) + + cute.arch.barrier( + barrier_id=self.cta_sync_bar_id, + number_of_threads=self.threads_per_cta, + ) + + # /////////////////////////////////////////////////////////////////////////////// + # EMPTY + # /////////////////////////////////////////////////////////////////////////////// + if warp_idx == self.empty_warp_id: + cute.arch.warpgroup_reg_dealloc(self.num_regs_empty) + cute.arch.mbarrier_arrive(max_reg_setting_mbar_ptr) + + # /////////////////////////////////////////////////////////////////////////////// + # LOAD + # /////////////////////////////////////////////////////////////////////////////// + if warp_idx == self.load_warp_id: + cute.arch.warpgroup_reg_dealloc(self.num_regs_other) + cute.arch.mbarrier_arrive(max_reg_setting_mbar_ptr) + + q_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.q_stage + ) + kv_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.kv_stage + ) + + tile_sched = create_fmha_static_tile_scheduler( + tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim() + ) + work_tile = tile_sched.initial_work_tile_info() + + while work_tile.is_valid_tile: + curr_block_coord = work_tile.tile_idx + tQgQ = tQgQ_qdl[None, None, 0, curr_block_coord[2]] + tKgK = tKgK_kdl[None, None, 0, curr_block_coord[2]] + tVgV = tVgV_dkl[None, 0, None, curr_block_coord[2]] + + # 
Q0 + q0_coord = 2 * curr_block_coord[0] + load_q_pipeline.producer_acquire(q_producer_state) + cute.copy( + tma_atom_q, + tQgQ[None, q0_coord], + tQsQ[None, q_producer_state.index], + tma_bar_ptr=load_q_pipeline.producer_get_barrier(q_producer_state), + ) + q_producer_state.advance() + + # K0 + kv_coord = 0 # seqlen_kv_loop + load_kv_pipeline.producer_acquire(kv_producer_state) + cute.copy( + tma_atom_k, + tKgK[None, kv_coord], + tKsK[None, kv_producer_state.index], + tma_bar_ptr=load_kv_pipeline.producer_get_barrier( + kv_producer_state + ), + ) + kv_producer_state.advance() + + # Q1 + q1_coord = q0_coord + 1 + load_q_pipeline.producer_acquire(q_producer_state) + cute.copy( + tma_atom_q, + tQgQ[None, q1_coord], + tQsQ[None, q_producer_state.index], + tma_bar_ptr=load_q_pipeline.producer_get_barrier(q_producer_state), + ) + q_producer_state.advance() + + # V0 + load_kv_pipeline.producer_acquire(kv_producer_state) + cute.copy( + tma_atom_v, + tVgV[None, kv_coord], + tVsV[None, kv_producer_state.index], + tma_bar_ptr=load_kv_pipeline.producer_get_barrier( + kv_producer_state + ), + ) + kv_producer_state.advance() + kv_coord += 1 + + seqlen_kv_loop_steps = ( + fused_mask.get_trip_count(curr_block_coord, self.cta_tiler) - 1 + ) + for i in cutlass.range_dynamic(0, seqlen_kv_loop_steps, 1, unroll=1): + # Ki + load_kv_pipeline.producer_acquire(kv_producer_state) + cute.copy( + tma_atom_k, + tKgK[None, kv_coord], + tKsK[None, kv_producer_state.index], + tma_bar_ptr=load_kv_pipeline.producer_get_barrier( + kv_producer_state + ), + ) + kv_producer_state.advance() + # Vi + load_kv_pipeline.producer_acquire(kv_producer_state) + cute.copy( + tma_atom_v, + tVgV[None, kv_coord], + tVsV[None, kv_producer_state.index], + tma_bar_ptr=load_kv_pipeline.producer_get_barrier( + kv_producer_state + ), + ) + kv_producer_state.advance() + kv_coord += 1 + # End of seqlen_kv loop + + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + # End of persistent scheduler 
loop + + # /////////////////////////////////////////////////////////////////////////////// + # MMA + # /////////////////////////////////////////////////////////////////////////////// + if warp_idx == self.mma_warp_id: + cute.arch.warpgroup_reg_dealloc(self.num_regs_other) + cute.arch.mbarrier_arrive(max_reg_setting_mbar_ptr) + + # Alloc tmem buffer + tmem_alloc_cols = cutlass.Int32(self.tmem_alloc_cols) + cute.arch.alloc_tmem(tmem_alloc_cols, storage.tmem_holding_buf) + cute.arch.barrier( + barrier_id=self.tmem_alloc_sync_bar_id, + number_of_threads=self.threads_per_warp, + ) + mma_q_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.q_stage + ) + mma_kv_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.kv_stage + ) + mma_q_release_state = mma_q_consumer_state.clone() + mma_kv_release_state = mma_kv_consumer_state.clone() + mma_s0_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.mma_softmax_stage + ) + mma_s1_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.mma_softmax_stage + ) + mma_corr_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.mma_corr_stage + ) + tile_sched = create_fmha_static_tile_scheduler( + tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim() + ) + work_tile = tile_sched.initial_work_tile_info() + + while work_tile.is_valid_tile: + curr_block_coord = work_tile.tile_idx + # GEMM_QK00 (Q0 * K0 -> S0) + # 1. wait for Q0 + load_q_pipeline.consumer_wait(mma_q_consumer_state) + tSrQ0 = tSrQ[None, None, None, mma_q_consumer_state.index] + mma_q_consumer_state.advance() + # 2. wait for K0 + load_kv_pipeline.consumer_wait(mma_kv_consumer_state) + tSrK0 = tSrK[None, None, None, mma_kv_consumer_state.index] + mma_kv_consumer_state.advance() + # 3. acquire empty S0 buffer + mma_s0_pipeline.producer_acquire(mma_s0_producer_state) + # 4. 
gemm + num_kphases = cute.size(tSrQ0, mode=[2]) + for kphase_idx in range(num_kphases): + kphase_coord = (None, None, kphase_idx) + qk_tiled_mma.set(tcgen05.Field.ACCUMULATE, kphase_idx != 0) + cute.gemm( + qk_tiled_mma, + tStS0, + tSrQ0[kphase_coord], + tSrK0[kphase_coord], + tStS0, + ) + # 5. release S0 + mma_s0_pipeline.producer_commit(mma_s0_producer_state) + mma_s0_producer_state.advance() + # End of GEMM (Q0 * K0 -> S0) + + # GEMM_QK10 (Q1 * K0 -> S1), K0 is ready in GEMM_QK00 + # 1. wait for Q1 + load_q_pipeline.consumer_wait(mma_q_consumer_state) + tSrQ1 = tSrQ[None, None, None, mma_q_consumer_state.index] + mma_q_consumer_state.advance() + # 2. acquire empty S1 + mma_s1_pipeline.producer_acquire(mma_s1_producer_state) + # 3. gemm + num_kphases = cute.size(tSrQ1, mode=[2]) + for kphase_idx in range(num_kphases): + kphase_coord = (None, None, kphase_idx) + qk_tiled_mma.set(tcgen05.Field.ACCUMULATE, kphase_idx != 0) + cute.gemm( + qk_tiled_mma, + tStS1, + tSrQ1[kphase_coord], + tSrK0[kphase_coord], + tStS1, + ) + # 4. release S1 + mma_s1_pipeline.producer_commit(mma_s1_producer_state) + mma_s1_producer_state.advance() + # 5. release K0 + load_kv_pipeline.consumer_release(mma_kv_release_state) + mma_kv_release_state.advance() + # End of GEMM (Q1 * K0 -> S1) + # Note: Q0 & Q1 are still needed in the seqlen_kv loop + # so we need to release them after the seqlen_kv loop + + # GEMM_PV00 (P0 * V0 -> O0_partial), O0 needs to be accumulated in the seqlen_kv loop + # 1. wait for V0 + load_kv_pipeline.consumer_wait(mma_kv_consumer_state) + tOrVi = tOrV[None, None, None, mma_kv_consumer_state.index] + mma_kv_consumer_state.advance() + # 2. acquire corrected O0_partial + # Note: acquire corr first to take it out of the critical + # path since softmax takes longer + mma_corr_pipeline.producer_acquire(mma_corr_producer_state) + # 3. 
acquire P0 + # this acquire returns the ownership of all of S0 to the mma warp + # including the P0 part (inplaced in S0) + mma_s0_pipeline.producer_acquire(mma_s0_producer_state) + # 4. gemm + num_kphases = cute.size(tOrP0, mode=[2]) + for kphase_idx in range(num_kphases): + kphase_coord = (None, None, kphase_idx) + pv_tiled_mma.set(tcgen05.Field.ACCUMULATE, kphase_idx != 0) + cute.gemm( + pv_tiled_mma, + tOtO0, + tOrP0[kphase_coord], + tOrVi[kphase_coord], + tOtO0, + ) + # 5. release accumulated O0_partial + mma_corr_pipeline.producer_commit(mma_corr_producer_state) + mma_corr_producer_state.advance() + # End of GEMM_PV00 (P0 * V0 -> O0_partial) + + seqlen_kv_loop_steps = ( + fused_mask.get_trip_count(curr_block_coord, self.cta_tiler) - 1 + ) + # O1 hasn't been accumulated yet, its first MMA calculation doesn't need to accumulate + pv_whether_acc = False + for i in cutlass.range_dynamic(0, seqlen_kv_loop_steps, 1, unroll=1): + # GEMM_QK0i (Q0 * Ki -> S0) + # 1. wait for Ki + load_kv_pipeline.consumer_wait(mma_kv_consumer_state) + tSrKi = tSrK[None, None, None, mma_kv_consumer_state.index] + mma_kv_consumer_state.advance() + # 2. gemm + inner_num_kphases = cute.size(tSrQ0, mode=[2]) + for kphase_idx in range(inner_num_kphases): + kphase_coord = (None, None, kphase_idx) + qk_tiled_mma.set(tcgen05.Field.ACCUMULATE, kphase_idx != 0) + cute.gemm( + qk_tiled_mma, + tStS0, + tSrQ0[kphase_coord], + tSrKi[kphase_coord], + tStS0, + ) + # 3. release S0 + mma_s0_pipeline.producer_commit(mma_s0_producer_state) + mma_s0_producer_state.advance() + # End of GEMM_QK0i (Q0 * Ki -> S0) + + # GEMM_PV1(i-1) (P1 * V(i-1) -> O1_partial), V(i-1) is ready in GEMM_PV0(i-1) + # 1. acquire corrected O1_partial + mma_corr_pipeline.producer_acquire(mma_corr_producer_state) + # 2. acquire P1 + mma_s1_pipeline.producer_acquire(mma_s1_producer_state) + # 3. 
gemm + inner_num_kphases = cute.size(tOrP0, mode=[2]) + for kphase_idx in range(inner_num_kphases): + kphase_coord = (None, None, kphase_idx) + pv_tiled_mma.set(tcgen05.Field.ACCUMULATE, pv_whether_acc) + cute.gemm( + pv_tiled_mma, + tOtO1, + tOrP1[kphase_coord], + tOrVi[kphase_coord], + tOtO1, + ) + pv_whether_acc = True + # 4. release accumulated O1_partial + mma_corr_pipeline.producer_commit(mma_corr_producer_state) + mma_corr_producer_state.advance() + # 5. release V(i-1) + load_kv_pipeline.consumer_release(mma_kv_release_state) + mma_kv_release_state.advance() + # End of GEMM_PV1(i-1) (P1 * V(i-1) -> O1_partial) + + # GEMM_QK1i (Q1 * Ki -> S1), Q1 is ready in GEMM_QK10; Ki is ready in GEMM_QK0i + # 1. gemm + inner_num_kphases = cute.size(tSrQ1, mode=[2]) + for kphase_idx in range(inner_num_kphases): + kphase_coord = (None, None, kphase_idx) + qk_tiled_mma.set(tcgen05.Field.ACCUMULATE, kphase_idx != 0) + cute.gemm( + qk_tiled_mma, + tStS1, + tSrQ1[kphase_coord], + tSrKi[kphase_coord], + tStS1, + ) + mma_s1_pipeline.producer_commit(mma_s1_producer_state) + mma_s1_producer_state.advance() + # 2. release Ki + load_kv_pipeline.consumer_release(mma_kv_release_state) + mma_kv_release_state.advance() + # End of GEMM_QK1i (Q1 * Ki -> S1) + + # GEMM_PV0i (P0 * Vi -> O0_partial) + # 1. wait for Vi + load_kv_pipeline.consumer_wait(mma_kv_consumer_state) + tOrVi = tOrV[None, None, None, mma_kv_consumer_state.index] + mma_kv_consumer_state.advance() + # 2. acquire corrected O0_partial + mma_corr_pipeline.producer_acquire(mma_corr_producer_state) + # 3. acquire P0 + mma_s0_pipeline.producer_acquire(mma_s0_producer_state) + # 4. gemm + inner_num_kphases = cute.size(tOrP0, mode=[2]) + for kphase_idx in range(inner_num_kphases): + kphase_coord = (None, None, kphase_idx) + pv_tiled_mma.set(tcgen05.Field.ACCUMULATE, True) + cute.gemm( + pv_tiled_mma, + tOtO0, + tOrP0[kphase_coord], + tOrVi[kphase_coord], + tOtO0, + ) + # 5. 
release accumulated O0_partial + mma_corr_pipeline.producer_commit(mma_corr_producer_state) + mma_corr_producer_state.advance() + # End of GEMM_PV0i (P0 * Vi -> O0_partial) + # End of seqlen_kv loop + + # release Q0 & Q1 + load_q_pipeline.consumer_release(mma_q_release_state) + mma_q_release_state.advance() + load_q_pipeline.consumer_release(mma_q_release_state) + mma_q_release_state.advance() + + # GEMM_PV1(i_end) (P1 * Vi_end -> O1) + # 1. acquire corrected O1_partial + mma_corr_pipeline.producer_acquire(mma_corr_producer_state) + # 2. acquire P1 + mma_s1_pipeline.producer_acquire(mma_s1_producer_state) + # 3. gemm + num_kphases = cute.size(tOrP1, mode=[2]) + for kphase_idx in range(num_kphases): + kphase_coord = (None, None, kphase_idx) + pv_tiled_mma.set(tcgen05.Field.ACCUMULATE, True) + cute.gemm( + pv_tiled_mma, + tOtO1, + tOrP1[kphase_coord], + tOrVi[kphase_coord], + tOtO1, + ) + # 4. commit accumulated O1 + mma_corr_pipeline.producer_commit(mma_corr_producer_state) + mma_corr_producer_state.advance() + # 5. 
release Vi_end + load_kv_pipeline.consumer_release(mma_kv_release_state) + mma_kv_release_state.advance() + # End of GEMM_PV1(i_end) (P1 * Vi_end -> O1) + + # Commit S0 and S1 + mma_s0_pipeline.producer_commit(mma_s0_producer_state) + mma_s0_producer_state.advance() + mma_s1_pipeline.producer_commit(mma_s1_producer_state) + mma_s1_producer_state.advance() + + # Advance to next tile + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + # End of persistent scheduler loop + + # dealloc tmem buffer + cute.arch.mbarrier_wait(tmem_dealloc_mbar_ptr, 0) + tmem_alloc_cols = cutlass.Int32(self.tmem_alloc_cols) + # Retrieving tmem ptr and make acc + tmem_ptr = cute.arch.retrieve_tmem_ptr( + cutlass.Float32, + alignment=16, + ptr_to_buffer_holding_addr=storage.tmem_holding_buf, + ) + + cute.arch.dealloc_tmem(tmem_ptr, tmem_alloc_cols) + + # /////////////////////////////////////////////////////////////////////////////// + # Epilogue + # /////////////////////////////////////////////////////////////////////////////// + if warp_idx == self.epilogue_warp_id: + cute.arch.warpgroup_reg_dealloc(self.num_regs_other) + cute.arch.mbarrier_arrive(max_reg_setting_mbar_ptr) + + corr_epi_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.epi_stage + ) + corr_epi_release_state = corr_epi_consumer_state.clone() + + tile_sched = create_fmha_static_tile_scheduler( + tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim() + ) + work_tile = tile_sched.initial_work_tile_info() + + while work_tile.is_valid_tile: + curr_block_coord = work_tile.tile_idx + + o0_coord = 2 * curr_block_coord[0] + o1_coord = o0_coord + 1 + gO = gO_qdl[None, None, None, 0, curr_block_coord[2]] + tOsO, tOgO = cute.nvgpu.cpasync.tma_partition( + tma_atom_o, + 0, + cute.make_layout(1), + cute.group_modes(sO, 0, 2), + cute.group_modes(gO, 0, 2), + ) + + # O0 O1 using the same pipeline + # wait from corr, issue tma store on smem + # O0 + # 1. 
wait for O0 final + corr_epi_pipeline.consumer_wait(corr_epi_consumer_state) + corr_epi_consumer_state.advance() + # 2. copy O0 to gmem + cute.copy(tma_atom_o, tOsO[None, 0], tOgO[None, o0_coord]) + cute.arch.cp_async_bulk_commit_group() + # O1 + # 1. wait for O1 final + corr_epi_pipeline.consumer_wait(corr_epi_consumer_state) + corr_epi_consumer_state.advance() + # 2. copy O1 to gmem + cute.copy(tma_atom_o, tOsO[None, 1], tOgO[None, o1_coord]) + cute.arch.cp_async_bulk_commit_group() + + # Ensure O0 buffer is ready to be released + cute.arch.cp_async_bulk_wait_group(1, read=True) + corr_epi_pipeline.consumer_release(corr_epi_release_state) + corr_epi_release_state.advance() + # Ensure O1 buffer is ready to be released + cute.arch.cp_async_bulk_wait_group(0, read=True) + corr_epi_pipeline.consumer_release(corr_epi_release_state) + corr_epi_release_state.advance() + + # Advance to next tile + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + # End of persistent scheduler loop + + # /////////////////////////////////////////////////////////////////////////////// + # Softmax0 + # /////////////////////////////////////////////////////////////////////////////// + if warp_idx < self.softmax1_warp_ids[0]: + # increase register after decreasing + cute.arch.mbarrier_wait(max_reg_setting_mbar_ptr, 0) + cute.arch.warpgroup_reg_alloc(self.num_regs_softmax) + + self.softmax( + stage=0, + scale_softmax_log2=scale_softmax_log2, + qk_thr_mma=qk_thr_mma, + tStS=tStS, + tStSi=tStS0, + mma_si_pipeline=mma_s0_pipeline, + si_corr_pipeline=s0_corr_pipeline, + s0_s1_sequence_pipeline=s0_s1_sequence_pipeline, + tile_sched_params=tile_sched_params, + fused_mask=fused_mask, + ) + cute.arch.mbarrier_arrive(tmem_dealloc_mbar_ptr) + + # /////////////////////////////////////////////////////////////////////////////// + # Softmax1 + # /////////////////////////////////////////////////////////////////////////////// + if ( + warp_idx < self.correction_warp_ids[0] + and 
warp_idx >= self.softmax1_warp_ids[0] + ): + # increase register after decreasing + cute.arch.mbarrier_wait(max_reg_setting_mbar_ptr, 0) + cute.arch.warpgroup_reg_alloc(self.num_regs_softmax) + + self.softmax( + stage=1, + scale_softmax_log2=scale_softmax_log2, + qk_thr_mma=qk_thr_mma, + tStS=tStS, + tStSi=tStS1, + mma_si_pipeline=mma_s1_pipeline, + si_corr_pipeline=s1_corr_pipeline, + s0_s1_sequence_pipeline=s0_s1_sequence_pipeline, + tile_sched_params=tile_sched_params, + fused_mask=fused_mask, + ) + cute.arch.mbarrier_arrive(tmem_dealloc_mbar_ptr) + + # /////////////////////////////////////////////////////////////////////////////// + # Correction + # /////////////////////////////////////////////////////////////////////////////// + if warp_idx >= self.correction_warp_ids[0] and warp_idx < self.mma_warp_id: + cute.arch.warpgroup_reg_dealloc(self.num_regs_correction) + cute.arch.mbarrier_arrive(max_reg_setting_mbar_ptr) + + s0_corr_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.softmax_corr_stage + ) + s1_corr_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.softmax_corr_stage + ) + o_corr_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.mma_corr_stage + ) + corr_epi_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.epi_stage + ) + + cS = cute.make_identity_tensor((self.qk_mma_tiler[0], self.qk_mma_tiler[1])) + tScS = qk_thr_mma.partition_C(cS) + + tStS_vec_layout = cute.composition(tStS.layout, cute.make_layout((128, 2))) + + tStS_vec0 = cute.make_tensor( + tStS.iterator + self.tmem_vec0_offset, tStS_vec_layout + ) + tStS_vec1 = cute.make_tensor( + tStS.iterator + self.tmem_vec1_offset, tStS_vec_layout + ) + + tScS_vec_layout = cute.composition(tScS.layout, cute.make_layout((128, 2))) + tScS_vec = cute.make_tensor(tScS.iterator, tScS_vec_layout) + + tmem_load_v_atom = cute.make_copy_atom( + 
tcgen05.copy.Ld32x32bOp(tcgen05.copy.Repetition(2)), + self.qk_acc_dtype, + ) + + tiled_tmem_load_vec = tcgen05.make_tmem_copy(tmem_load_v_atom, tStS_vec0) + thread_idx = tidx % (self.threads_per_warp * len(self.correction_warp_ids)) + thr_tmem_load_vec = tiled_tmem_load_vec.get_slice(thread_idx) + + tTMEM_LOAD_VECtS0 = thr_tmem_load_vec.partition_S(tStS_vec0) + tTMEM_LOAD_VECtS1 = thr_tmem_load_vec.partition_S(tStS_vec1) + tTMEM_LOAD_VECcS = thr_tmem_load_vec.partition_D(tScS_vec) + + tile_sched = create_fmha_static_tile_scheduler( + tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim() + ) + work_tile = tile_sched.initial_work_tile_info() + + while work_tile.is_valid_tile: + curr_block_coord = work_tile.tile_idx + + # Ignore first signal from softmax as no correction is required + s0_corr_pipeline.consumer_wait(s0_corr_consumer_state) + s0_corr_pipeline.consumer_release(s0_corr_consumer_state) + s0_corr_consumer_state.advance() + + s1_corr_pipeline.consumer_wait(s1_corr_consumer_state) + + seqlen_kv_loop_steps = ( + fused_mask.get_trip_count(curr_block_coord, self.cta_tiler) - 1 + ) + for i in cutlass.range_dynamic(0, seqlen_kv_loop_steps, 1, unroll=1): + # wait for S0 + s0_corr_pipeline.consumer_wait(s0_corr_consumer_state) + tTMEM_LOAD_VECrS = cute.make_fragment( + tTMEM_LOAD_VECcS.shape, self.qk_acc_dtype + ) + # read row_wise new global max + cute.copy(tiled_tmem_load_vec, tTMEM_LOAD_VECtS0, tTMEM_LOAD_VECrS) + + scale_ = scale_softmax_log2 * ( + tTMEM_LOAD_VECrS[0] - tTMEM_LOAD_VECrS[1] + ) + scale = cute.arch.exp2(scale_) + + mma_corr_pipeline.consumer_wait(o_corr_consumer_state) + self.correction_rescale(pv_thr_mma, tOtO0, scale) + + s1_corr_pipeline.consumer_release(s1_corr_consumer_state) + s1_corr_consumer_state.advance() + + cute.arch.fence_view_async_tmem_store() + + mma_corr_pipeline.consumer_release(o_corr_consumer_state) + o_corr_consumer_state.advance() + + s1_corr_pipeline.consumer_wait(s1_corr_consumer_state) + + 
cute.copy(tiled_tmem_load_vec, tTMEM_LOAD_VECtS1, tTMEM_LOAD_VECrS) + + scale_ = scale_softmax_log2 * ( + tTMEM_LOAD_VECrS[0] - tTMEM_LOAD_VECrS[1] + ) + scale = cute.arch.exp2(scale_) + + mma_corr_pipeline.consumer_wait(o_corr_consumer_state) + self.correction_rescale(pv_thr_mma, tOtO1, scale) + + s0_corr_pipeline.consumer_release(s0_corr_consumer_state) + s0_corr_consumer_state.advance() + + cute.arch.fence_view_async_tmem_store() + mma_corr_pipeline.consumer_release(o_corr_consumer_state) + o_corr_consumer_state.advance() + # End of seqlen_corr_loop_steps + + s1_corr_pipeline.consumer_release(s1_corr_consumer_state) + s1_corr_consumer_state.advance() + + s0_corr_pipeline.consumer_wait(s0_corr_consumer_state) + + tTMEM_LOAD_VECrS = cute.make_fragment( + tTMEM_LOAD_VECcS.shape, self.qk_acc_dtype + ) + cute.copy(tiled_tmem_load_vec, tTMEM_LOAD_VECtS0, tTMEM_LOAD_VECrS) + cute.arch.fence_view_async_tmem_load() + + s0_corr_pipeline.consumer_release(s0_corr_consumer_state) + s0_corr_consumer_state.advance() + + mma_corr_pipeline.consumer_wait(o_corr_consumer_state) + corr_epi_pipeline.producer_acquire(corr_epi_producer_state) + + self.correction_epilog( + pv_thr_mma, + tOtO0, + scale_output / tTMEM_LOAD_VECrS[0], + sO[None, None, 0], + ) + + mma_corr_pipeline.consumer_release(o_corr_consumer_state) + o_corr_consumer_state.advance() + + corr_epi_pipeline.producer_commit(corr_epi_producer_state) + corr_epi_producer_state.advance() + + s1_corr_pipeline.consumer_wait(s1_corr_consumer_state) + # load from V1 + cute.copy(tiled_tmem_load_vec, tTMEM_LOAD_VECtS1, tTMEM_LOAD_VECrS) + cute.arch.fence_view_async_tmem_load() + + s1_corr_pipeline.consumer_release(s1_corr_consumer_state) + s1_corr_consumer_state.advance() + + mma_corr_pipeline.consumer_wait(o_corr_consumer_state) + + corr_epi_pipeline.producer_acquire(corr_epi_producer_state) + self.correction_epilog( + pv_thr_mma, + tOtO1, + scale_output / tTMEM_LOAD_VECrS[0], + sO[None, None, 1], + ) + 
mma_corr_pipeline.consumer_release(o_corr_consumer_state) + o_corr_consumer_state.advance() + + corr_epi_pipeline.producer_commit(corr_epi_producer_state) + corr_epi_producer_state.advance() + + # Advance to next tile + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + # End of persistent scheduler loop + + cute.arch.mbarrier_arrive(tmem_dealloc_mbar_ptr) + + return + + @cute.jit + def softmax_step( + self, + stage: int, + need_apply_mask: bool, + row_max: cutlass.Float32, + row_sum: cutlass.Float32, + mma_si_consumer_state: utils.PipelineState, + si_corr_producer_state: utils.PipelineState, + s0_s1_sequence_state: utils.PipelineState, + mma_si_pipeline: utils.PipelineAsync, + si_corr_pipeline: utils.PipelineAsync, + s0_s1_sequence_pipeline: utils.PipelineAsync, + scale_softmax_log2: cutlass.Float32, + cS: cute.Tensor, + qk_thr_mma: cute.core.ThrMma, + tiled_tmem_load: cute.TiledCopy, + tiled_tmem_store: cute.TiledCopy, + tiled_tmem_store_vec: cute.TiledCopy, + thr_tmem_load: cute.CopyAtom, + thr_tmem_store: cute.CopyAtom, + thr_tmem_store_vec: cute.CopyAtom, + tTMEM_LOADtS: cute.Tensor, + tTMEM_STORE_VECtS: cute.Tensor, + tTMEM_STOREtS_x4: cute.Tensor, + fused_mask: cute.Tensor, + ) -> Tuple[ + cutlass.Float32, + cutlass.Float32, + utils.PipelineState, + utils.PipelineState, + utils.PipelineState, + ]: + """Perform a single step of the softmax computation on a block of attention scores. + + This method processes one block of the attention matrix, computing numerically stable + softmax by first finding the row maximum, subtracting it from all elements, applying + exponential function, and then normalizing by the sum of exponentials. It also handles + optional masking of attention scores. + + The method involves several key operations: + 1. Loading attention scores from tensor memory + 2. Applying optional masking based on position + 3. Computing row-wise maximum values for numerical stability + 4. 
Transforming scores using exp2(x*scale - max*scale) + 5. Computing row sums for normalization + 6. Coordinating pipeline synchronization between different processing stages + + :param stage: Processing stage (0 for first half, 1 for second half) + :type stage: int + :param need_apply_mask: Whether to apply attention masking + :type need_apply_mask: bool + :param row_max: Current maximum value for the row + :type row_max: cute.core.Tensor + :param row_sum: Current sum value for the row + :type row_sum: cute.core.Tensor + :param mma_si_consumer_state: Pipeline state for MMA consumer operations + :type mma_si_consumer_state: utils.PipelineState + :param si_corr_producer_state: Pipeline state for correction producer operations + :type si_corr_producer_state: utils.PipelineState + :param s0_s1_sequence_state: Pipeline state for sequence synchronization + :type s0_s1_sequence_state: utils.PipelineState + :param mma_si_pipeline: Pipeline for MMA operations + :type mma_si_pipeline: utils.PipelineAsync + :param si_corr_pipeline: Pipeline for correction operations + :type si_corr_pipeline: utils.PipelineAsync + :param s0_s1_sequence_pipeline: Pipeline for sequence synchronization + :type s0_s1_sequence_pipeline: utils.PipelineAsync + :param scale_softmax_log2: Log2 scale factor for softmax computation + :type scale_softmax_log2: cutlass.Float32 + :param cS: Current slice of attention matrix + :type cS: cute.Tensor + :param qk_thr_mma: Thread MMA operation + :type qk_thr_mma: cute.core.ThrMma + :param tiled_tmem_load: Tiled copy operation for loading from tensor memory + :type tiled_tmem_load: cute.TiledCopy + :param tiled_tmem_store: Tiled copy operation for storing to tensor memory + :type tiled_tmem_store: cute.TiledCopy + :param tiled_tmem_store_vec: Tiled copy operation for storing vector data + :type tiled_tmem_store_vec: cute.TiledCopy + :param thr_tmem_load: Thread copy operation for loading + :type thr_tmem_load: cute.CopyAtom + :param thr_tmem_store: Thread copy 
operation for storing + :type thr_tmem_store: cute.CopyAtom + :param thr_tmem_store_vec: Thread copy operation for storing vector data + :type thr_tmem_store_vec: cute.CopyAtom + :param tTMEM_LOADtS: Tensor for loading from tensor memory + :type tTMEM_LOADtS: cute.Tensor + :param tTMEM_STORE_VECtS: Tensor for storing vector data + :type tTMEM_STORE_VECtS: cute.Tensor + :param tTMEM_STOREtS_x4: Tensor for storing processed data + :type tTMEM_STOREtS_x4: cute.Tensor + :param fused_mask: Mask configuration for attention masking + :type fused_mask: cute.Tensor + :return: Updated state values (row_max, row_sum, and pipeline states) + :rtype: tuple + """ + tilePlikeFP32 = ( + self.qk_mma_tiler[1] // cutlass.Float32.width * self.o_dtype.width + ) + tScS = qk_thr_mma.partition_C(cS) + tScS_vec_layout = cute.composition(tScS.layout, cute.make_layout((128, 2))) + tScS_vec = cute.make_tensor(tScS.iterator, tScS_vec_layout) + + tScS_P_layout = cute.composition( + tScS.layout, cute.make_layout((128, tilePlikeFP32)) + ) + tScS_P = cute.make_tensor(tScS.iterator, tScS_P_layout) + tTMEM_LOADcS = thr_tmem_load.partition_D(tScS) + tTMEM_STORE_VECcS = thr_tmem_store_vec.partition_S(tScS_vec) + tTMEM_STOREcS = thr_tmem_store.partition_S(tScS_P) + + # Wait for Si + mma_si_pipeline.consumer_wait(mma_si_consumer_state) + tTMEM_LOADrS = cute.make_fragment(tTMEM_LOADcS.shape, self.qk_acc_dtype) + cute.copy(tiled_tmem_load, tTMEM_LOADtS, tTMEM_LOADrS) + + if need_apply_mask: + fused_mask.apply_mask(tTMEM_LOADrS, tTMEM_LOADcS) + + old_row_max = row_max + row_max = tTMEM_LOADrS.load().reduce(cute.ReductionOp.MAX, row_max, 0) + row_max_safe = row_max + if row_max == -cutlass.Float32.inf: + row_max_safe = 0.0 + + tTMEM_STORE_VECrS = cute.make_fragment( + tTMEM_STORE_VECcS.shape, self.qk_acc_dtype + ) + tTMEM_STORE_VECrS[0] = old_row_max + tTMEM_STORE_VECrS[1] = row_max_safe + cute.copy(tiled_tmem_store_vec, tTMEM_STORE_VECrS, tTMEM_STORE_VECtS) + cute.arch.fence_view_async_tmem_store() + # 
Notify correction wg that row_max is ready + si_corr_pipeline.producer_commit(si_corr_producer_state) + si_corr_producer_state.advance() + + tTMEM_STORErS_x4 = cute.make_fragment(tTMEM_STOREcS.shape, self.qk_acc_dtype) + tTMEM_STORErS_x4_e = cute.make_tensor( + cute.recast_ptr(tTMEM_STORErS_x4.iterator, dtype=self.q_dtype), + tTMEM_LOADrS.layout, + ) + + scale = scale_softmax_log2 + minus_row_max_scale = (0.0 - row_max_safe) * scale + + # Sequence barrier wait + if stage == 0: + s0_s1_sequence_pipeline.producer_acquire(s0_s1_sequence_state) + else: + s0_s1_sequence_pipeline.consumer_wait(s0_s1_sequence_state) + + frg_cnt = 4 + frg_tile = cute.size(tTMEM_LOADrS) // frg_cnt + tTMEM_LOADrS_frg = cute.logical_divide(tTMEM_LOADrS, cute.make_layout(frg_tile)) + tTMEM_STORErS_x4_e_frg = cute.logical_divide( + tTMEM_STORErS_x4_e, cute.make_layout(frg_tile) + ) + for j in range(frg_cnt): + for k in range(0, cute.size(tTMEM_LOADrS_frg, mode=[0]), 2): + tTMEM_LOADrS_frg[k, j], tTMEM_LOADrS_frg[k + 1, j] = ( + cute.arch.fma_packed_f32x2( + (tTMEM_LOADrS_frg[k, j], tTMEM_LOADrS_frg[k + 1, j]), + (scale, scale), + (minus_row_max_scale, minus_row_max_scale), + ) + ) + tTMEM_LOADrS_frg[k, j] = cute.arch.exp2(tTMEM_LOADrS_frg[k, j]) + tTMEM_LOADrS_frg[k + 1, j] = cute.arch.exp2(tTMEM_LOADrS_frg[k + 1, j]) + s_vec = tTMEM_LOADrS_frg[None, j].load() + tTMEM_STORErS_x4_e_frg[None, j].store(s_vec.to(self.q_dtype)) + + # Sequence barrier arrive + if stage == 0: + s0_s1_sequence_pipeline.producer_commit(s0_s1_sequence_state) + else: + s0_s1_sequence_pipeline.consumer_release(s0_s1_sequence_state) + s0_s1_sequence_state.advance() + + cute.copy(tiled_tmem_store, tTMEM_STORErS_x4, tTMEM_STOREtS_x4) + cute.arch.fence_view_async_tmem_store() + + # Notify tensor core warp that P is ready + mma_si_pipeline.consumer_release(mma_si_consumer_state) + mma_si_consumer_state.advance() + + si_corr_pipeline.producer_acquire(si_corr_producer_state) + + acc_scale_ = scale * (old_row_max - row_max_safe) + 
acc_scale = cute.arch.exp2(acc_scale_) * 0.5 + row_sum *= acc_scale + local_row_sum_0 = (row_sum, row_sum) + local_row_sum_1 = (0.0, 0.0) + local_row_sum_2 = (0.0, 0.0) + local_row_sum_3 = (0.0, 0.0) + + reduction_unroll = 4 + frg_tile = cute.size(tTMEM_LOADrS) // reduction_unroll + tTMEM_LOADrS_frg = cute.logical_divide(tTMEM_LOADrS, cute.make_layout(frg_tile)) + + for j in range(0, cute.size(tTMEM_LOADrS_frg, mode=[0]), 2): + local_row_sum_0 = cute.arch.add_packed_f32x2( + local_row_sum_0, (tTMEM_LOADrS_frg[j, 0], tTMEM_LOADrS_frg[j + 1, 0]) + ) + local_row_sum_1 = cute.arch.add_packed_f32x2( + local_row_sum_1, (tTMEM_LOADrS_frg[j, 1], tTMEM_LOADrS_frg[j + 1, 1]) + ) + local_row_sum_2 = cute.arch.add_packed_f32x2( + local_row_sum_2, (tTMEM_LOADrS_frg[j, 2], tTMEM_LOADrS_frg[j + 1, 2]) + ) + local_row_sum_3 = cute.arch.add_packed_f32x2( + local_row_sum_3, (tTMEM_LOADrS_frg[j, 3], tTMEM_LOADrS_frg[j + 1, 3]) + ) + + local_row_sum_0 = cute.arch.add_packed_f32x2(local_row_sum_0, local_row_sum_1) + local_row_sum_2 = cute.arch.add_packed_f32x2(local_row_sum_2, local_row_sum_3) + local_row_sum_0 = cute.arch.add_packed_f32x2(local_row_sum_0, local_row_sum_2) + row_sum = local_row_sum_0[0] + local_row_sum_0[1] + + return ( + row_max, + row_sum, + mma_si_consumer_state, + si_corr_producer_state, + s0_s1_sequence_state, + ) + + # for both softmax0 and softmax1 warp group + @cute.jit + def softmax( + self, + stage: int, + scale_softmax_log2: cutlass.Float32, + qk_thr_mma: cute.core.ThrMma, + tStS: cute.Tensor, + tStSi: cute.Tensor, + mma_si_pipeline: utils.PipelineAsync, + si_corr_pipeline: utils.PipelineAsync, + s0_s1_sequence_pipeline: utils.PipelineAsync, + tile_sched_params: FmhaStaticTileSchedulerParams, + fused_mask: FusedMask, + ): + """Compute softmax on attention scores from QK matrix multiplication. + + This method handles the softmax computation for either the first or second half of the + attention matrix, depending on the 'stage' parameter. 
It calculates row-wise maximum + and sum values needed for stable softmax computation, applies optional masking, and + transforms raw attention scores into probability distributions. + + The implementation uses specialized memory access patterns and efficient math operations + for computing exp(x) using exp2 functions. It also coordinates pipeline + synchronization between MMA, correction, and sequence processing stages. + + :param stage: Processing stage (0 for first half, 1 for second half of attention matrix) + :type stage: int + :param scale_softmax_log2: Log2 scale factor for softmax operation + :type scale_softmax_log2: cutlass.Float32 + :param qk_thr_mma: Thread MMA operation for QK matrix multiplication + :type qk_thr_mma: cute.core.ThrMma + :param tStS: Shared tensor for softmax input/output + :type tStS: cute.Tensor + :param tStSi: Input tensor containing attention scores + :type tStSi: cute.Tensor + :param mma_si_pipeline: Pipeline for synchronizing with MMA operations + :type mma_si_pipeline: utils.PipelineAsync + :param si_corr_pipeline: Pipeline for synchronizing with correction operations + :type si_corr_pipeline: utils.PipelineAsync + :param s0_s1_sequence_pipeline: Pipeline for synchronizing between stage 0 and 1 + :type s0_s1_sequence_pipeline: utils.PipelineAsync + :param tile_sched_params: Parameters for tile scheduling + :type tile_sched_params: FmhaStaticTileSchedulerParams + :param fused_mask: Mask configuration for attention masking + :type fused_mask: FusedMask + """ + tidx, _, _ = cute.arch.thread_idx() + thread_idx = tidx % ( + self.threads_per_warp + * ( + len(self.softmax0_warp_ids) + if stage == 0 + else len(self.softmax1_warp_ids) + ) + ) + + cS_base = cute.make_identity_tensor( + (self.qk_mma_tiler[0], self.qk_mma_tiler[1]) + ) + + tilePlikeFP32 = self.qk_mma_tiler[1] // 32 * self.o_dtype.width + + tScS = qk_thr_mma.partition_C(cS_base) + + tStS_vec_layout = cute.composition(tStS.layout, cute.make_layout((128, 2))) + tmem_vec_offset 
= self.tmem_vec0_offset if stage == 0 else self.tmem_vec1_offset + tStS_vec = cute.make_tensor(tStS.iterator + tmem_vec_offset, tStS_vec_layout) + + tScS_vec_layout = cute.composition(tScS.layout, cute.make_layout((128, 2))) + tScS_vec = cute.make_tensor(tScS.iterator, tScS_vec_layout) + + tStS_P_layout = cute.composition( + tStS.layout, cute.make_layout((128, tilePlikeFP32)) + ) + tmem_p_offset = self.tmem_p0_offset if stage == 0 else self.tmem_p1_offset + tStS_P = cute.make_tensor(tStS.iterator + tmem_p_offset, tStS_P_layout) + + tmem_load_atom = cute.make_copy_atom( + tcgen05.copy.Ld32x32bOp(tcgen05.copy.Repetition(32)), + self.qk_acc_dtype, + ) + + tiled_tmem_load = tcgen05.make_tmem_copy(tmem_load_atom, tStSi) + thread_idx = tidx % ( + self.threads_per_warp + * ( + len(self.softmax0_warp_ids) + if stage == 0 + else len(self.softmax1_warp_ids) + ) + ) + thr_tmem_load = tiled_tmem_load.get_slice(thread_idx) + tTMEM_LOADtS = thr_tmem_load.partition_S(tStSi) + + tmem_store_vec_atom = cute.make_copy_atom( + tcgen05.copy.St32x32bOp(tcgen05.copy.Repetition(2)), + self.qk_acc_dtype, + ) + tiled_tmem_store_vec = tcgen05.make_tmem_copy(tmem_store_vec_atom, tStS_vec) + thr_tmem_store_vec = tiled_tmem_store_vec.get_slice(thread_idx) + + tTMEM_STORE_VECtS = thr_tmem_store_vec.partition_D(tStS_vec) + tTMEM_STORE_VECcS = thr_tmem_store_vec.partition_S(tScS_vec) + tmem_store_atom = cute.make_copy_atom( + tcgen05.copy.St32x32bOp(tcgen05.copy.Repetition(32)), + self.qk_acc_dtype, + ) + tiled_tmem_store = tcgen05.make_tmem_copy(tmem_store_atom, tStS_P) + thr_tmem_store = tiled_tmem_store.get_slice(thread_idx) + tTMEM_STOREtS_x4 = thr_tmem_store.partition_D(tStS_P) + + mma_si_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.mma_softmax_stage + ) + si_corr_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.softmax_corr_stage + ) + s0_s1_sequence_state = utils.make_pipeline_state( + ( + 
utils.PipelineUserType.Producer + if stage == 0 + else utils.PipelineUserType.Consumer + ), + 1, + ) + + tile_sched = create_fmha_static_tile_scheduler( + tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim() + ) + work_tile = tile_sched.initial_work_tile_info() + + while work_tile.is_valid_tile: + curr_block_coord = work_tile.tile_idx + logical_offset = ( + curr_block_coord[0] * self.cta_tiler[0] + stage * self.qk_mma_tiler[0], + 0, + ) + + cS = cute.domain_offset(logical_offset, cS_base) + + si_corr_pipeline.producer_acquire(si_corr_producer_state) + unmask_count = fused_mask.get_unmasked_trip_count( + curr_block_coord, + self.cta_tiler, + ) + + row_max = -cutlass.Float32.inf + row_sum = 0.0 + + for i in cutlass.range_dynamic(0, unmask_count, 1, unroll=1): + cS_iter = cute.domain_offset((0, i * self.qk_mma_tiler[1]), cS) + ( + row_max, + row_sum, + mma_si_consumer_state, + si_corr_producer_state, + s0_s1_sequence_state, + ) = self.softmax_step( + stage, + False, + row_max, + row_sum, + mma_si_consumer_state, + si_corr_producer_state, + s0_s1_sequence_state, + mma_si_pipeline, + si_corr_pipeline, + s0_s1_sequence_pipeline, + scale_softmax_log2, + cS_iter, + qk_thr_mma, + tiled_tmem_load, + tiled_tmem_store, + tiled_tmem_store_vec, + thr_tmem_load, + thr_tmem_store, + thr_tmem_store_vec, + tTMEM_LOADtS, + tTMEM_STORE_VECtS, + tTMEM_STOREtS_x4, + fused_mask, + ) + + mask_count = fused_mask.get_masked_trip_count( + curr_block_coord, + self.cta_tiler, + ) + + for i in cutlass.range_dynamic( + unmask_count, unmask_count + mask_count, 1, unroll=1 + ): + cS_iter = cute.domain_offset((0, i * self.qk_mma_tiler[1]), cS) + ( + row_max, + row_sum, + mma_si_consumer_state, + si_corr_producer_state, + s0_s1_sequence_state, + ) = self.softmax_step( + stage, + True, + row_max, + row_sum, + mma_si_consumer_state, + si_corr_producer_state, + s0_s1_sequence_state, + mma_si_pipeline, + si_corr_pipeline, + s0_s1_sequence_pipeline, + scale_softmax_log2, + cS_iter, + 
qk_thr_mma, + tiled_tmem_load, + tiled_tmem_store, + tiled_tmem_store_vec, + thr_tmem_load, + thr_tmem_store, + thr_tmem_store_vec, + tTMEM_LOADtS, + tTMEM_STORE_VECtS, + tTMEM_STOREtS_x4, + fused_mask, + ) + + mma_si_pipeline.consumer_wait(mma_si_consumer_state) + + tTMEM_STORE_VECrS = cute.make_fragment( + tTMEM_STORE_VECcS.shape, self.qk_acc_dtype + ) + tTMEM_STORE_VECrS[0] = row_sum + tTMEM_STORE_VECrS[1] = row_max + cute.copy(tiled_tmem_store_vec, tTMEM_STORE_VECrS, tTMEM_STORE_VECtS) + cute.arch.fence_view_async_tmem_store() + + si_corr_pipeline.producer_commit(si_corr_producer_state) + si_corr_producer_state.advance() + + si_corr_pipeline.producer_acquire(si_corr_producer_state) + + # Empty step to sync against pipe s + mma_si_pipeline.consumer_release(mma_si_consumer_state) + mma_si_consumer_state.advance() + + # Advance to next tile + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + # End of persistent scheduler loop + + @cute.jit + def correction_rescale( + self, + thr_mma: cute.core.ThrMma, + tOtO: cute.Tensor, + scale: cutlass.Float32, + ): + """Rescale intermediate attention results based on softmax normalization factor. + + This method performs a crucial correction step in the attention computation pipeline. + When processing attention in blocks, the softmax normalization factors may change + as new blocks are processed. This method rescales previously computed partial + output values to account for updated normalization factors. + + The implementation uses efficient tensor memory operations to: + 1. Load existing partial attention output from tensor memory + 2. Apply the scaling factor to all elements + 3. 
Store the rescaled results back to tensor memory + + :param thr_mma: Thread MMA operation for the computation + :type thr_mma: cute.core.ThrMma + :param tOtO: Tensor representing partial attention output to be rescaled + :type tOtO: cute.Tensor + :param scale: Scaling factor to apply to the partial results + :type scale: cutlass.Float32 + """ + pv_tiled_mma_shape = ( + self.pv_mma_tiler[0], + self.pv_mma_tiler[1], + ) + cO = cute.make_identity_tensor(pv_tiled_mma_shape) + tOcO = thr_mma.partition_C(cO) + + corr_tile_size = 16 # tuneable parameter + tmem_load_atom = cute.make_copy_atom( + tcgen05.copy.Ld32x32bOp(tcgen05.copy.Repetition(corr_tile_size)), + self.pv_acc_dtype, + ) + tmem_store_atom = cute.make_copy_atom( + tcgen05.copy.St32x32bOp(tcgen05.copy.Repetition(corr_tile_size)), + self.pv_acc_dtype, + ) + + tOtO_i_layout = cute.composition( + tOtO.layout, cute.make_layout((128, corr_tile_size)) + ) + tOcO_i_layout = cute.composition( + tOcO.layout, cute.make_layout((128, corr_tile_size)) + ) + + tOtO_i = cute.make_tensor(tOtO.iterator, tOtO_i_layout) + tOcO_i = cute.make_tensor(tOcO.iterator, tOcO_i_layout) + + tiled_tmem_load = tcgen05.make_tmem_copy(tmem_load_atom, tOtO_i) + tiled_tmem_store = tcgen05.make_tmem_copy(tmem_store_atom, tOtO_i) + tidx, _, _ = cute.arch.thread_idx() + thread_idx = tidx % (self.threads_per_warp * len(self.correction_warp_ids)) + thr_tmem_load = tiled_tmem_load.get_slice(thread_idx) + thr_tmem_store = tiled_tmem_store.get_slice(thread_idx) + + tTMEM_LOADtO = thr_tmem_load.partition_S(tOtO_i) + tTMEM_LOADcO = thr_tmem_load.partition_D(tOcO_i) + + tTMEM_STOREtO = thr_tmem_store.partition_D(tOtO_i) + + tTMrO = cute.make_fragment( + (tTMEM_LOADcO.shape, 128 // corr_tile_size), self.pv_acc_dtype + ) + for i in range(self.cta_tiler[2] // corr_tile_size): + tTMrO_i_ = tTMrO[None, i] + tTMrO_i_layout = cute.composition( + tTMrO_i_.layout, cute.make_layout(tTMrO.shape[0]) + ) + tTMrO_i = cute.make_tensor(tTMrO_i_.iterator, tTMrO_i_layout) + 
tTMEM_LOADtO_i = cute.make_tensor( + tTMEM_LOADtO.iterator + i * corr_tile_size, tTMEM_LOADtO.layout + ) + tTMEM_STOREtO_i = cute.make_tensor( + tTMEM_STOREtO.iterator + i * corr_tile_size, tTMEM_STOREtO.layout + ) + + cute.copy(tiled_tmem_load, tTMEM_LOADtO_i, tTMrO_i) + for j in range(0, cute.size(tTMrO_i), 2): + tTMrO_i[j], tTMrO_i[j + 1] = cute.arch.mul_packed_f32x2( + (tTMrO_i[j], tTMrO_i[j + 1]), + (scale, scale), + ) + cute.copy(tiled_tmem_store, tTMrO_i, tTMEM_STOREtO_i) + + @cute.jit + def correction_epilog( + self, + thr_mma: cute.core.ThrMma, + tOtO: cute.Tensor, + scale: cutlass.Float32, + sO: cute.Tensor, + ): + """Apply final scaling and transformation to attention output before writing to global memory. + + This correction_epilog function handles the final processing step for attention output values. + It applies a scaling factor to the accumulated attention results and prepares the + data for efficient transfer back to global memory. + + The method performs: + 1. Loading of accumulated attention results from tensor memory + 2. Application of the final output scaling factor + 3. Type conversion if necessary (typically from higher precision accumulator to output precision) + 4. Reorganization of data for optimal memory access patterns + 5. 
Preparation for efficient TMA store operations + + :param thr_mma: Thread MMA operation for the computation + :type thr_mma: cute.core.ThrMma + :param tOtO: Tensor containing accumulated attention output + :type tOtO: cute.Tensor + :param scale: Final scaling factor to apply to the output + :type scale: cutlass.Float32 + :param sO: Shared memory tensor for the final output + :type sO: cute.Tensor + """ + + pv_tiled_mma_shape = ( + self.pv_mma_tiler[0], + self.pv_mma_tiler[1], + ) + cO = cute.make_identity_tensor(pv_tiled_mma_shape) + + corr_tile_size = 32 * 8 // self.o_dtype.width + tOsO = thr_mma.partition_C(sO) + tOcO = thr_mma.partition_C(cO) + + tOtO_i = cute.logical_divide(tOtO, cute.make_layout((128, corr_tile_size))) + tOcO_i = cute.logical_divide(tOcO, cute.make_layout((128, corr_tile_size))) + tOsO_i = cute.logical_divide(tOsO, cute.make_layout((128, corr_tile_size))) + tidx, _, _ = cute.arch.thread_idx() + thread_idx = tidx % (self.threads_per_warp * len(self.correction_warp_ids)) + + epi_subtile = (self.epi_tile[0], corr_tile_size) + tmem_copy_atom = sm100_utils.get_tmem_load_op( + self.pv_mma_tiler, + self.o_layout, + self.o_dtype, + self.pv_acc_dtype, + epi_subtile, + use_2cta_instrs=False, + ) + + tiled_tmem_load = tcgen05.make_tmem_copy( + tmem_copy_atom, tOtO_i[(None, None), 0] + ) + + thr_tmem_load = tiled_tmem_load.get_slice(thread_idx) + smem_copy_atom = sm100_utils.get_smem_store_op( + self.o_layout, self.o_dtype, self.pv_acc_dtype, tiled_tmem_load + ) + tiled_smem_store = cute.make_tiled_copy( + smem_copy_atom, + layout_tv=tiled_tmem_load.layout_dst_tv_tiled, + tiler_mn=tiled_tmem_load.tiler_mn, + ) + + tTMEM_LOADtO = thr_tmem_load.partition_S(tOtO_i[(None, None), None]) + tTMEM_LOADsO = thr_tmem_load.partition_D(tOsO_i[(None, None), None]) + tTMEM_LOADoO = thr_tmem_load.partition_D(tOcO_i[(None, None), None]) + + for i in range(self.cta_tiler[2] // corr_tile_size): + tTMEM_LOADtO_i = tTMEM_LOADtO[None, 0, 0, i] + tTMEM_LOADsO_i = 
tTMEM_LOADsO[None, 0, 0, i] + tTMrO = cute.make_fragment( + tTMEM_LOADoO[None, 0, 0, i].shape, self.pv_acc_dtype + ) + cute.copy(tiled_tmem_load, tTMEM_LOADtO_i, tTMrO) + for j in range(0, cute.size(tTMrO), 2): + tTMrO[j], tTMrO[j + 1] = cute.arch.mul_packed_f32x2( + (tTMrO[j], tTMrO[j + 1]), + (scale, scale), + ) + tSMrO = cute.make_fragment(tTMrO.shape, self.o_dtype) + o_vec = tTMrO.load() + tSMrO.store(o_vec.to(self.o_dtype)) + cute.copy(tiled_smem_store, tSMrO, tTMEM_LOADsO_i) + + # fence view async shared + cute.arch.fence_proxy( + cute.arch.ProxyKind.async_shared, + space=cute.arch.SharedSpace.shared_cta, + ) + + def make_and_init_load_q_pipeline(self, load_q_mbar_ptr): + load_q_producer_group = utils.CooperativeGroup( + utils.Agent.Thread, len([self.load_warp_id]) + ) + load_q_consumer_group = utils.CooperativeGroup( + utils.Agent.Thread, len([self.mma_warp_id]) + ) + return utils.PipelineTmaUmma.create( + barrier_storage=load_q_mbar_ptr, + num_stages=self.q_stage, + producer_group=load_q_producer_group, + consumer_group=load_q_consumer_group, + tx_count=self.tma_copy_q_bytes, + ) + + def make_and_init_load_kv_pipeline(self, load_kv_mbar_ptr): + load_kv_producer_group = utils.CooperativeGroup( + utils.Agent.Thread, len([self.load_warp_id]) + ) + load_kv_consumer_group = utils.CooperativeGroup( + utils.Agent.Thread, len([self.mma_warp_id]) + ) + return utils.PipelineTmaUmma.create( + barrier_storage=load_kv_mbar_ptr, + num_stages=self.kv_stage, + producer_group=load_kv_producer_group, + consumer_group=load_kv_consumer_group, + tx_count=self.tma_copy_kv_bytes, + ) + + def make_and_init_mma_si_pipeline(self, mma_si_mbar_ptr): + mma_si_producer_group = utils.CooperativeGroup( + utils.Agent.Thread, len([self.mma_warp_id]) + ) + mma_si_consumer_group = utils.CooperativeGroup( + utils.Agent.Thread, + self.threads_per_warp * len(self.softmax0_warp_ids), + self.threads_per_warp * len(self.softmax0_warp_ids), + ) + return utils.PipelineUmmaAsync.create( + 
barrier_storage=mma_si_mbar_ptr, + num_stages=self.mma_softmax_stage, + producer_group=mma_si_producer_group, + consumer_group=mma_si_consumer_group, + ) + + def make_and_init_si_corr_pipeline(self, si_corr_mbar_ptr): + si_corr_producer_group = utils.CooperativeGroup( + utils.Agent.Thread, + self.threads_per_warp * len(self.softmax0_warp_ids), + self.threads_per_warp * len(self.softmax0_warp_ids), + ) + si_corr_consumer_group = utils.CooperativeGroup( + utils.Agent.Thread, + self.threads_per_warp * len(self.correction_warp_ids), + self.threads_per_warp * len(self.correction_warp_ids), + ) + return utils.PipelineAsync.create( + barrier_storage=si_corr_mbar_ptr, + num_stages=self.softmax_corr_stage, + producer_group=si_corr_producer_group, + consumer_group=si_corr_consumer_group, + ) + + def make_and_init_corr_epi_pipeline(self, corr_epi_mbar_ptr): + corr_epi_producer_group = utils.CooperativeGroup( + utils.Agent.Thread, + self.threads_per_warp * len(self.correction_warp_ids), + self.threads_per_warp * len(self.correction_warp_ids), + ) + corr_epi_consumer_group = utils.CooperativeGroup( + utils.Agent.Thread, + self.threads_per_warp * len([self.epilogue_warp_id]), + self.threads_per_warp * len([self.epilogue_warp_id]), + ) + return utils.PipelineAsync.create( + barrier_storage=corr_epi_mbar_ptr, + num_stages=self.epi_stage, + producer_group=corr_epi_producer_group, + consumer_group=corr_epi_consumer_group, + ) + + def make_and_init_mma_corr_pipeline(self, mma_corr_mbar_ptr): + mma_corr_producer_group = utils.CooperativeGroup( + utils.Agent.Thread, len([self.mma_warp_id]) + ) + mma_corr_consumer_group = utils.CooperativeGroup( + utils.Agent.Thread, + self.threads_per_warp * len(self.correction_warp_ids), + self.threads_per_warp * len(self.correction_warp_ids), + ) + return utils.PipelineUmmaAsync.create( + barrier_storage=mma_corr_mbar_ptr, + num_stages=self.mma_corr_stage, + producer_group=mma_corr_producer_group, + consumer_group=mma_corr_consumer_group, + ) + + def 
make_and_init_si_sequence_pipeline(self, si_sequence_mbar_ptr): + s0_sequence_group = utils.CooperativeGroup( + utils.Agent.Thread, + self.threads_per_warp * len(self.softmax0_warp_ids), + self.threads_per_warp * len(self.softmax0_warp_ids), + ) + s1_sequence_group = utils.CooperativeGroup( + utils.Agent.Thread, + self.threads_per_warp * len(self.softmax1_warp_ids), + self.threads_per_warp * len(self.softmax1_warp_ids), + ) + return utils.PipelineAsync.create( + barrier_storage=si_sequence_mbar_ptr, + num_stages=1, + producer_group=s0_sequence_group, + consumer_group=s1_sequence_group, + ) + + @staticmethod + def _compute_grid( + o: cute.Tensor, + cta_tiler: Tuple[int, int, int], + is_persistent: bool, + ) -> Tuple[FmhaStaticTileSchedulerParams, Tuple[int, int, int]]: + o_shape = o.shape + tile_sched_params = create_fmha_static_tile_scheduler_params( + is_persistent, + ( + cute.ceil_div(cute.size(o_shape[0]), cta_tiler[0]), + cute.size(o_shape[2][0]), + cute.size(o_shape[2][1]), + ), + ) + grid = FmhaStaticTileScheduler.get_grid_shape(tile_sched_params) + + return tile_sched_params, grid + + +def run_fmha_and_verify( + q_shape: Tuple[int, int, int, int], + k_shape: Tuple[int, int, int, int], + in_dtype: Type[cutlass.Numeric], + out_dtype: Type[cutlass.Numeric], + qk_acc_dtype: Type[cutlass.Numeric], + pv_acc_dtype: Type[cutlass.Numeric], + mma_tiler_mn: Tuple[int, int], + is_persistent: bool, + has_casual_mask: bool, + scale_q: float, + scale_k: float, + scale_v: float, + inv_scale_o: float, + scale_softmax: float, + tolerance: float, + warmup_iterations: int, + iterations: int, + skip_ref_check: bool, +): + """Execute Fused Multi-Head Attention (FMHA) on Blackwell architecture and validate results. + + This function creates random input tensors for query, key, and value, then performs the + complete FMHA computation pipeline. It supports configurable data types, tiling parameters, + and various attention masking options. 
Results can be validated against a PyTorch reference + implementation or run multiple times for performance measurement. + + The implementation leverages specialized tensor memory operations and efficient math + operations optimized for Blackwell architecture, including pipelined computation stages + for maximum throughput. + + :param q_shape: Query tensor shape (B, S_q, H, D) where B=batch size, S_q=query sequence length, + H=number of heads, D=head dimension + :type q_shape: Tuple[int, int, int, int] + :param k_shape: Key tensor shape (B, S_k, H_k, D) where B=batch size, S_k=key sequence length, + H_k=number of key heads (H must be divisible by H_k), D=head dimension + :type k_shape: Tuple[int, int, int, int] + :param in_dtype: Input data type for query, key and value tensors + :type in_dtype: Type[cutlass.Numeric] + :param out_dtype: Output data type for attention output + :type out_dtype: Type[cutlass.Numeric] + :param qk_acc_dtype: Accumulator data type for query-key matrix multiplication + :type qk_acc_dtype: Type[cutlass.Numeric] + :param pv_acc_dtype: Accumulator data type for probability-value matrix multiplication + :type pv_acc_dtype: Type[cutlass.Numeric] + :param mma_tiler_mn: Matrix multiply accumulate tile shape (M, N) + :type mma_tiler_mn: Tuple[int, int] + :param is_persistent: Whether to use persistent kernel optimization + :type is_persistent: bool + :param has_casual_mask: Whether to apply causal masking + :type has_casual_mask: bool + :param scale_q: Scaling factor for query tensor + :type scale_q: float + :param scale_k: Scaling factor for key tensor + :type scale_k: float + :param scale_v: Scaling factor for value tensor + :type scale_v: float + :param inv_scale_o: Inverse scaling factor for output tensor + :type inv_scale_o: float + :param scale_softmax: Attention score scaling factor (defaults to 1/sqrt(D) if set to 0) + :type scale_softmax: float + :param tolerance: Maximum acceptable error for validation + :type tolerance: float + :param 
warmup_iterations: Number of warmup iterations + :type warmup_iterations: int + :param iterations: Number of iterations to run for performance testing + :type iterations: int + :param skip_ref_check: Skip validation against reference implementation + :type skip_ref_check: bool + + :raises ValueError: If input shapes are incompatible or head dimension is unsupported + :raises RuntimeError: If GPU is unavailable for computation + """ + + print(f"Running Blackwell SM100 FMHA test with:") + print(f" q_shape: {q_shape}") + print(f" k_shape: {k_shape}") + print(f" in_dtype: {in_dtype}") + print(f" out_dtype: {out_dtype}") + print(f" qk_acc_dtype: {qk_acc_dtype}") + print(f" pv_acc_dtype: {pv_acc_dtype}") + print(f" mma_tiler_mn: {mma_tiler_mn}") + print(f" is_persistent: {is_persistent}") + print(f" has_casual_mask: {has_casual_mask}") + print(f" scale_q: {scale_q}") + print(f" scale_k: {scale_k}") + print(f" scale_v: {scale_v}") + print(f" inv_scale_o: {inv_scale_o}") + print(f" scale_softmax: {scale_softmax}") + print(f" tolerance: {tolerance}") + + # Unpack parameters + b, s_q, h, d = q_shape + b_, s_k, h_k, d_ = k_shape + + if b != b_: + raise ValueError("q & k must have the same batch size") + + if d != d_: + raise ValueError("q & k must have the same head dimension") + + if d not in {32, 64, 128}: + raise ValueError("head dimension must be 32, 64, or 128") + + if h % h_k != 0: + raise ValueError("h must be divisible by h_k") + + if in_dtype not in {cutlass.Float8E4M3FN, cutlass.Float16}: + raise ValueError("in_dtype must be Float8E4M3FN or Float16") + + if out_dtype not in {cutlass.Float8E4M3FN, cutlass.Float16}: + raise ValueError("out_dtype must be Float8E4M3FN or Float16") + + if qk_acc_dtype not in {cutlass.Float32}: + raise ValueError("qk_acc_dtype must be Float32") + + if pv_acc_dtype not in {cutlass.Float32}: + raise ValueError("pv_acc_dtype must be Float32") + + if iterations < 1: + raise ValueError("iterations must be at least 1") + + h_r = h // h_k + + # 
Prepare pytorch tensors: Q, K, V (random from 0 to 2) and O (all zero) + if not torch.cuda.is_available(): + raise RuntimeError("GPU is required to run this example!") + + torch.manual_seed(1111) + + def create_and_permute_tensor(b, s, h_r, h_k, d, dtype, is_dynamic_layout=True): + # (b, s, h_r, h_k, d) -> (s, d, h_r, h_k, b) + shape = (b, s, h_r, h_k, d) + permute_order = (1, 4, 2, 3, 0) + is_fp8 = dtype in {cutlass.Float8E5M2, cutlass.Float8E4M3FN} + + # torch does not support fp8 type + torch_dtype = cutlass.torch.dtype(dtype) if not is_fp8 else torch.uint8 + + # Create dtype torch tensor (cpu) + torch_tensor_cpu = cutlass_torch.create_and_permute_torch_tensor( + shape, + torch_dtype, + permute_order=permute_order, + init_type=cutlass.torch.TensorInitType.RANDOM, + init_config=cutlass.torch.RandomInitConfig( + min_val=0 if is_fp8 else -2, max_val=2 + ), + ) + # Create dtype torch tensor (gpu) + torch_tensor_gpu = torch_tensor_cpu.cuda() + + # Create f32 torch tensor (cpu) + f32_torch_tensor = torch_tensor_cpu.to(dtype=torch.float32) + + # Create dtype cute tensor (gpu) + cute_tensor = from_dlpack(torch_tensor_gpu, assumed_align=16) + cute_tensor.element_type = dtype + if is_dynamic_layout: + cute_tensor = cute_tensor.mark_layout_dynamic(leading_dim=1) + cute_tensor = cutlass_torch.convert_cute_tensor( + f32_torch_tensor, + cute_tensor, + dtype, + is_dynamic_layout=is_dynamic_layout, + ) + + return f32_torch_tensor, cute_tensor, torch_tensor_gpu + + q_ref, q_tensor, q_torch = create_and_permute_tensor( + b, s_q, h_r, h_k, d, in_dtype, is_dynamic_layout=True + ) + k_ref, k_tensor, k_torch = create_and_permute_tensor( + b, s_k, 1, h_k, d, in_dtype, is_dynamic_layout=True + ) + v_ref, v_tensor, v_torch = create_and_permute_tensor( + b, s_k, 1, h_k, d, in_dtype, is_dynamic_layout=True + ) + o_ref, o_tensor, o_torch = create_and_permute_tensor( + b, s_q, h_r, h_k, d, out_dtype, is_dynamic_layout=True + ) + + mma_tiler = (*mma_tiler_mn, d) + + mask_type = 
MaskType.NO_MASK + if has_casual_mask: + mask_type = MaskType.CAUSAL_MASK + else: + if s_k % mma_tiler_mn[1] != 0: + mask_type = MaskType.RESIDUAL_MASK + + fmha = BlackwellFusedMultiHeadAttentionForward( + qk_acc_dtype, + pv_acc_dtype, + mma_tiler, + is_persistent, + mask_type, + ) + + # Get current CUDA stream from PyTorch + torch_stream = torch.cuda.current_stream() + # Get the raw stream pointer as a CUstream + current_stream = cuda.CUstream(torch_stream.cuda_stream) + + if scale_softmax == 0.0: # default to 1/sqrt(d) + scale_softmax = 1.0 / math.sqrt(q_shape[1]) + log2_e = math.log2( + math.exp(1.0) + ) # gpu uses exp2 for perf concerns, we need an extra factor 'log2_e' here + + scale_softmax = scale_q * scale_k * scale_softmax + scale_softmax_log2 = scale_softmax * log2_e + scale_output = scale_v * inv_scale_o + + print("Compiling kernel with cute.compile ...") + start_time = time.time() + # compile fmha kernel + compiled_fmha = cute.compile( + fmha, + q_tensor, + k_tensor, + v_tensor, + o_tensor, + scale_softmax_log2, + scale_output, + current_stream, + ) + compilation_time = time.time() - start_time + print(f"Compilation time: {compilation_time:.4f} seconds") + + # Warmup + for _ in range(warmup_iterations): + compiled_fmha( + q_tensor, + k_tensor, + v_tensor, + o_tensor, + scale_softmax_log2, + scale_output, + current_stream, + ) + + # Execute kernel + for _ in range(iterations): + compiled_fmha( + q_tensor, + k_tensor, + v_tensor, + o_tensor, + scale_softmax_log2, + scale_output, + current_stream, + ) + + torch.cuda.synchronize() + + def run_torch_fmha( + q, k, v, scale_softmax=1.0, scale_output=1.0, has_casual_mask=False + ): + s_q, d, h_r, h_k, b = q.shape + s_k = k.shape[0] + + # broadcast k and v to have the same shape as q + k = k.expand(s_k, d, h_r, h_k, b) + v = v.expand(s_k, d, h_r, h_k, b) + + q_tmp = q.permute(4, 2, 3, 0, 1).contiguous().view(b, -1, s_q, d) + k_tmp = k.permute(4, 2, 3, 0, 1).contiguous().view(b, -1, s_k, d) + v_tmp = v.permute(4, 
2, 3, 0, 1).contiguous().view(b, -1, s_k, d) + + ref = F.scaled_dot_product_attention( + q_tmp, + k_tmp, + v_tmp, + attn_mask=None, + dropout_p=0.0, + scale=scale_softmax, + is_causal=has_casual_mask, + ) + ref = ref.view(b, h_r, h_k, s_q, d).permute(3, 4, 1, 2, 0) * scale_output + + return ref + + if not skip_ref_check: + print("Verifying results...") + ref = run_torch_fmha( + q_ref, k_ref, v_ref, scale_softmax, scale_output, has_casual_mask + ) + + # Copy gpu result back + gpu_o = o_torch.cpu() + + # convert ref to out_type + if out_dtype == cutlass.Float16: + ref_o = ref.to(cutlass.torch.dtype(out_dtype)) + elif out_dtype in {cutlass.Float8E4M3FN, cutlass.Float8E5M2}: + # convert ref : f32 -> fp8 -> f32 + permute_order_0 = (4, 0, 2, 3, 1) + permute_order_1 = (1, 4, 2, 3, 0) + + shape = (b, s_q, h_r, h_k, d) + + f8_torch_tensor = cutlass.torch.create_and_permute_torch_tensor( + shape, + torch.uint8, + permute_order=permute_order_1, + init_type=cutlass.torch.TensorInitType.SKIP, + ).cuda() + + # Create dtype tensor (gpu) + ref_o_tensor = from_dlpack( + f8_torch_tensor, assumed_align=16 + ).mark_layout_dynamic(leading_dim=1) + ref_o_tensor.element_type = out_dtype + ref_o_tensor = cutlass.torch.convert_cute_tensor( + # ref for torch tensor is contiguous in shape (b, h_r, h_k, s_q, d), but shape is (s, d, h_r, h_k, b) + # need to make it contiguous first then permute + ref.permute(permute_order_0).contiguous().permute(permute_order_1).cuda(), + ref_o_tensor, + out_dtype, + is_dynamic_layout=True, + ) + + ref_o = f8_torch_tensor.cpu() + + # uint8 check; the minimum difference is 1 + tolerance = 2 + else: + pass + + # Assert close results + torch.testing.assert_close(gpu_o, ref_o, atol=tolerance, rtol=1e-05) + print("Results verified successfully!") + +if __name__ == "__main__": + + def parse_comma_separated_ints(s: str) -> Tuple[int, ...]: + try: + return tuple(int(x.strip()) for x in s.split(",")) + except ValueError: + raise argparse.ArgumentTypeError( + "Invalid 
format. Expected comma-separated integers." + ) + + parser = argparse.ArgumentParser(description="Example of FMHA on Blackwell.") + + parser.add_argument( + "--in_dtype", + type=cutlass.dtype, + default=cutlass.Float16, + help="Input data type", + ) + + parser.add_argument( + "--out_dtype", + type=cutlass.dtype, + default=cutlass.Float16, + help="Output data type", + ) + + parser.add_argument( + "--qk_acc_dtype", + type=cutlass.dtype, + default=cutlass.Float32, + help="QK accumulator data type", + ) + + parser.add_argument( + "--pv_acc_dtype", + type=cutlass.dtype, + default=cutlass.Float32, + help="PV accumulator data type", + ) + + parser.add_argument( + "--mma_tiler_mn", + type=parse_comma_separated_ints, + default=(128, 128), + help="MMA tile shape (M, N)", + ) + + parser.add_argument( + "--is_persistent", + action="store_true", + help="Is persistent", + ) + + parser.add_argument( + "--has_casual_mask", + action="store_true", + help="Whether to use casual mask", + ) + + parser.add_argument( + "--q_shape", + type=parse_comma_separated_ints, + default=(1, 256, 8, 128), + help="Shape of Q (B, S_q, H, D)", + ) + + parser.add_argument( + "--k_shape", + type=parse_comma_separated_ints, + default=(1, 256, 8, 128), + help="Shape of K (B, S_k, H_k, D)", + ) + + parser.add_argument( + "--scale_q", + type=float, + default=1.0, + help="Scaling factors to dequantize Q", + ) + + parser.add_argument( + "--scale_k", + type=float, + default=1.0, + help="Scaling factors to dequantize K", + ) + + parser.add_argument( + "--scale_v", + type=float, + default=1.0, + help="Scaling factors to dequantize V", + ) + + parser.add_argument( + "--inv_scale_o", + type=float, + default=1.0, + help="Scaling factor to quantize O", + ) + + parser.add_argument( + "--scale_softmax", + type=float, + default=0.0, + help="Scaling factor to scale S (i.e. 
Q*K); if zero, defaults to 1/sqrt(D)", + ) + + parser.add_argument( + "--tolerance", type=float, default=1e-01, help="Tolerance for validation" + ) + + parser.add_argument( + "--warmup_iterations", + type=int, + default=0, + help="Number of iterations for warmup", + ) + + parser.add_argument( + "--iterations", + type=int, + default=1, + help="Number of iterations after warmup", + ) + + parser.add_argument( + "--skip_ref_check", + action="store_true", + help="Skip reference check", + ) + + args = parser.parse_args() + + if len(args.q_shape) != 4: + parser.error("--q_shape must contain exactly 4 values") + + if len(args.k_shape) != 4: + parser.error("--k_shape must contain exactly 4 values") + + if len(args.mma_tiler_mn) != 2: + parser.error("--mma_tiler_mn must contain exactly 2 values") + + run_fmha_and_verify( + args.q_shape, + args.k_shape, + args.in_dtype, + args.out_dtype, + args.qk_acc_dtype, + args.pv_acc_dtype, + args.mma_tiler_mn, + args.is_persistent, + args.has_casual_mask, + args.scale_q, + args.scale_k, + args.scale_v, + args.inv_scale_o, + args.scale_softmax, + args.tolerance, + args.warmup_iterations, + args.iterations, + args.skip_ref_check, + ) + + print("PASS") diff --git a/examples/python/CuTeDSL/blackwell/grouped_gemm.py b/examples/python/CuTeDSL/blackwell/grouped_gemm.py new file mode 100644 index 00000000..d2e6f9ab --- /dev/null +++ b/examples/python/CuTeDSL/blackwell/grouped_gemm.py @@ -0,0 +1,2287 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import functools +from typing import List, Type, Union +from inspect import isclass + +import torch +import cuda.bindings.driver as cuda + +import cutlass +import cutlass.cute as cute +import cutlass.utils as utils +from cutlass.cute.nvgpu import cpasync, tcgen05 +import cutlass.utils.blackwell_helpers as sm100_utils +import cutlass.torch as cutlass_torch +from cutlass.cute.runtime import from_dlpack + +""" +A grouped GEMM example for the NVIDIA Blackwell SM100 architecture using CUTE DSL + +This example demonstrates an implementation of grouped GEMM using a TMA plus Blackwell SM100 TensorCore +warp-specialized persistent kernel. +The grouped GEMM workload computes a batch of GEMM operations with distinct problem sizes. 
Pointers to matrices +in global memory are passed to the kernel in an array (also held in global memory). Similarly, problem shapes and +strides are also stored in arrays in GMEM. + +This differs from "Batched Array" GEMM since the size of each GEMM problem in the grouped GEMM concept may be distinct. + +To run this example: + +.. code-block:: bash + + python examples/blackwell/grouped_gemm.py \ + --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32 \ + --mma_tiler_mn 128,64 --cluster_shape_mn 1,1 \ + --problem_sizes_mnkl "(8192,1280,32,1),(16,384,1536,1),(640,1280,16,1),(640,160,16,1)" \ + --num_groups 4 --tensormap_update_mode SMEM + +The above example command makes 4 groups of different m, n, k sizes. The Blackwell tcgen05 MMA tile shape +is specified as (128, 64) and the cluster shape is (1,1). The input, mma accumulator and output data type +are set as fp16, fp32 and fp16, respectively. + +To collect performance with NCU profiler: + +.. code-block:: bash + + ncu python examples/blackwell/grouped_gemm.py \ + --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32 \ + --mma_tiler_mn 128,64 --cluster_shape_mn 1,1 \ + --problem_sizes_mnkl "(8192,1280,32,1),(16,384,1536,1),(640,1280,16,1),(640,160,16,1)" \ + --num_groups 4 --tensormap_update_mode SMEM \ + --warmup_iterations 1 --iterations 10 --skip_ref_check + +There are some constrains for this example. Besides the constrains from the Balckwell dense GEMM persistent example, +there are also the following constrains: +* Only fp16 and bf16 data types are supported as inputs. +* Output data types could be fp16, bf16 or fp32. +* The contiguous dimension of each tensor must be at least 16 bytes aligned. +* The l mode(aka, batch size) for each group must be 1. +* The majorness for A, B and C must be the same across all groups. 
+""" + + +class GroupedGemmKernel: + + def __init__( + self, + acc_dtype: type[cutlass.Numeric], + use_2cta_instrs: bool, + mma_tiler_mn: tuple[int, int], + cluster_shape_mn: tuple[int, int], + tensormap_update_mode: utils.TensorMapUpdateMode = utils.TensorMapUpdateMode.SMEM, + ): + """Initializes the configuration for a Blackwell grouped GEMM kernel. + + Besides configurations for dense persistent GEMM, there is an extra config specific to grouped GEMM: + + Tensormap Update Mode: + - tensormap_update_mode: Specifies whether the tensormap is + updated in global memory(GMEM) or shared memory(SMEM). + The 2 modes are functionally equivalent and the difference are: + - We buffer 3 tensormaps in SMEM for A, B, and C tensors (each TMA descriptor takes 128B) when TMA updates performed on SMEM. + - Performance varies between modes depending on problem size; optimal choice differs across workloads. + + :param acc_dtype: Data type of the accumulator. + :type acc_dtype: type[cutlass.Numeric] + :param use_2cta_instrs: Boolean, True to use cta_group=2 MMA variant. + :type use_2cta_instrs: bool + :param mma_tiler_mn: tuple (M, N) shape of the MMA instruction. + :type mma_tiler_mn: tuple[int, int] + :param cluster_shape_mn: tuple (ClusterM, ClusterN) shape of the cluster. + :type cluster_shape_mn: tuple[int, int] + :param tensormap_update_mode: Mode for updating the tensormap (GMEM or SMEM), defaults to SMEM. 
+ :type tensormap_update_mode: utils.TensorMapUpdateMode, optional + """ + self.acc_dtype: Type[cutlass.Numeric] = acc_dtype + self.use_2cta_instrs = use_2cta_instrs + self.cluster_shape_mn = cluster_shape_mn + # K dimension is deferred in _setup_attributes + self.mma_tiler = (*mma_tiler_mn, 1) + self.cta_group = ( + tcgen05.CtaGroup.TWO if use_2cta_instrs else tcgen05.CtaGroup.ONE + ) + + self.tensormap_update_mode = tensormap_update_mode + # Delegate tensormap ab initialization to MMA warp when SMEM mode is used for better latency hiding + self.delegate_tensormap_ab_init = ( + tensormap_update_mode == utils.TensorMapUpdateMode.SMEM + ) + + self.num_mcast_ctas_a = 1 + self.num_mcast_ctas_b = 1 + self.is_a_mcast = False + self.is_b_mcast = False + + self.occupancy = 1 + # Set specialized warp ids + self.epilog_warp_id = ( + 0, + 1, + 2, + 3, + ) + self.mma_warp_id = 4 + self.tma_warp_id = 5 + self.threads_per_cta = 32 * len( + (self.mma_warp_id, self.tma_warp_id, *self.epilog_warp_id) + ) + # Set barrier id for cta sync, epilog sync, tmem ptr sync and tensormap update sync + self.cta_sync_bar_id = 0 + self.epilog_sync_bar_id = 1 + self.tmem_ptr_sync_bar_id = 2 + # Barrier ID used by MMA/TMA warps to signal A/B tensormap initialization completion + self.tensormap_ab_init_bar_id = 4 + self.num_smem_capacity = sm100_utils.SMEM_CAPACITY["sm100"] + self.num_tma_load_bytes = 0 + + def _setup_attributes(self): + """Set up configurations that are dependent on GEMM inputs + + Most of the implementation follows standard dense GEMM patterns, + with the key difference being additional consideration for SMEM + buffer needed for tensormap updates. 
+ """ + # Configure tiled mma + tiled_mma = sm100_utils.make_trivial_tiled_mma( + self.a_dtype, + self.a_major_mode, + self.b_major_mode, + self.acc_dtype, + self.cta_group, + self.mma_tiler[:2], + ) + + # Compute mma/cluster/tile shapes + mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2]) + mma_inst_tile_k = 4 + self.mma_tiler = ( + self.mma_tiler[0], + self.mma_tiler[1], + mma_inst_shape_k * mma_inst_tile_k, + ) + self.cta_tile_shape_mnk = ( + self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape), + self.mma_tiler[1], + self.mma_tiler[2], + ) + self.cluster_tile_shape_mnk = tuple( + x * y for x, y in zip(self.cta_tile_shape_mnk, (*self.cluster_shape_mn, 1)) + ) + + # Compute cluster layout + self.cluster_layout_vmnk = cute.tiled_divide( + cute.make_layout((*self.cluster_shape_mn, 1)), + (tiled_mma.thr_id.shape,), + ) + + # Compute number of multicast CTAs for A/B + self.num_mcast_ctas_a = cute.size(self.cluster_layout_vmnk.shape[2]) + self.num_mcast_ctas_b = cute.size(self.cluster_layout_vmnk.shape[1]) + self.is_a_mcast = self.num_mcast_ctas_a > 1 + self.is_b_mcast = self.num_mcast_ctas_b > 1 + + # Compute epilogue subtile + self.epi_tile = utils.compute_epilogue_tile_shape( + self.cta_tile_shape_mnk, + self.use_2cta_instrs, + self.c_layout, + self.c_dtype, + ) + + # Setup A/B/C stage count in shared memory and ACC stage count in tensor memory + self.num_acc_stage, self.num_ab_stage, self.num_epi_stage = ( + self._compute_stages( + tiled_mma, + self.mma_tiler, + self.a_dtype, + self.b_dtype, + self.epi_tile, + self.c_dtype, + self.c_layout, + self.num_smem_capacity, + self.occupancy, + ) + ) + + self.a_smem_layout_staged = sm100_utils.make_smem_layout_a( + tiled_mma, + self.mma_tiler, + self.a_dtype, + self.num_ab_stage, + ) + self.b_smem_layout_staged = sm100_utils.make_smem_layout_b( + tiled_mma, + self.mma_tiler, + self.b_dtype, + self.num_ab_stage, + ) + self.epi_smem_layout_staged = sm100_utils.make_smem_layout_epi( + self.c_dtype, + self.c_layout, 
+ self.epi_tile, + self.num_epi_stage, + ) + + tensor_smem_bytes = self._get_tensor_smem_bytes( + self.a_smem_layout_staged, + self.a_dtype, + self.b_smem_layout_staged, + self.b_dtype, + self.epi_smem_layout_staged, + self.c_dtype, + ) + mbar_smem_bytes = self._get_mbar_smem_bytes( + num_acc_stage=self.num_acc_stage, + num_ab_stage=self.num_ab_stage, + num_epi_stage=self.num_epi_stage, + ) + tensormap_smem_bytes = self._get_tensormap_smem_bytes( + self.tensormap_update_mode + ) + if ( + mbar_smem_bytes + + tensormap_smem_bytes + + GroupedGemmKernel.tensor_memory_management_bytes + > self.reserved_smem_bytes + ): + raise ValueError( + f"smem consumption for mbar and tensormap {mbar_smem_bytes + tensormap_smem_bytes} exceeds the " + f"reserved smem bytes {self.reserved_smem_bytes}" + ) + + # Compute the number of tensor memory allocation columns + self.num_tmem_alloc_cols = self._compute_num_tmem_alloc_cols( + tiled_mma, self.mma_tiler, self.num_acc_stage + ) + + @cute.jit + def __call__( + self, + initial_a: cute.Tensor, + initial_b: cute.Tensor, + initial_c: cute.Tensor, + group_count: cutlass.Constexpr[int], + problem_shape_mnkl: cute.Tensor, + strides_abc: cute.Tensor, + tensor_address_abc: cute.Tensor, + total_num_clusters: cutlass.Constexpr[int], + tensormap_cute_tensor: cute.Tensor, + max_active_clusters: cutlass.Constexpr[int], + stream: cuda.CUstream, + ): + """Execute the GEMM operation in steps: + - Setup static attributes before smem/grid/tma computation + - Setup TMA load/store atoms and tensors + - Compute grid size with regard to hardware constraints + - Define shared storage for kernel + - Launch the kernel synchronously + + For grouped GEMM, tensor shapes, tensor strides, and tensor address are all provided + by different tensors in global memory. The "initial" tensors only carry data type and + majorness information. + + :param initial_a: Initial tensor A, used for data type and majorness information. 
+ :type initial_a: cute.Tensor + :param initial_b: Initial tensor B, used for data type and majorness information. + :type initial_b: cute.Tensor + :param initial_c: Initial tensor C, used for data type and majorness information. + :type initial_c: cute.Tensor + :param group_count: The number of GEMM groups. + :type group_count: cutlass.Constexpr[int] + :param problem_shape_mnkl: Tensor containing the (M, N, K, L) shape for each group. + :type problem_shape_mnkl: cute.Tensor + :param strides_abc: Tensor containing the strides for A, B, and C for each group. + :type strides_abc: cute.Tensor + :param tensor_address_abc: Tensor containing the base addresses for A, B, and C for each group. + :type tensor_address_abc: cute.Tensor + :param total_num_clusters: Total number of clusters needed for all groups. + :type total_num_clusters: cutlass.Constexpr[int] + :param tensormap_cute_tensor: Tensor for storing tensormaps. + :type tensormap_cute_tensor: cute.Tensor + :param max_active_clusters: Maximum number of active clusters. + :type max_active_clusters: cutlass.Constexpr[int] + :param stream: CUDA stream for asynchronous execution. + :type stream: cuda.CUstream + :raises TypeError: If A and B data types do not match. 
+ """ + self.a_dtype = initial_a.element_type + self.b_dtype = initial_b.element_type + self.c_dtype = initial_c.element_type + self.a_major_mode = utils.LayoutEnum.from_tensor(initial_a).mma_major_mode() + self.b_major_mode = utils.LayoutEnum.from_tensor(initial_b).mma_major_mode() + self.c_layout = utils.LayoutEnum.from_tensor(initial_c) + if cutlass.const_expr(self.a_dtype != self.b_dtype): + raise TypeError(f"Type mismatch: {self.a_dtype} != {self.b_dtype}") + + # Setup attributes that dependent on gemm inputs + self._setup_attributes() + + tiled_mma = sm100_utils.make_trivial_tiled_mma( + self.a_dtype, + self.a_major_mode, + self.b_major_mode, + self.acc_dtype, + self.cta_group, + self.mma_tiler[:2], + ) + atom_thr_size = cute.size(tiled_mma.thr_id.shape) + + # Setup TMA load for A + a_op = self._get_tma_atom_kind(atom_thr_size, self.is_a_mcast) + a_smem_layout = cute.slice_(self.a_smem_layout_staged, (None, None, None, 0)) + tma_atom_a, tma_tensor_a = cute.nvgpu.make_tma_tile_atom_A( + a_op, + initial_a, + a_smem_layout, + self.mma_tiler, + tiled_mma, + self.cluster_layout_vmnk.shape, + ) + + # Setup TMA load for B + b_op = self._get_tma_atom_kind(atom_thr_size, self.is_b_mcast) + b_smem_layout = cute.slice_(self.b_smem_layout_staged, (None, None, None, 0)) + tma_atom_b, tma_tensor_b = cute.nvgpu.make_tma_tile_atom_B( + b_op, + initial_b, + b_smem_layout, + self.mma_tiler, + tiled_mma, + self.cluster_layout_vmnk.shape, + ) + + a_copy_size = cute.size_in_bytes(self.a_dtype, a_smem_layout) + b_copy_size = cute.size_in_bytes(self.b_dtype, b_smem_layout) + self.num_tma_load_bytes = (a_copy_size + b_copy_size) * atom_thr_size + + # Setup TMA store for C + tma_atom_c = None + tma_tensor_c = None + c_cta_v_layout = cute.composition( + cute.make_identity_layout(initial_c.shape), self.epi_tile + ) + epi_smem_layout = cute.slice_(self.epi_smem_layout_staged, (None, None, 0)) + tma_atom_c, tma_tensor_c = cpasync.make_tma_tile_atom( + cpasync.CopyBulkTensorTileS2GOp(), + 
initial_c, + epi_smem_layout, + c_cta_v_layout, + ) + + self.tile_sched_params, grid = self._compute_grid( + total_num_clusters, self.cluster_shape_mn, max_active_clusters + ) + + self.buffer_align_bytes = 1024 + self.size_tensormap_in_i64 = ( + 0 + if cutlass.const_expr( + self.tensormap_update_mode == utils.TensorMapUpdateMode.GMEM + ) + else GroupedGemmKernel.num_tensormaps + * GroupedGemmKernel.bytes_per_tensormap + // 8 + ) + + # Define shared storage for kernel + @cute.struct + class SharedStorage: + tensormap_buffer: cute.struct.MemRange[ + cutlass.Int64, self.size_tensormap_in_i64 + ] + ab_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage] + ab_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage] + acc_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage] + acc_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage] + tmem_dealloc_mbar_ptr: cutlass.Int64 + tmem_holding_buf: cutlass.Int32 + # (EPI_TILE_M, EPI_TILE_N, STAGE) + sC: cute.struct.Align[ + cute.struct.MemRange[ + self.c_dtype, + cute.cosize(self.epi_smem_layout_staged.outer), + ], + self.buffer_align_bytes, + ] + # (MMA, MMA_M, MMA_K, STAGE) + sA: cute.struct.Align[ + cute.struct.MemRange[ + self.a_dtype, cute.cosize(self.a_smem_layout_staged.outer) + ], + self.buffer_align_bytes, + ] + # (MMA, MMA_N, MMA_K, STAGE) + sB: cute.struct.Align[ + cute.struct.MemRange[ + self.b_dtype, cute.cosize(self.b_smem_layout_staged.outer) + ], + self.buffer_align_bytes, + ] + + self.shared_storage = SharedStorage + + # Launch the kernel synchronously + self.kernel( + tiled_mma, + tma_atom_a, + tma_tensor_a, + tma_atom_b, + tma_tensor_b, + tma_atom_c, + tma_tensor_c, + self.cluster_layout_vmnk, + self.a_smem_layout_staged, + self.b_smem_layout_staged, + self.epi_smem_layout_staged, + self.epi_tile, + self.tile_sched_params, + group_count, + problem_shape_mnkl, + strides_abc, + tensor_address_abc, + tensormap_cute_tensor, + ).launch( + grid=grid, 
+ block=[self.threads_per_cta, 1, 1], + cluster=(*self.cluster_shape_mn, 1), + smem=self.shared_storage.size_in_bytes(), + stream=stream, + ) + return + + # GPU device kernel + @cute.kernel + def kernel( + self, + tiled_mma: cute.TiledMma, + tma_atom_a: cute.CopyAtom, + mA_mkl: cute.Tensor, + tma_atom_b: cute.CopyAtom, + mB_nkl: cute.Tensor, + tma_atom_c: cute.CopyAtom, + mC_mnl: cute.Tensor, + cluster_layout_vmnk: cute.Layout, + a_smem_layout_staged: cute.ComposedLayout, + b_smem_layout_staged: cute.ComposedLayout, + epi_smem_layout_staged: Union[cute.Layout, cute.ComposedLayout], + epi_tile: cute.Tile, + tile_sched_params: utils.PersistentTileSchedulerParams, + group_count: cutlass.Constexpr[int], + problem_sizes_mnkl: cute.Tensor, + strides_abc: cute.Tensor, + ptrs_abc: cute.Tensor, + tensormaps: cute.Tensor, + ): + """ + GPU device kernel performing the grouped GEMM computation. + """ + warp_idx = cute.arch.warp_idx() + warp_idx = cute.arch.make_warp_uniform(warp_idx) + + # + # Prefetch tma desc + # + if warp_idx == self.tma_warp_id: + cpasync.prefetch_descriptor(tma_atom_a) + cpasync.prefetch_descriptor(tma_atom_b) + cpasync.prefetch_descriptor(tma_atom_c) + + use_2cta_instrs = cute.size(tiled_mma.thr_id.shape) == 2 + + # + # Setup cta/thread coordinates + # + # Coord inside cluster + bid = cute.arch.block_idx() + mma_tile_coord_v = bid[0] % cute.size(tiled_mma.thr_id.shape) + is_leader_cta = mma_tile_coord_v == 0 + cta_rank_in_cluster = cute.arch.make_warp_uniform( + cute.arch.block_idx_in_cluster() + ) + block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord( + cta_rank_in_cluster + ) + # Coord inside cta + tidx, _, _ = cute.arch.thread_idx() + + # + # Alloc and init: tensormap buffer, a+b full/empty, accumulator full/empty, tensor memory dealloc barrier + # + smem = utils.SmemAllocator() + storage = smem.allocate(self.shared_storage) + + tensormap_a_smem_ptr = None + tensormap_b_smem_ptr = None + tensormap_c_smem_ptr = None + if 
cutlass.const_expr( + self.tensormap_update_mode == utils.TensorMapUpdateMode.SMEM + ): + tensormap_smem_ptr = storage.tensormap_buffer.data_ptr() + tensormap_a_smem_ptr = tensormap_smem_ptr + tensormap_b_smem_ptr = ( + tensormap_a_smem_ptr + GroupedGemmKernel.bytes_per_tensormap // 8 + ) + tensormap_c_smem_ptr = ( + tensormap_b_smem_ptr + GroupedGemmKernel.bytes_per_tensormap // 8 + ) + ab_full_mbar_ptr = storage.ab_full_mbar_ptr.data_ptr() + ab_empty_mbar_ptr = storage.ab_empty_mbar_ptr.data_ptr() + acc_full_mbar_ptr = storage.acc_full_mbar_ptr.data_ptr() + acc_empty_mbar_ptr = storage.acc_empty_mbar_ptr.data_ptr() + tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr + tmem_holding_buf = storage.tmem_holding_buf + + # init barrier for loading A, B with TMA + if warp_idx == self.epilog_warp_id[0]: + for k_stage in range(self.num_ab_stage): + num_tma_producer = self.num_mcast_ctas_a + self.num_mcast_ctas_b - 1 + with cute.arch.elect_one(): + cute.arch.mbarrier_init_arrive_cnt(ab_full_mbar_ptr + k_stage, 1) + cute.arch.mbarrier_init_arrive_cnt( + ab_empty_mbar_ptr + k_stage, num_tma_producer + ) + # Accumulator barrier init + if warp_idx == self.mma_warp_id: + for acc_stage in range(self.num_acc_stage): + with cute.arch.elect_one(): + cute.arch.mbarrier_init_arrive_cnt(acc_full_mbar_ptr + acc_stage, 1) + cute.arch.mbarrier_init_arrive_cnt( + acc_empty_mbar_ptr + acc_stage, 8 if use_2cta_instrs else 4 + ) + # Tensor memory dealloc barrier init + if use_2cta_instrs: + if warp_idx == self.tma_warp_id: + num_tmem_dealloc_threads = 32 + with cute.arch.elect_one(): + cute.arch.mbarrier_init_arrive_cnt( + tmem_dealloc_mbar_ptr, num_tmem_dealloc_threads + ) + cute.arch.mbarrier_init_fence() + + # Cluster arrive after barrier init + if cute.size(self.cluster_shape_mn) > 1: + cute.arch.cluster_arrive_relaxed() + + # + # Setup smem tensor A/B/C + # + # (EPI_TILE_M, EPI_TILE_N, STAGE) + sC = storage.sC.get_tensor( + epi_smem_layout_staged.outer, 
swizzle=epi_smem_layout_staged.inner + ) + # (MMA, MMA_M, MMA_K, STAGE) + sA = storage.sA.get_tensor( + a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner + ) + # (MMA, MMA_N, MMA_K, STAGE) + sB = storage.sB.get_tensor( + b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner + ) + + # + # Compute multicast mask for A/B buffer full and empty + # + a_full_mcast_mask = None + b_full_mcast_mask = None + ab_empty_mcast_mask = None + if self.is_a_mcast or self.is_b_mcast or use_2cta_instrs: + a_full_mcast_mask = cpasync.create_tma_multicast_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk, mcast_mode=2 + ) + b_full_mcast_mask = cpasync.create_tma_multicast_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk, mcast_mode=1 + ) + ab_empty_mcast_mask = a_full_mcast_mask | b_full_mcast_mask + acc_full_mcast_mask = None + if use_2cta_instrs: + acc_full_mcast_mask = cute.make_layout_image_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk, mode=0 + ) + block_in_cluster_coord_vmnk_peer = ( + block_in_cluster_coord_vmnk[0] ^ 1, + *block_in_cluster_coord_vmnk[1:], + ) + a_full_mcast_mask_peer = cpasync.create_tma_multicast_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk_peer, mcast_mode=2 + ) + b_full_mcast_mask_peer = cpasync.create_tma_multicast_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk_peer, mcast_mode=1 + ) + ab_empty_mcast_mask = ( + a_full_mcast_mask_peer + | b_full_mcast_mask_peer + | cutlass.Int16( + 0 if ab_empty_mcast_mask is None else ab_empty_mcast_mask + ) + ) + + # + # Local_tile partition global tensors + # + # (bM, bK, loopM, loopK, loopL) + gA_mkl = cute.local_tile( + mA_mkl, cute.slice_(self.mma_tiler, (None, 0, None)), (None, None, None) + ) + # (bN, bK, loopN, loopK, loopL) + gB_nkl = cute.local_tile( + mB_nkl, cute.slice_(self.mma_tiler, (0, None, None)), (None, None, None) + ) + # (bM, bN, loopM, loopN, loopL) + gC_mnl = cute.local_tile( + mC_mnl, cute.slice_(self.mma_tiler, (None, None, 0)), 
(None, None, None) + ) + + # + # Partition global tensor for TiledMMA_A/B/C + # + thr_mma = tiled_mma.get_slice(mma_tile_coord_v) + # (MMA, MMA_M, MMA_K, loopM, loopK, loopL) + tCgA = thr_mma.partition_A(gA_mkl) + # (MMA, MMA_N, MMA_K, loopN, loopK, loopL) + tCgB = thr_mma.partition_B(gB_nkl) + # (MMA, MMA_M, MMA_N, loopM, loopN, loopL) + tCgC = thr_mma.partition_C(gC_mnl) + + # + # Partition global/shared tensor for load A, B with TMA + # + a_cta_layout = cute.make_layout( + cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape + ) + # ((atom_v, rest_v), STAGE) + # ((atom_v, rest_v), loopM, loopK, loopL) + tAsA, tAgA = cpasync.tma_partition( + tma_atom_a, + block_in_cluster_coord_vmnk[2], + a_cta_layout, + cute.group_modes(sA, 0, 3), + cute.group_modes(tCgA, 0, 3), + ) + # TMA load B partition_S/D + b_cta_layout = cute.make_layout( + cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape + ) + # ((atom_v, rest_v), STAGE) + # ((atom_v, rest_v), loopM, loopK, loopL) + tBsB, tBgB = cpasync.tma_partition( + tma_atom_b, + block_in_cluster_coord_vmnk[1], + b_cta_layout, + cute.group_modes(sB, 0, 3), + cute.group_modes(tCgB, 0, 3), + ) + + # + # Partition shared/tensor memory tensor for TiledMMA_A/B/C + # + # (MMA, MMA_M, MMA_K, STAGE) + tCrA = tiled_mma.make_fragment_A(sA) + # (MMA, MMA_N, MMA_K, STAGE) + tCrB = tiled_mma.make_fragment_B(sB) + # (MMA, MMA_M, MMA_N) + acc_shape = tiled_mma.partition_shape_C(self.mma_tiler[:2]) + # (MMA, MMA_M, MMA_N, STAGE) + tCtAcc_fake = tiled_mma.make_fragment_C( + cute.append(acc_shape, self.num_acc_stage) + ) + + # + # Cluster wait before tensor memory alloc + # + if cute.size(self.cluster_shape_mn) > 1: + cute.arch.cluster_wait() + else: + cute.arch.barrier( + barrier_id=self.cta_sync_bar_id, number_of_threads=self.threads_per_cta + ) + + # + # Get tensormap buffer address + # + grid_dim = cute.arch.grid_dim() + tensormap_workspace_idx = ( + bid[2] * grid_dim[1] * grid_dim[0] + bid[1] * grid_dim[0] + bid[0] + ) + + 
tensormap_manager = utils.TensorMapManager( + self.tensormap_update_mode, GroupedGemmKernel.bytes_per_tensormap + ) + tensormap_a_ptr = tensormap_manager.get_tensormap_ptr( + tensormaps[(tensormap_workspace_idx, 0, None)].iterator + ) + tensormap_b_ptr = tensormap_manager.get_tensormap_ptr( + tensormaps[(tensormap_workspace_idx, 1, None)].iterator + ) + tensormap_c_ptr = tensormap_manager.get_tensormap_ptr( + tensormaps[(tensormap_workspace_idx, 2, None)].iterator + ) + # Setup tensormap initialization pointer based on the mode + if cutlass.const_expr( + self.tensormap_update_mode == utils.TensorMapUpdateMode.SMEM + ): + tensormap_a_init_ptr = tensormap_a_smem_ptr + tensormap_b_init_ptr = tensormap_b_smem_ptr + tensormap_c_init_ptr = tensormap_c_smem_ptr + else: + tensormap_a_init_ptr = tensormap_a_ptr + tensormap_b_init_ptr = tensormap_b_ptr + tensormap_c_init_ptr = tensormap_c_ptr + + # + # Specialized TMA load warp + # + if warp_idx == self.tma_warp_id: + # Initialize tensormaps for A, B + if cutlass.const_expr(self.delegate_tensormap_ab_init == False): + tensormap_manager.init_tensormap_from_atom( + tma_atom_a, tensormap_a_init_ptr, self.tma_warp_id + ) + tensormap_manager.init_tensormap_from_atom( + tma_atom_b, tensormap_b_init_ptr, self.tma_warp_id + ) + # + # Persistent tile scheduling loop + # + tile_sched = utils.StaticPersistentTileScheduler.create( + tile_sched_params, bid, grid_dim + ) + # grouped gemm tile scheduler helper will compute the group index for the tile we're working on + group_gemm_ts_helper = utils.GroupedGemmTileSchedulerHelper( + group_count, + tile_sched_params, + self.cluster_tile_shape_mnk, + utils.create_initial_search_state(), + ) + tensormap_init_done = cutlass.Boolean(False) + # tile count we have searched + total_k_block_cnt = cutlass.Int32(0) + # group index of last tile + last_group_idx = cutlass.Int32(-1) + work_tile = tile_sched.initial_work_tile_info() + while work_tile.is_valid_tile: + cur_tile_coord = work_tile.tile_idx + 
grouped_gemm_cta_tile_info = group_gemm_ts_helper.delinearize_z( + cur_tile_coord, + problem_sizes_mnkl, + ) + cur_k_block_cnt = grouped_gemm_cta_tile_info.cta_tile_count_k + cur_group_idx = grouped_gemm_cta_tile_info.group_idx + is_group_changed = cur_group_idx != last_group_idx + # skip tensormap update if we're working on the same group + if is_group_changed: + real_tensor_a = self.make_tensor_for_tensormap_update( + cur_group_idx, + self.a_dtype, + ( + grouped_gemm_cta_tile_info.problem_shape_m, + grouped_gemm_cta_tile_info.problem_shape_n, + grouped_gemm_cta_tile_info.problem_shape_k, + ), + strides_abc, + ptrs_abc, + 0, # 0 for tensor A + ) + real_tensor_b = self.make_tensor_for_tensormap_update( + cur_group_idx, + self.b_dtype, + ( + grouped_gemm_cta_tile_info.problem_shape_m, + grouped_gemm_cta_tile_info.problem_shape_n, + grouped_gemm_cta_tile_info.problem_shape_k, + ), + strides_abc, + ptrs_abc, + 1, # 1 for tensor B + ) + # wait tensormap initialization complete before update + if tensormap_init_done == False: + if cutlass.const_expr(self.delegate_tensormap_ab_init): + cute.arch.barrier( + barrier_id=self.tensormap_ab_init_bar_id, + number_of_threads=64, + ) + tensormap_manager.fence_tensormap_initialization() + tensormap_init_done = True + + tensormap_manager.update_tensormap( + (real_tensor_a, real_tensor_b), + (tma_atom_a, tma_atom_b), + (tensormap_a_ptr, tensormap_b_ptr), + self.tma_warp_id, + (tensormap_a_smem_ptr, tensormap_b_smem_ptr), + ) + + mma_tile_coord_mnl = ( + grouped_gemm_cta_tile_info.cta_tile_idx_m + // cute.size(tiled_mma.thr_id.shape), + grouped_gemm_cta_tile_info.cta_tile_idx_n, + 0, + ) + + # + # Slice to per mma tile index + # + # ((atom_v, rest_v), loopK) + tAgA_slice = tAgA[ + (None, mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2]) + ] + # ((atom_v, rest_v), loopK) + tBgB_slice = tBgB[ + (None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2]) + ] + + num_prev_k_blk = total_k_block_cnt + total_k_block_cnt += cur_k_block_cnt 
+ + # Peek (try_wait) AB buffer empty for k_block = prefetch_k_block_cnt + tma_wr_k_block = cutlass.Int32(0) + smem_wr_buffer = (num_prev_k_blk + tma_wr_k_block) % self.num_ab_stage + tma_wr_ab_empty_phase = ( + num_prev_k_blk + tma_wr_k_block + ) // self.num_ab_stage % 2 ^ 1 + peek_ab_empty_status = cute.arch.conditional_mbarrier_try_wait( + tma_wr_k_block < cur_k_block_cnt, + ab_empty_mbar_ptr + smem_wr_buffer, + tma_wr_ab_empty_phase, + ) + # ensure the update to tensormap has completed before using it + if is_group_changed: + tensormap_manager.fence_tensormap_update(tensormap_a_ptr) + tensormap_manager.fence_tensormap_update(tensormap_b_ptr) + # + # Tma load loop + # + for k_block in cutlass.range_dynamic(0, cur_k_block_cnt, 1, unroll=1): + tma_wr_k_block_next = tma_wr_k_block + 1 + smem_wr_buffer_next = ( + num_prev_k_blk + tma_wr_k_block_next + ) % self.num_ab_stage + tma_wr_ab_empty_phase_next = ( + tma_wr_ab_empty_phase ^ 1 + if smem_wr_buffer_next == 0 + else tma_wr_ab_empty_phase + ) + + smem_full_mbar_ptr = ab_full_mbar_ptr + smem_wr_buffer + + # Wait for AB buffer empty + if peek_ab_empty_status == 0: + cute.arch.mbarrier_wait( + ab_empty_mbar_ptr + smem_wr_buffer, tma_wr_ab_empty_phase + ) + + # Init AB buffer full transaction byte + if is_leader_cta: + with cute.arch.elect_one(): + cute.arch.mbarrier_init_tx_bytes( + smem_full_mbar_ptr, self.num_tma_load_bytes + ) + + # Load A/B with TMA + cute.copy( + tma_atom_a, + tAgA_slice[(None, tma_wr_k_block)], + tAsA[(None, smem_wr_buffer)], + tma_bar_ptr=smem_full_mbar_ptr, + mcast_mask=a_full_mcast_mask, + tma_desc_ptr=tensormap_manager.get_tensormap_ptr( + tensormap_a_ptr, + cute.AddressSpace.generic, + ), + ) + cute.copy( + tma_atom_b, + tBgB_slice[(None, tma_wr_k_block)], + tBsB[(None, smem_wr_buffer)], + tma_bar_ptr=smem_full_mbar_ptr, + mcast_mask=b_full_mcast_mask, + tma_desc_ptr=tensormap_manager.get_tensormap_ptr( + tensormap_b_ptr, + cute.AddressSpace.generic, + ), + ) + + # Peek (try_wait) AB 
buffer empty for k_block = prefetch_k_block_cnt + k_block + 1 + peek_ab_empty_status = cute.arch.conditional_mbarrier_try_wait( + tma_wr_k_block_next < cur_k_block_cnt, + ab_empty_mbar_ptr + smem_wr_buffer_next, + tma_wr_ab_empty_phase_next, + ) + + tma_wr_k_block = tma_wr_k_block_next + smem_wr_buffer = smem_wr_buffer_next + tma_wr_ab_empty_phase = tma_wr_ab_empty_phase_next + + # Advance to next tile + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + last_group_idx = cur_group_idx + + # + # Specialized MMA warp + # + if warp_idx == self.mma_warp_id: + # initilize tensormap A, B for TMA warp + if cutlass.const_expr(self.delegate_tensormap_ab_init): + tensormap_manager.init_tensormap_from_atom( + tma_atom_a, tensormap_a_init_ptr, self.mma_warp_id + ) + tensormap_manager.init_tensormap_from_atom( + tma_atom_b, tensormap_b_init_ptr, self.mma_warp_id + ) + # signal tensormap initialization has finished + cute.arch.barrier( + barrier_id=self.tensormap_ab_init_bar_id, number_of_threads=64 + ) + # Bar sync for retrieve tmem ptr from shared mem + tmem_ptr_read_threads = 32 * len((self.mma_warp_id, *self.epilog_warp_id)) + cute.arch.barrier( + barrier_id=self.tmem_ptr_sync_bar_id, + number_of_threads=tmem_ptr_read_threads, + ) + + # + # Retrieving tensor memory ptr and make accumulator tensor + # + tmem_ptr = cute.arch.retrieve_tmem_ptr( + self.acc_dtype, + alignment=16, + ptr_to_buffer_holding_addr=tmem_holding_buf, + ) + # (MMA, MMA_M, MMA_N, STAGE) + tCtAcc_base = cute.make_tensor(tmem_ptr, tCtAcc_fake.layout) + + # + # Persistent tile scheduling loop + # + tile_sched = utils.StaticPersistentTileScheduler.create( + tile_sched_params, bid, grid_dim + ) + # grouped gemm tile scheduler helper will compute the group index for the tile we're working on + group_gemm_ts_helper = utils.GroupedGemmTileSchedulerHelper( + group_count, + tile_sched_params, + self.cluster_tile_shape_mnk, + utils.create_initial_search_state(), + ) + + work_tile = 
tile_sched.initial_work_tile_info() + # tile count we have searched + total_k_block_cnt = cutlass.Int32(0) + while work_tile.is_valid_tile: + cur_tile_coord = work_tile.tile_idx + # MMA warp is only interested in number of tiles along K dimension + cur_k_block_cnt, cur_group_idx = ( + group_gemm_ts_helper.search_cluster_tile_count_k( + cur_tile_coord, + problem_sizes_mnkl, + ) + ) + # Set tensor memory buffer for current tile + acc_buf_idx = tile_sched.num_tiles_executed % self.num_acc_stage + # (MMA, MMA_M, MMA_N) + tCtAcc = tCtAcc_base[(None, None, None, acc_buf_idx)] + + num_prev_k_blk = total_k_block_cnt + total_k_block_cnt += cur_k_block_cnt + + # Peek (try_wait) AB buffer full for k_block = 0 + mma_rd_k_block = cutlass.Int32(0) + smem_rd_buffer = (num_prev_k_blk + mma_rd_k_block) % self.num_ab_stage + need_check_rd_buffer_full = ( + mma_rd_k_block < cur_k_block_cnt and is_leader_cta + ) + mma_rd_ab_full_phase = ( + (num_prev_k_blk + mma_rd_k_block) // self.num_ab_stage % 2 + ) + peek_ab_full_status = cute.arch.conditional_mbarrier_try_wait( + need_check_rd_buffer_full, + ab_full_mbar_ptr + smem_rd_buffer, + mma_rd_ab_full_phase, + ) + + # + # Wait for accumulator buffer empty + # + if is_leader_cta: + acc_empty_phase = ( + tile_sched.num_tiles_executed // self.num_acc_stage % 2 ^ 1 + ) + cute.arch.mbarrier_wait( + acc_empty_mbar_ptr + acc_buf_idx, acc_empty_phase + ) + + # + # Reset the ACCUMULATE field for each tile + # + tiled_mma.set(tcgen05.Field.ACCUMULATE, False) + + # + # Mma mainloop + # + for k_block in cutlass.range_dynamic(0, cur_k_block_cnt, 1, unroll=1): + mma_rd_k_block_next = cutlass.Int32(k_block + 1) + smem_rd_buffer_next = ( + num_prev_k_blk + mma_rd_k_block_next + ) % self.num_ab_stage + mma_rd_ab_full_phase_next = ( + mma_rd_ab_full_phase ^ 1 + if smem_rd_buffer_next == 0 + else mma_rd_ab_full_phase + ) + if is_leader_cta: + # Wait for AB buffer full + if peek_ab_full_status == 0: + cute.arch.mbarrier_wait( + ab_full_mbar_ptr + 
smem_rd_buffer, mma_rd_ab_full_phase + ) + + # tCtAcc += tCrA * tCrB + num_kphases = cute.size(tCrA, mode=[2]) + for kphase_idx in range(num_kphases): + kphase_coord = (None, None, kphase_idx, smem_rd_buffer) + + cute.gemm( + tiled_mma, + tCtAcc, + tCrA[kphase_coord], + tCrB[kphase_coord], + tCtAcc, + ) + # Enable accumulate on tCtAcc after first kphase + tiled_mma.set(tcgen05.Field.ACCUMULATE, True) + + # Async arrive AB buffer empty + with cute.arch.elect_one(): + tcgen05.commit( + ab_empty_mbar_ptr + smem_rd_buffer, + ab_empty_mcast_mask, + self.cta_group, + ) + + # Peek (try_wait) AB buffer full for k_block = k_block + 1 + need_check_rd_buffer_full = ( + mma_rd_k_block_next < cur_k_block_cnt and is_leader_cta + ) + + peek_ab_full_status = cute.arch.conditional_mbarrier_try_wait( + need_check_rd_buffer_full, + ab_full_mbar_ptr + smem_rd_buffer_next, + mma_rd_ab_full_phase_next, + ) + + mma_rd_k_block = mma_rd_k_block_next + smem_rd_buffer = smem_rd_buffer_next + mma_rd_ab_full_phase = mma_rd_ab_full_phase_next + + # + # Async arrive accumulator buffer full + # + if is_leader_cta: + with cute.arch.elect_one(): + tcgen05.commit( + acc_full_mbar_ptr + acc_buf_idx, + acc_full_mcast_mask, + self.cta_group, + ) + + # + # Advance to next tile + # + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + + # + # Specialized epilogue warps + # + if warp_idx < self.mma_warp_id: + # initialize tensorap for C + tensormap_manager.init_tensormap_from_atom( + tma_atom_c, + tensormap_c_init_ptr, + self.epilog_warp_id[0], + ) + # Alloc tensor memory buffer + if warp_idx == self.epilog_warp_id[0]: + cute.arch.alloc_tmem( + self.num_tmem_alloc_cols, + tmem_holding_buf, + is_two_cta=use_2cta_instrs, + ) + + # + # Bar sync for retrieve tensor memory ptr from shared memory + # + tmem_ptr_read_threads = 32 * len((self.mma_warp_id, *self.epilog_warp_id)) + cute.arch.barrier( + barrier_id=self.tmem_ptr_sync_bar_id, + number_of_threads=tmem_ptr_read_threads, + ) + 
+ # + # Retrieving tensor memory ptr and make accumulator tensor + # + tmem_ptr = cute.arch.retrieve_tmem_ptr( + self.acc_dtype, + alignment=16, + ptr_to_buffer_holding_addr=tmem_holding_buf, + ) + # (MMA, MMA_M, MMA_N, STAGE) + tCtAcc_base = cute.make_tensor(tmem_ptr, tCtAcc_fake.layout) + + epi_tidx = tidx + # + # Partition for epilogue + # + tiled_copy_t2r, tTR_tAcc_base, tTR_rAcc = ( + self.epilog_tmem_copy_and_partition( + epi_tidx, tCtAcc_base, tCgC, epi_tile, use_2cta_instrs + ) + ) + + tTR_rC = cute.make_fragment(tTR_rAcc.shape, self.c_dtype) + tiled_copy_r2s, tRS_rC, tRS_sC = self.epilog_smem_copy_and_partition( + tiled_copy_t2r, tTR_rC, epi_tidx, sC + ) + tma_atom_c, bSG_sC, bSG_gC_partitioned = ( + self.epilog_gmem_copy_and_partition(tma_atom_c, tCgC, epi_tile, sC) + ) + + # + # Persistent tile scheduling loop + # + tile_sched = utils.StaticPersistentTileScheduler.create( + tile_sched_params, bid, grid_dim + ) + # grouped gemm tile scheduler helper will compute the group index for the tile we're working on + group_gemm_ts_helper = utils.GroupedGemmTileSchedulerHelper( + group_count, + tile_sched_params, + self.cluster_tile_shape_mnk, + utils.create_initial_search_state(), + ) + + work_tile = tile_sched.initial_work_tile_info() + # wait tensormap initialization complete before update + tensormap_manager.fence_tensormap_initialization() + # tile count we have searched + total_k_block_cnt = cutlass.Int32(0) + # group index of last tile + last_group_idx = cutlass.Int32(-1) + while work_tile.is_valid_tile: + cur_tile_coord = work_tile.tile_idx + grouped_gemm_cta_tile_info = group_gemm_ts_helper.delinearize_z( + cur_tile_coord, + problem_sizes_mnkl, + ) + cur_group_idx = grouped_gemm_cta_tile_info.group_idx + is_group_changed = cur_group_idx != last_group_idx + if is_group_changed: + # construct tensor C based on real address, shape and stride information + real_tensor_c = self.make_tensor_for_tensormap_update( + cur_group_idx, + self.c_dtype, + ( + 
grouped_gemm_cta_tile_info.problem_shape_m, + grouped_gemm_cta_tile_info.problem_shape_n, + grouped_gemm_cta_tile_info.problem_shape_k, + ), + strides_abc, + ptrs_abc, + 2, # 2 for tensor C + ) + tensormap_manager.update_tensormap( + ((real_tensor_c),), + ((tma_atom_c),), + ((tensormap_c_ptr),), + self.epilog_warp_id[0], + (tensormap_c_smem_ptr,), + ) + + mma_tile_coord_mnl = ( + grouped_gemm_cta_tile_info.cta_tile_idx_m + // cute.size(tiled_mma.thr_id.shape), + grouped_gemm_cta_tile_info.cta_tile_idx_n, + 0, + ) + cur_k_block_cnt = grouped_gemm_cta_tile_info.cta_tile_count_k + total_k_block_cnt += cur_k_block_cnt + + # + # Slice to per mma tile index + # + # ((ATOM_V, REST_V), EPI_M, EPI_N) + bSG_gC = bSG_gC_partitioned[ + ( + None, + None, + None, + *mma_tile_coord_mnl, + ) + ] + + # Set tensor memory buffer for current tile + acc_buf_idx = tile_sched.num_tiles_executed % self.num_acc_stage + # (T2R, T2R_M, T2R_N, EPI_M, EPI_M) + tTR_tAcc = tTR_tAcc_base[(None, None, None, None, None, acc_buf_idx)] + + # + # Wait for accumulator buffer full + # + acc_full_phase = tile_sched.num_tiles_executed // self.num_acc_stage % 2 + cute.arch.mbarrier_wait(acc_full_mbar_ptr + acc_buf_idx, acc_full_phase) + + tTR_tAcc = cute.group_modes(tTR_tAcc, 3, cute.rank(tTR_tAcc)) + bSG_gC = cute.group_modes(bSG_gC, 1, cute.rank(bSG_gC)) + # ensure the update to tensormap has completed before using it + if is_group_changed: + if warp_idx == self.epilog_warp_id[0]: + tensormap_manager.fence_tensormap_update(tensormap_c_ptr) + # + # Store accumulator to global memory in subtiles + # + subtile_cnt = cute.size(tTR_tAcc.shape, mode=[3]) + num_prev_subtiles = tile_sched.num_tiles_executed * subtile_cnt + for subtile_idx in cutlass.range_dynamic(subtile_cnt): + # + # Load accumulator from tensor memory buffer to register + # + tTR_tAcc_mn = tTR_tAcc[(None, None, None, subtile_idx)] + cute.copy(tiled_copy_t2r, tTR_tAcc_mn, tTR_rAcc) + + # + # Convert to output type + # + acc_vec = 
tiled_copy_r2s.retile(tTR_rAcc).load() + tRS_rC.store(acc_vec.to(self.c_dtype)) + # + # Store C to shared memory + # + epi_buffer = (num_prev_subtiles + subtile_idx) % self.num_epi_stage + cute.copy( + tiled_copy_r2s, + tRS_rC, + tRS_sC[(None, None, None, epi_buffer)], + ) + # Fence and barrier to make sure shared memory store is visible to TMA store + cute.arch.fence_proxy( + cute.arch.ProxyKind.async_shared, + space=cute.arch.SharedSpace.shared_cta, + ) + epilog_threads = 32 * len(self.epilog_warp_id) + cute.arch.barrier( + barrier_id=self.epilog_sync_bar_id, + number_of_threads=epilog_threads, + ) + # + # store C to global memory with TMA + # + if warp_idx == self.epilog_warp_id[0]: + cute.copy( + tma_atom_c, + bSG_sC[(None, epi_buffer)], + bSG_gC[(None, subtile_idx)], + tma_desc_ptr=tensormap_manager.get_tensormap_ptr( + tensormap_c_ptr, + cute.AddressSpace.generic, + ), + ) + cute.arch.cp_async_bulk_commit_group() + cute.arch.cp_async_bulk_wait_group( + self.num_epi_stage - 1, read=True + ) + cute.arch.barrier( + barrier_id=self.epilog_sync_bar_id, + number_of_threads=epilog_threads, + ) + # + # Async arrive accumulator buffer empty + # + with cute.arch.elect_one(): + cute.arch.mbarrier_arrive( + acc_empty_mbar_ptr + acc_buf_idx, + cta_rank_in_cluster // 2 * 2 if use_2cta_instrs else None, + ) + + # + # Advance to next tile + # + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + last_group_idx = cur_group_idx + + # + # Dealloc the tensor memory buffer + # + if warp_idx == self.epilog_warp_id[0]: + cute.arch.relinquish_tmem_alloc_permit(is_two_cta=use_2cta_instrs) + epilog_threads = 32 * len(self.epilog_warp_id) + cute.arch.barrier( + barrier_id=self.epilog_sync_bar_id, number_of_threads=epilog_threads + ) + if warp_idx == self.epilog_warp_id[0]: + if use_2cta_instrs: + cute.arch.mbarrier_arrive( + tmem_dealloc_mbar_ptr, cta_rank_in_cluster ^ 1 + ) + cute.arch.mbarrier_wait(tmem_dealloc_mbar_ptr, 0) + cute.arch.dealloc_tmem( + 
tmem_ptr, self.num_tmem_alloc_cols, is_two_cta=use_2cta_instrs + ) + + # + # Wait a/b buffer empty + # + if warp_idx == self.epilog_warp_id[0]: + cute.arch.mbarrier_wait( + (ab_empty_mbar_ptr + ((total_k_block_cnt - 1) % self.num_ab_stage)), + (((total_k_block_cnt - 1) // self.num_ab_stage) % 2), + ) + + @cute.jit + def make_tensor_for_tensormap_update( + self, + group_idx: cutlass.Int32, + dtype: Type[cutlass.Numeric], + problem_shape_mnk: tuple[cutlass.Int32, cutlass.Int32, cutlass.Int32], + strides_abc: cute.Tensor, + tensor_address_abc: cute.Tensor, + tensor_index: int, + ): + """Extract stride and tensor address for a given group and construct a global tensor. + + This function is used within the kernel to dynamically create a CUTE tensor + representing A, B, or C for the current group being processed, using the + group-specific address, shape, and stride information. + + :param group_idx: The index of the current group within the grouped GEMM. + :type group_idx: cutlass.Int32 + :param dtype: The data type of the tensor elements (e.g., cutlass.Float16). + :type dtype: Type[cutlass.Numeric] + :param problem_shape_mnk: The (M, N, K) problem shape for the current group. + :type problem_shape_mnk: tuple[cutlass.Int32, cutlass.Int32, cutlass.Int32] + :param strides_abc: Tensor containing strides for A, B, C for all groups. Layout: (group_count, 3, 2). + :type strides_abc: cute.Tensor + :param tensor_address_abc: Tensor containing global memory addresses for A, B, C for all groups. Layout: (group_count, 3). + :type tensor_address_abc: cute.Tensor + :param tensor_index: Specifies which tensor to create: 0 for A, 1 for B, 2 for C. + :type tensor_index: int + :return: A CUTE tensor representing the requested global memory tensor (A, B, or C) for the specified group. + :rtype: cute.Tensor + :raises TypeError: If the provided dtype is not a subclass of cutlass.Numeric. 
+ """ + ptr_i64 = tensor_address_abc[(group_idx, tensor_index)] + if cutlass.const_expr( + not isclass(dtype) or not issubclass(dtype, cutlass.Numeric) + ): + raise TypeError( + f"dtype must be a type of cutlass.Numeric, got {type(dtype)}" + ) + tensor_gmem_ptr = cute.make_ptr( + dtype, ptr_i64, cute.AddressSpace.gmem, assumed_align=16 + ) + + strides_tensor_gmem = strides_abc[(group_idx, tensor_index, None)] + strides_tensor_reg = cute.make_fragment( + cute.make_layout(2), + strides_abc.element_type, + ) + cute.autovec_copy(strides_tensor_gmem, strides_tensor_reg) + stride_mn = strides_tensor_reg[0] + stride_k = strides_tensor_reg[1] + c1 = cutlass.Int32(1) + c0 = cutlass.Int32(0) + + if cutlass.const_expr(tensor_index == 0): # tensor A + m = problem_shape_mnk[0] + k = problem_shape_mnk[2] + return cute.make_tensor( + tensor_gmem_ptr, + cute.make_layout((m, k, c1), stride=(stride_mn, stride_k, c0)), + ) + elif cutlass.const_expr(tensor_index == 1): # tensor B + n = problem_shape_mnk[1] + k = problem_shape_mnk[2] + return cute.make_tensor( + tensor_gmem_ptr, + cute.make_layout((n, k, c1), stride=(stride_mn, stride_k, c0)), + ) + else: # tensor C + m = problem_shape_mnk[0] + n = problem_shape_mnk[1] + return cute.make_tensor( + tensor_gmem_ptr, + cute.make_layout((m, n, c1), stride=(stride_mn, stride_k, c0)), + ) + + def epilog_tmem_copy_and_partition( + self, + tidx: cutlass.Int32, + tAcc: cute.Tensor, + gC_mnl: cute.Tensor, + epi_tile: cute.Tile, + use_2cta_instrs: Union[cutlass.Boolean, bool], + ) -> tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]: + """ + Make tiledCopy for tensor memory load, then use it to partition tensor memory (source) and register array (destination). 
+ + :param tidx: The thread index in epilogue warp groups + :type tidx: cutlass.Int32 + :param tAcc: The accumulator tensor to be copied and partitioned + :type tAcc: cute.Tensor + :param gC_mnl: The global tensor C + :type gC_mnl: cute.Tensor + :param epi_tile: The epilogue tiler + :type epi_tile: cute.Tile + :param use_2cta_instrs: Whether use_2cta_instrs is enabled + :type use_2cta_instrs: bool + + :return: A tuple containing (tiled_copy_t2r, tTR_tAcc, tTR_rAcc) where: + - tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r) + - tTR_tAcc: The partitioned accumulator tensor + - tTR_rAcc: The accumulated tensor in register used to hold t2r results + :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor] + """ + # Make tiledCopy for tensor memory load(t2r) + copy_atom_t2r = sm100_utils.get_tmem_load_op( + self.cta_tile_shape_mnk, + self.c_layout, + self.c_dtype, + self.acc_dtype, + epi_tile, + use_2cta_instrs, + ) + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, STAGE) + tAcc_epi = cute.flat_divide( + tAcc[((None, None), 0, 0, None)], + epi_tile, + ) + # (EPI_TILE_M, EPI_TILE_N) + tiled_copy_t2r = tcgen05.make_tmem_copy( + copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)] + ) + + thr_copy_t2r = tiled_copy_t2r.get_slice(tidx) + # (T2R, T2R_M, T2R_N, EPI_M, EPI_M, STAGE) + tTR_tAcc = thr_copy_t2r.partition_S(tAcc_epi) + + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, loopM, loopN, loopL) + gC_mnl_epi = cute.flat_divide( + gC_mnl[((None, None), 0, 0, None, None, None)], epi_tile + ) + # (T2R, T2R_M, T2R_N, EPI_M, EPI_N, loopM, loopN, loopL) + tTR_gC = thr_copy_t2r.partition_D(gC_mnl_epi) + # (T2R, T2R_M, T2R_N) + tTR_rAcc = cute.make_fragment( + tTR_gC[(None, None, None, 0, 0, 0, 0, 0)].shape, self.acc_dtype + ) + return tiled_copy_t2r, tTR_tAcc, tTR_rAcc + + def epilog_smem_copy_and_partition( + self, + tiled_copy_t2r: cute.TiledCopy, + tTR_rC: cute.Tensor, + tidx: cutlass.Int32, + sC: cute.Tensor, + ) -> tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]: + """ + 
Make tiledCopy for shared memory store, then use it to partition register array (source) and shared memory (destination). + + :param tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r) + :type tiled_copy_t2r: cute.TiledCopy + :param tTR_rC: The partitioned accumulator tensor + :type tTR_rC: cute.Tensor + :param tidx: The thread index in epilogue warp groups + :type tidx: cutlass.Int32 + :param sC: The shared memory tensor to be copied and partitioned + :type sC: cute.Tensor + + :return: A tuple containing (tiled_copy_r2s, tRS_rC, tRS_sC) where: + - tiled_copy_r2s: The tiled copy operation for register to smem copy(r2s) + - tRS_rC: The partitioned tensor C (register source) + - tRS_sC: The partitioned tensor C (smem destination) + :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor] + """ + copy_atom_r2s = sm100_utils.get_smem_store_op( + self.c_layout, self.c_dtype, self.acc_dtype, tiled_copy_t2r + ) + tiled_copy_r2s = cute.make_tiled_copy( + copy_atom_r2s, + layout_tv=tiled_copy_t2r.layout_dst_tv_tiled, + tiler_mn=tiled_copy_t2r.tiler_mn, + ) + # (R2S, R2S_M, R2S_N, PIPE_D) + thr_copy_r2s = tiled_copy_r2s.get_slice(tidx) + tRS_sC = thr_copy_r2s.partition_D(sC) + # (R2S, R2S_M, R2S_N) + tRS_rC = tiled_copy_r2s.retile(tTR_rC) + return tiled_copy_r2s, tRS_rC, tRS_sC + + def epilog_gmem_copy_and_partition( + self, + tma_atom_c: cute.CopyAtom, + gC_mnl: cute.Tensor, + epi_tile: cute.Tile, + sC: cute.Tensor, + ) -> tuple[cute.CopyAtom, cute.Tensor, cute.Tensor]: + """Make tiledCopy for global memory store, then use it to partition + shared memory (source) and global memory (destination) for TMA store version. + + :param tma_atom_c: The TMA copy atom configured for storing tensor C. + :type tma_atom_c: cute.CopyAtom + :param gC_mnl: The global memory tensor C. + :type gC_mnl: cute.Tensor + :param epi_tile: The epilogue tiler defining the granularity of the operation. + :type epi_tile: cute.Tile + :param sC: The shared memory epilogue buffer tensor. 
+ :type sC: cute.Tensor + :return: A tuple containing: + - tma_atom_c: The input TMA copy atom (passed through). + - bSG_sC: The source shared memory tensor partitioned for the TMA operation. + - tCgC: The destination global memory tensor partitioned for the TMA operation. + :rtype: tuple[cute.CopyAtom, cute.Tensor, cute.Tensor] + """ + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, loopM, loopN, loopL) + gC_epi = cute.flat_divide( + gC_mnl[((None, None), 0, 0, None, None, None)], epi_tile + ) + sC_for_tma_partition = cute.group_modes(sC, 0, 2) + gC_for_tma_partition = cute.group_modes(gC_epi, 0, 2) + # ((ATOM_V, REST_V), EPI_M, EPI_N) + # ((ATOM_V, REST_V), EPI_M, EPI_N, loopM, loopN, loopL) + bSG_sC, bSG_gC = cpasync.tma_partition( + tma_atom_c, + 0, + cute.make_layout(1), + sC_for_tma_partition, + gC_for_tma_partition, + ) + return tma_atom_c, bSG_sC, bSG_gC + + @staticmethod + def _compute_stages( + tiled_mma: cute.TiledMma, + mma_tiler_mnk: tuple[int, int, int], + a_dtype: type[cutlass.Numeric], + b_dtype: type[cutlass.Numeric], + epi_tile: cute.Tile, + c_dtype: type[cutlass.Numeric], + c_layout: utils.LayoutEnum, + num_smem_capacity: int, + occupancy: int, + ) -> tuple[int, int, int]: + """Computes the number of stages for accumulator, A/B operands, and epilogue based on heuristics. + + :param tiled_mma: The tiled MMA object defining the core computation. + :type tiled_mma: cute.TiledMma + :param mma_tiler_mnk: The shape (M, N, K) of the MMA tiler. + :type mma_tiler_mnk: tuple[int, int, int] + :param a_dtype: Data type of operand A. + :type a_dtype: type[cutlass.Numeric] + :param b_dtype: Data type of operand B. + :type b_dtype: type[cutlass.Numeric] + :param epi_tile: The epilogue tile shape. + :type epi_tile: cute.Tile + :param c_dtype: Data type of operand C (output). + :type c_dtype: type[cutlass.Numeric] + :param c_layout: Layout enum of operand C in global memory. 
+ :type c_layout: utils.LayoutEnum + :param num_smem_capacity: Total available shared memory capacity in bytes. + :type num_smem_capacity: int + :param occupancy: Target number of CTAs per SM (occupancy). + :type occupancy: int + + :return: A tuple containing the computed number of stages for: + (accumulator stages, A/B operand stages, epilogue stages) + :rtype: tuple[int, int, int] + """ + # Default accumulator and epilogue stages + num_acc_stage = 2 + num_epi_stage = 2 + + # Calculate smem layout and size for one stage of A, B, and Epilogue + a_smem_layout_stage_one = sm100_utils.make_smem_layout_a( + tiled_mma, + mma_tiler_mnk, + a_dtype, + 1, # stage=1 + ) + b_smem_layout_staged_one = sm100_utils.make_smem_layout_b( + tiled_mma, + mma_tiler_mnk, + b_dtype, + 1, # stage=1 + ) + epi_smem_layout_staged_one = sm100_utils.make_smem_layout_epi( + c_dtype, + c_layout, + epi_tile, + 1, # stage=1 + ) + ab_bytes_per_stage = cute.size_in_bytes( + a_dtype, a_smem_layout_stage_one + ) + cute.size_in_bytes(b_dtype, b_smem_layout_staged_one) + + epi_bytes_per_stage = cute.size_in_bytes(c_dtype, epi_smem_layout_staged_one) + epi_bytes = epi_bytes_per_stage * num_epi_stage + + # Calculate A/B stages: + # Start with total smem per CTA (capacity / occupancy) + # Subtract reserved bytes and initial epilogue bytes + # Divide remaining by bytes needed per A/B stage + num_ab_stage = ( + num_smem_capacity // occupancy + - GroupedGemmKernel.reserved_smem_bytes + - epi_bytes + ) // ab_bytes_per_stage + + # Refine epilogue stages: + # Calculate remaining smem after allocating for A/B stages and reserved bytes + # Add remaining unused smem to epilogue + remaining_smem = ( + num_smem_capacity + - occupancy * ab_bytes_per_stage * num_ab_stage + - occupancy * (GroupedGemmKernel.reserved_smem_bytes + epi_bytes) + ) + num_epi_stage += remaining_smem // (occupancy * epi_bytes_per_stage) + return num_acc_stage, num_ab_stage, num_epi_stage + + @staticmethod + def _compute_grid( + 
total_num_clusters: int, + cluster_shape_mn: tuple[int, int], + max_active_clusters: cutlass.Constexpr[int], + ) -> tuple[utils.PersistentTileSchedulerParams, tuple[int, int, int]]: + """Compute tile scheduler parameters and grid shape for grouped GEMM operations. + + :param total_num_clusters: Total number of clusters to process across all groups. + :type total_num_clusters: int + :param cluster_shape_mn: Shape of each cluster in M, N dimensions. + :type cluster_shape_mn: tuple[int, int] + :param max_active_clusters: Maximum number of active clusters. + :type max_active_clusters: cutlass.Constexpr[int] + + :return: A tuple containing: + - tile_sched_params: Parameters for the persistent tile scheduler. + - grid: Grid shape for kernel launch. + :rtype: tuple[utils.PersistentTileSchedulerParams, tuple[int, ...]] + """ + # Create problem shape with M, N dimensions from cluster shape + # and L dimension representing the total number of clusters. + problem_shape_ntile_mnl = ( + cluster_shape_mn[0], + cluster_shape_mn[1], + cutlass.Int32(total_num_clusters), + ) + + tile_sched_params = utils.PersistentTileSchedulerParams( + problem_shape_ntile_mnl, (*cluster_shape_mn, 1) + ) + + grid = utils.StaticPersistentTileScheduler.get_grid_shape( + tile_sched_params, max_active_clusters + ) + + return tile_sched_params, grid + + @staticmethod + def _get_mbar_smem_bytes(**kwargs_stages: int) -> int: + """Calculate shared memory consumption for memory barriers based on provided stages. + + Each stage requires 2 barriers, and each barrier consumes 8 bytes of shared memory. + The total consumption is the sum across all provided stages. This function calculates the total + shared memory needed for these barriers. + + :param kwargs_stages: Variable keyword arguments where each key is a stage name + (e.g., num_acc_stage, num_ab_stage) and each value is the + number of stages of that type. + :type kwargs_stages: int + :return: Total shared memory bytes required for all memory barriers. 
+ :rtype: int + """ + num_barriers_per_stage = 2 + num_bytes_per_barrier = 8 + mbar_smem_consumption = sum( + [ + num_barriers_per_stage * num_bytes_per_barrier * stage + for stage in kwargs_stages.values() + ] + ) + return mbar_smem_consumption + + @staticmethod + def _get_tensormap_smem_bytes( + tensormap_update_mode: utils.TensorMapUpdateMode, + ) -> int: + """Get the SMEM consumption for the tensormap buffer based on the update mode. + + :param tensormap_update_mode: Specifies whether tensormaps are updated in GMEM or SMEM. + :type tensormap_update_mode: utils.TensorMapUpdateMode + :return: The shared memory bytes required for the tensormap buffer. Returns 0 if mode is GMEM. + :rtype: int + :raises ValueError: If an invalid tensormap update mode is provided. + """ + if tensormap_update_mode == utils.TensorMapUpdateMode.GMEM: + return 0 + elif tensormap_update_mode == utils.TensorMapUpdateMode.SMEM: + return ( + GroupedGemmKernel.bytes_per_tensormap * GroupedGemmKernel.num_tensormaps + ) + else: + raise ValueError(f"Invalid tensormap update mode: {tensormap_update_mode}") + + @staticmethod + def _get_tensor_smem_bytes( + a_smem_layout_staged: cute.Layout, + a_dtype: Type[cutlass.Numeric], + b_smem_layout_staged: cute.Layout, + b_dtype: Type[cutlass.Numeric], + epi_smem_layout_staged: cute.Layout, + c_dtype: Type[cutlass.Numeric], + ) -> int: + """Compute the total SMEM consumption for tensor A, B and C.""" + ab_bytes = cute.size_in_bytes( + a_dtype, a_smem_layout_staged + ) + cute.size_in_bytes(b_dtype, b_smem_layout_staged) + + epi_bytes = cute.size_in_bytes(c_dtype, epi_smem_layout_staged) + return ab_bytes + epi_bytes + + @staticmethod + def _get_tma_atom_kind(atom_sm_cnt: int, mcast: bool): + """Select the appropriate TMA copy atom based on the number of SMs and the multicast flag.""" + if atom_sm_cnt == 2 and mcast: + return cpasync.CopyBulkTensorTileG2SMulticastOp(tcgen05.CtaGroup.TWO) + elif atom_sm_cnt == 2 and not mcast: + return 
cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.TWO) + elif atom_sm_cnt == 1 and mcast: + return cpasync.CopyBulkTensorTileG2SMulticastOp(tcgen05.CtaGroup.ONE) + elif atom_sm_cnt == 1 and not mcast: + return cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE) + + raise ValueError(f"Invalid atom_sm_cnt: {atom_sm_cnt} and {mcast}") + + @staticmethod + def _compute_num_tmem_alloc_cols( + tiled_mma: cute.TiledMma, + mma_tiler: tuple[int, int, int], + num_acc_stage: int, + ) -> int: + """ + Compute the number of tensor memory allocation columns. + + :param tiled_mma: The tiled MMA object defining the core computation. + :type tiled_mma: cute.TiledMma + :param mma_tiler: The shape (M, N, K) of the MMA tile. + :type mma_tiler: tuple[int, int, int] + :param acc_stage: The stage of the accumulator tensor. + :type acc_stage: int + + :return: The number of tensor memory allocation columns. + :rtype: int + """ + acc_shape = tiled_mma.partition_shape_C(mma_tiler[:2]) + tCtAcc_fake = tiled_mma.make_fragment_C(cute.append(acc_shape, num_acc_stage)) + num_tmem_alloc_cols = utils.get_num_tmem_alloc_cols(tCtAcc_fake) + + return num_tmem_alloc_cols + + # Size of smem we reserved for mbarrier, tensor memory management and tensormap update + reserved_smem_bytes = 1024 + bytes_per_tensormap = 128 + num_tensormaps = 3 + # size of smem used for tensor memory management + tensor_memory_management_bytes = 12 + + +def run_grouped_gemm( + num_groups: int, + problem_sizes_mnkl: tuple[int, int, int, int], + ab_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + acc_dtype: Type[cutlass.Numeric], + a_major: str, + b_major: str, + c_major: str, + mma_tiler_mn: tuple[int, int], + cluster_shape_mn: tuple[int, int], + use_2cta_instrs: bool, + tensormap_update_mode: utils.TensorMapUpdateMode, + tolerance: float, + warmup_iterations: int, + iterations: int, + skip_ref_check: bool, +): + """Run grouped GEMM example with specified configurations.""" + print(f"Running Blackwell Grouped 
GEMM test with:") + print(f"{num_groups} groups") + for i, (m, n, k, l) in enumerate(problem_sizes_mnkl): + print(f"Group {i}: {m}x{n}x{k}x{l}") + print(f"AB dtype: {ab_dtype}, C dtype: {c_dtype}, Acc dtype: {acc_dtype}") + print(f"Matrix majors - A: {a_major}, B: {b_major}, C: {c_major}") + print(f"Mma Tiler (M, N): {mma_tiler_mn}, Cluster Shape (M, N): {cluster_shape_mn}") + print(f"2CTA MMA instructions: {'True' if use_2cta_instrs else 'False'}") + print(f"Tensor map update mode: {tensormap_update_mode}") + print(f"Tolerance: {tolerance}") + print(f"Warmup iterations: {warmup_iterations}") + print(f"Iterations: {iterations}") + print(f"Skip reference checking: {skip_ref_check}") + + # Skip unsupported types + if ab_dtype not in { + cutlass.Float16, + cutlass.BFloat16, + }: + raise ValueError(f"Skip unsupported ab_dtype {ab_dtype}") + if c_dtype not in {cutlass.Float16, cutlass.BFloat16, cutlass.Float32}: + raise ValueError(f"Skip unsupported c_dtype {c_dtype}") + # Skip unsupported acc dtype + if acc_dtype not in {cutlass.Float32, cutlass.Float16}: + raise ValueError(f"Skip unsupported acc_dtype {acc_dtype}") + # Skip invalid ab_dtype and acc_dtype combination + if ab_dtype == cutlass.BFloat16 and acc_dtype == cutlass.Float16: + raise ValueError("Skip invalid ab_dtype and acc_dtype combination") + # Skip invalid mma tile shape + if not ( + (not use_2cta_instrs and mma_tiler_mn[0] in [64, 128]) + or (use_2cta_instrs and mma_tiler_mn[0] in [128, 256]) + ): + raise ValueError(f"Skip invalid mma tiler M {mma_tiler_mn[0]}") + if mma_tiler_mn[1] not in range(32, 257, 32): + raise ValueError(f"Skip invalid mma tiler N {mma_tiler_mn[1]}") + # Skip illegal cluster shape + if cluster_shape_mn[0] % (2 if use_2cta_instrs else 1) != 0: + raise ValueError( + f"cluster_shape_m need align with use_2cta_instrs config {cluster_shape_mn}" + ) + # Skip invalid cluster shape + is_power_of_2 = lambda x: x > 0 and (x & (x - 1)) == 0 + if ( + cluster_shape_mn[0] * cluster_shape_mn[1] > 
16 + or cluster_shape_mn[0] <= 0 + or cluster_shape_mn[1] <= 0 + or not is_power_of_2(cluster_shape_mn[0]) + or not is_power_of_2(cluster_shape_mn[1]) + ): + raise ValueError(f"Skip invalid cluster shape {cluster_shape_mn}") + + # Skip illegal problem shape for load/store alignment + def check_contigous_16B_alignment(dtype, is_mode0_major, tensor_shape): + major_mode_idx = 0 if is_mode0_major else 1 + num_major_elements = tensor_shape[major_mode_idx] + num_contiguous_elements = 16 * 8 // dtype.width + return num_major_elements % num_contiguous_elements == 0 + + if ( + not check_contigous_16B_alignment(ab_dtype, a_major == "m", (m, k, l)) + or not check_contigous_16B_alignment(ab_dtype, b_major == "n", (n, k, l)) + or not check_contigous_16B_alignment(c_dtype, c_major == "m", (m, n, l)) + ): + raise ValueError("Skip invalid problem alignment") + if not torch.cuda.is_available(): + raise RuntimeError("GPU is required to run this example!") + + torch.manual_seed(2025) + + # Create tensor and return the pointer, tensor, and stride + def create_tensor_and_stride( + l: int, + mode0: int, + mode1: int, + is_mode0_major: bool, + dtype: type[cutlass.Numeric], + is_dynamic_layout: bool = True, + ) -> tuple[int, torch.Tensor, cute.Tensor, torch.Tensor, tuple[int, int]]: + # is_mode0_major: (l, mode1, mode0) -> (mode0, mode1, l) + # else: (l, mode0, mode1) -> (mode0, mode1, l) + shape = (l, mode1, mode0) if is_mode0_major else (l, mode0, mode1) + permute_order = (2, 1, 0) if is_mode0_major else (1, 2, 0) + # omit stride for L mode as it is always 1 for grouped GEMM + strides = (1, mode0) if is_mode0_major else (mode1, 1) + assert dtype in {cutlass.Float16, cutlass.BFloat16, cutlass.Float32} + is_unsigned = False + + torch_dtype = cutlass_torch.dtype(dtype) + torch_tensor_cpu = cutlass_torch.create_and_permute_torch_tensor( + shape, + torch_dtype, + permute_order=permute_order, + init_type=cutlass_torch.TensorInitType.RANDOM, + init_config=cutlass_torch.RandomInitConfig( + 
min_val=0 if is_unsigned else -2, max_val=4 if is_unsigned else 2 + ), + ) + torch_tensor = torch_tensor_cpu.cuda() + f32_torch_tensor = torch_tensor_cpu.to(dtype=torch.float32) + + cute_tensor = from_dlpack(torch_tensor, assumed_align=16) + if is_dynamic_layout: + cute_tensor = cute_tensor.mark_layout_dynamic( + leading_dim=(0 if is_mode0_major else 1) + ) + cute_tensor = cutlass_torch.convert_cute_tensor( + f32_torch_tensor, + cute_tensor, + dtype, + is_dynamic_layout=is_dynamic_layout, + ) + # Get pointer of the tensor + ptr = torch_tensor.data_ptr() + return ptr, torch_tensor, cute_tensor, f32_torch_tensor, strides + + # iterate all groups and create tensors for each group + torch_fp32_tensors_abc = [] + torch_tensors_abc = [] + cute_tensors_abc = [] + strides_abc = [] + ptrs_abc = [] + for _, (m, n, k, l) in enumerate(problem_sizes_mnkl): + ptr_a, torch_tensor_a, cute_tensor_a, tensor_fp32_a, stride_mk_a = ( + create_tensor_and_stride(l, m, k, a_major == "m", ab_dtype) + ) + ptr_b, torch_tensor_b, cute_tensor_b, tensor_fp32_b, stride_nk_b = ( + create_tensor_and_stride(l, n, k, b_major == "n", ab_dtype) + ) + ptr_c, torch_tensor_c, cute_tensor_c, tensor_fp32_c, stride_mn_c = ( + create_tensor_and_stride(l, m, n, c_major == "m", c_dtype) + ) + ptrs_abc.append([ptr_a, ptr_b, ptr_c]) + torch_tensors_abc.append([torch_tensor_a, torch_tensor_b, torch_tensor_c]) + torch_fp32_tensors_abc.append([tensor_fp32_a, tensor_fp32_b, tensor_fp32_c]) + strides_abc.append([stride_mk_a, stride_nk_b, stride_mn_c]) + cute_tensors_abc.append( + ( + cute_tensor_a, + cute_tensor_b, + cute_tensor_c, + ) + ) + # Choose A, B, C with the smallest size to create initial tensormaps + key_size_a = lambda item: item[1][0] * item[1][2] + key_size_b = lambda item: item[1][1] * item[1][2] + key_size_c = lambda item: item[1][0] * item[1][1] + # Find the indices of the groups with the smallest tensor sizes + min_a_idx, _ = min(enumerate(problem_sizes_mnkl), key=key_size_a) + min_b_idx, _ = 
min(enumerate(problem_sizes_mnkl), key=key_size_b) + min_c_idx, _ = min(enumerate(problem_sizes_mnkl), key=key_size_c) + initial_cute_tensors_abc = [ + cute_tensors_abc[min_a_idx][0], # A with smallest (m, k) + cute_tensors_abc[min_b_idx][1], # B with smallest (n, k) + cute_tensors_abc[min_c_idx][2], # C with smallest (m, n) + ] + + hardware_info = utils.HardwareInfo() + sm_count = hardware_info.get_max_active_clusters(1) + max_active_clusters = hardware_info.get_max_active_clusters( + cluster_shape_mn[0] * cluster_shape_mn[1] + ) + # Prepare tensormap buffer for each SM + num_tensormap_buffers = sm_count + tensormap_pytorch_tensor = ( + torch.empty( + ( + num_tensormap_buffers, + GroupedGemmKernel.num_tensormaps, + GroupedGemmKernel.bytes_per_tensormap // 8, + ), + dtype=torch.int64, + ) + .fill_(0) + .cuda() + ) + tensormap_cute_tensor = from_dlpack(tensormap_pytorch_tensor, assumed_align=16) + + grouped_gemm = GroupedGemmKernel( + acc_dtype, + use_2cta_instrs, + mma_tiler_mn, + cluster_shape_mn, + tensormap_update_mode, + ) + + # Convert integer list to torch tensor and cute tensor + def convert_list_to_tensor(l, dtype) -> tuple[torch.Tensor, cute.Tensor]: + torch_tensor = torch.tensor(l, dtype=dtype).cuda() + cute_tensor = from_dlpack(torch_tensor, assumed_align=16) + return torch_tensor, cute_tensor + + # layout (num_groups, 4):(4, 1) + problem_sizes_mnkl_torch_tensor, problem_sizes_mnkl_cute_tensor = ( + convert_list_to_tensor(problem_sizes_mnkl, torch.int32) + ) + # layout (num_groups, 3, 2):(6, 2, 1) + strides_abc_torch_tensor, strides_abc_cute_tensor = convert_list_to_tensor( + strides_abc, torch.int32 + ) + # layout (num_groups,3):(3, 1) + ptrs_abc_torch_tensor, ptrs_abc_cute_tensor = convert_list_to_tensor( + ptrs_abc, torch.int64 + ) + + # Compute total number of cluster tiles we need to compute for given grouped GEMM problem + def compute_total_num_clusters( + problem_sizes_mnkl: List[tuple[int, int, int, int]], + cluster_tile_shape_mn: tuple[int, 
int], + ) -> int: + total_num_clusters = 0 + for m, n, _, _ in problem_sizes_mnkl: + num_clusters_mn = tuple( + (x + y - 1) // y for x, y in zip((m, n), cluster_tile_shape_mn) + ) + total_num_clusters += functools.reduce(lambda x, y: x * y, num_clusters_mn) + return total_num_clusters + + # Compute cluster tile shape + def compute_cluster_tile_shape( + mma_tiler_mn: tuple[int, int], + cluster_shape_mn: tuple[int, int], + use_2cta_instrs: bool, + ) -> tuple[int, int]: + cta_tile_shape_mn = list(mma_tiler_mn) + if use_2cta_instrs: + cta_tile_shape_mn[0] = cta_tile_shape_mn[0] // 2 + return tuple(x * y for x, y in zip(cta_tile_shape_mn, cluster_shape_mn)) + + cluster_tile_shape_mn = compute_cluster_tile_shape( + mma_tiler_mn, cluster_shape_mn, use_2cta_instrs + ) + total_num_clusters = compute_total_num_clusters( + problem_sizes_mnkl, cluster_tile_shape_mn + ) + + # Get current CUDA stream from PyTorch + torch_stream = torch.cuda.current_stream() + # Get the raw stream pointer as a CUstream + current_stream = cuda.CUstream(torch_stream.cuda_stream) + # Compile grouped GEMM kernel + compiled_grouped_gemm = cute.compile( + grouped_gemm, + initial_cute_tensors_abc[0], + initial_cute_tensors_abc[1], + initial_cute_tensors_abc[2], + num_groups, + problem_sizes_mnkl_cute_tensor, + strides_abc_cute_tensor, + ptrs_abc_cute_tensor, + total_num_clusters, + tensormap_cute_tensor, + max_active_clusters, + current_stream, + ) + + # Launch GPU kernel + # Warm up + for _ in range(warmup_iterations): + compiled_grouped_gemm( + initial_cute_tensors_abc[0], + initial_cute_tensors_abc[1], + initial_cute_tensors_abc[2], + problem_sizes_mnkl_cute_tensor, + strides_abc_cute_tensor, + ptrs_abc_cute_tensor, + tensormap_cute_tensor, + current_stream, + ) + # Execution + for i in range(iterations): + compiled_grouped_gemm( + initial_cute_tensors_abc[0], + initial_cute_tensors_abc[1], + initial_cute_tensors_abc[2], + problem_sizes_mnkl_cute_tensor, + strides_abc_cute_tensor, + 
ptrs_abc_cute_tensor, + tensormap_cute_tensor, + current_stream, + ) + + # Compute reference result + if not skip_ref_check: + refs = [] + for a, b, _ in torch_fp32_tensors_abc: + ref = (torch.einsum("mkl,nkl->mnl", a, b)).cpu() + refs.append(ref) + for i, ((_, _, c), ref) in enumerate(zip(torch_tensors_abc, refs)): + print(f"checking group {i}") + if c_dtype == cutlass.Float32: + ref_c = ref + else: + ref_c = ref.to(cutlass_torch.dtype(c_dtype)) + torch.testing.assert_close( + c.cpu(), + ref_c, + atol=tolerance, + rtol=1e-05, + ) + + +if __name__ == "__main__": + + def parse_comma_separated_ints(s: str) -> tuple[int, ...]: + try: + return tuple(int(x.strip()) for x in s.split(",")) + except ValueError: + raise argparse.ArgumentTypeError( + "Invalid format. Expected comma-separated integers." + ) + + def parse_comma_separated_tuples(s: str) -> List[tuple[int, ...]]: + if s.strip().startswith("("): + # Split on ),( to separate tuples + tuples = s.strip("()").split("),(") + result = [] + tuple_len = None + + for t in tuples: + # Parse individual tuple + nums = [int(x.strip()) for x in t.split(",")] + + # Validate tuple length consistency + if tuple_len is None: + tuple_len = len(nums) + elif len(nums) != tuple_len: + raise argparse.ArgumentTypeError( + "All tuples must have the same length" + ) + + result.append(tuple(nums)) + return result + + raise argparse.ArgumentTypeError( + "Invalid format. Expected comma-separated integers or list of tuples" + ) + + parser = argparse.ArgumentParser( + description="Example of Grouped GEMM on Blackwell." 
+ ) + parser.add_argument( + "--num_groups", + type=int, + default=2, + help="Number of groups", + ) + parser.add_argument( + "--problem_sizes_mnkl", + type=parse_comma_separated_tuples, + default=((128, 128, 128, 1), (128, 128, 128, 1)), + help="a tuple of problem sizes for each group (comma-separated tuples)", + ) + parser.add_argument( + "--mma_tiler_mn", + type=parse_comma_separated_ints, + default=(128, 128), + help="Mma tile shape (comma-separated)", + ) + parser.add_argument( + "--cluster_shape_mn", + type=parse_comma_separated_ints, + default=(1, 1), + help="Cluster shape (comma-separated)", + ) + parser.add_argument( + "--tensormap_update_mode", + type=str, + default="SMEM", + help="Tensor map update mode", + ) + parser.add_argument("--ab_dtype", type=cutlass.dtype, default=cutlass.Float16) + parser.add_argument("--c_dtype", type=cutlass.dtype, default=cutlass.Float16) + parser.add_argument("--acc_dtype", type=cutlass.dtype, default=cutlass.Float32) + parser.add_argument( + "--use_2cta_instrs", + action="store_true", + help="Enable 2CTA MMA instructions feature", + ) + parser.add_argument("--a_major", choices=["k", "m"], type=str, default="k") + parser.add_argument("--b_major", choices=["k", "n"], type=str, default="k") + parser.add_argument("--c_major", choices=["n", "m"], type=str, default="n") + parser.add_argument( + "--tolerance", type=float, default=1e-01, help="Tolerance for validation" + ) + parser.add_argument( + "--warmup_iterations", type=int, default=0, help="Warmup iterations" + ) + parser.add_argument( + "--iterations", + type=int, + default=1, + help="Number of iterations to run the kernel", + ) + parser.add_argument( + "--skip_ref_check", action="store_true", help="Skip reference checking" + ) + + args = parser.parse_args() + + if ( + len(args.problem_sizes_mnkl) != 0 + and len(args.problem_sizes_mnkl) != args.num_groups + ): + parser.error("--problem_sizes_mnkl must contain exactly num_groups tuples") + + # l mode must be 1 for all groups 
+ for _, _, _, l in args.problem_sizes_mnkl: + if l != 1: + parser.error("l must be 1 for all groups") + + if len(args.mma_tiler_mn) != 2: + parser.error("--mma_tiler_mn must contain exactly 2 values") + + if len(args.cluster_shape_mn) != 2: + parser.error("--cluster_shape_mn must contain exactly 2 values") + + if args.tensormap_update_mode not in ["GMEM", "SMEM"]: + parser.error("--tensormap_update_mode must be GMEM or SMEM") + + if args.tensormap_update_mode == "GMEM": + tensormap_update_mode = utils.TensorMapUpdateMode.GMEM + else: + tensormap_update_mode = utils.TensorMapUpdateMode.SMEM + + run_grouped_gemm( + args.num_groups, + args.problem_sizes_mnkl, + args.ab_dtype, + args.c_dtype, + args.acc_dtype, + args.a_major, + args.b_major, + args.c_major, + args.mma_tiler_mn, + args.cluster_shape_mn, + args.use_2cta_instrs, + tensormap_update_mode, + args.tolerance, + args.warmup_iterations, + args.iterations, + args.skip_ref_check, + ) + print("PASS") diff --git a/examples/python/CuTeDSL/notebooks/README.md b/examples/python/CuTeDSL/notebooks/README.md new file mode 100644 index 00000000..402c1cfc --- /dev/null +++ b/examples/python/CuTeDSL/notebooks/README.md @@ -0,0 +1,31 @@ +# Copyright + +Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +SPDX-License-Identifier: BSD-3-Clause + +``` + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + 3. 
Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +``` diff --git a/examples/python/CuTeDSL/notebooks/cuda_graphs.ipynb b/examples/python/CuTeDSL/notebooks/cuda_graphs.ipynb new file mode 100644 index 00000000..dc7c17cf --- /dev/null +++ b/examples/python/CuTeDSL/notebooks/cuda_graphs.ipynb @@ -0,0 +1,648 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0e95f0df-4d1a-4e2e-92ff-90539bb4c517", + "metadata": {}, + "source": [ + "# Example 06: CUDA Graphs\n", + "\n", + "In this example we demonstrate how to use CUDA graphs through PyTorch with CuTe DSL.\n", + "The process of interacting with PyTorch's CUDA graph implementation requires exposing PyTorch's CUDA streams to CUTLASS.\n", + "\n", + "To use CUDA graphs with Blackwell requires a version of PyTorch that supports Blackwell.\n", + "This can be obtained through:\n", + "- The [PyTorch NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch)\n", + "- [PyTorch 2.7 with CUDA 12.8 or later](https://pytorch.org/) (e.g., `pip3 install torch torchvision torchaudio --index-url 
https://download.pytorch.org/whl/cu128`)\n", + "- Building PyTorch directly with your version of CUDA." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "46b8fb6f-9ac5-4a3d-b765-b6476f182bf7", + "metadata": {}, + "outputs": [], + "source": [ + "# import torch for CUDA graphs\n", + "import torch\n", + "import cutlass\n", + "import cutlass.cute as cute\n", + "# import CUstream type from the cuda driver bindings\n", + "from cuda.bindings.driver import CUstream\n", + "# import the current_stream function from torch\n", + "from torch.cuda import current_stream" + ] + }, + { + "cell_type": "markdown", + "id": "bcf5e06e-1f5b-4d72-ad73-9b36efb78ca0", + "metadata": {}, + "source": [ + "## Kernel Creation\n", + "\n", + "We create a kernel which prints \"Hello world\" as well as a host function to launch the kernel.\n", + "We then compile the kernel for use in our graph, by passing in a default stream.\n", + "\n", + "Kernel compilation before graph capture is required since CUDA graphs cannot JIT compile kernels during graph execution." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0c2a6ca8-98d7-4837-b91f-af769ca8fcd8", + "metadata": {}, + "outputs": [], + "source": [ + "@cute.kernel\n", + "def hello_world_kernel():\n", + " \"\"\"\n", + " A kernel that prints hello world\n", + " \"\"\"\n", + " cute.printf(\"Hello world\")\n", + "\n", + "@cute.jit\n", + "def hello_world(stream : CUstream):\n", + " \"\"\"\n", + " Host function that launches our (1,1,1), (1,1,1) grid in stream\n", + " \"\"\"\n", + " hello_world_kernel().launch(grid=[1, 1, 1], block=[1, 1, 1], stream=stream)\n", + "\n", + "# Grab a stream from PyTorch, this will also initialize our context\n", + "# so we can omit cutlass.cuda.initialize_cuda_context()\n", + "stream = current_stream()\n", + "hello_world_compiled = cute.compile(hello_world, CUstream(stream.cuda_stream))" + ] + }, + { + "cell_type": "markdown", + "id": "ecc850af-09f8-4a29-9c93-ff31fbb9326f", + "metadata": {}, + "source": [ + "## Creating and replaying a CUDA Graph\n", + "\n", + "We create a stream through torch as well as a graph.\n", + "When we create the graph we can pass the stream we want to capture to torch. We similarly run the compiled kernel with the stream passed as a CUstream.\n", + "\n", + "Finally we can replay our graph and synchronize." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f673e5ae-42bb-44d0-b652-3280606181c4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hello world\n", + "Hello world\n" + ] + } + ], + "source": [ + "# Create a CUDA Graph\n", + "g = torch.cuda.CUDAGraph()\n", + "# Capture our graph\n", + "with torch.cuda.graph(g):\n", + " # Turn our torch Stream into a cuStream stream.\n", + " # This is done by getting the underlying CUstream with .cuda_stream\n", + " graph_stream = CUstream(current_stream().cuda_stream)\n", + " # Run 2 iterations of our compiled kernel\n", + " for _ in range(2):\n", + " # Run our kernel in the stream\n", + " hello_world_compiled(graph_stream)\n", + "\n", + "# Replay our graph\n", + "g.replay()\n", + "# Synchronize all streams (equivalent to cudaDeviceSynchronize() in C++)\n", + "torch.cuda.synchronize()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "db76d9c3-7617-4bf2-b326-11982e6803bf", + "metadata": {}, + "source": [ + "Our run results in the following execution when viewed in NSight Systems:\n", + "\n", + "![Image of two hello world kernels run back to back in a CUDA graph](images/cuda_graphs_image.png)\n", + "\n", + "We can observe the launch of the two kernels followed by a `cudaDeviceSynchronize()`.\n", + "\n", + "Now we can confirm that this minimizes some launch overhead:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3ebe15bf-dc97-42e9-913c-224ecfb472e8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", 
+ "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + 
"Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + 
"Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + 
"Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n" + ] + } + ], + "source": [ + "# Get our CUDA stream from PyTorch\n", + "stream = CUstream(current_stream().cuda_stream)\n", + "\n", + "# Create a larger CUDA Graph of 100 iterations\n", + "g = torch.cuda.CUDAGraph()\n", + "# Capture our graph\n", + "with torch.cuda.graph(g):\n", + " # Turn our torch Stream into a cuStream stream.\n", + " # This is done by getting the underlying CUstream with .cuda_stream\n", + " graph_stream = CUstream(current_stream().cuda_stream)\n", + " # Run 2 iterations of our compiled kernel\n", + " for _ in range(100):\n", + " # Run our kernel in the stream\n", + " hello_world_compiled(graph_stream)\n", + "\n", + "# Create CUDA events for measuring performance\n", + "start = torch.cuda.Event(enable_timing=True)\n", + "end = 
torch.cuda.Event(enable_timing=True)\n", + "\n", + "# Run our kernel to warm up the GPU\n", + "for _ in range(100):\n", + " hello_world_compiled(stream)\n", + "\n", + "# Record our start time\n", + "start.record()\n", + "# Run 100 kernels\n", + "for _ in range(100):\n", + " hello_world_compiled(stream)\n", + "# Record our end time\n", + "end.record()\n", + "# Synchronize (cudaDeviceSynchronize())\n", + "torch.cuda.synchronize()\n", + "\n", + "# Calculate the time spent when launching kernels in a stream\n", + "# Results are in ms\n", + "stream_time = start.elapsed_time(end) \n", + "\n", + "# Warmup our GPU again\n", + "g.replay()\n", + "# Record our start time\n", + "start.record()\n", + "# Run our graph\n", + "g.replay()\n", + "# Record our end time\n", + "end.record()\n", + "# Synchronize (cudaDeviceSynchronize())\n", + "torch.cuda.synchronize()\n", + "\n", + "# Calculate the time spent when launching kernels in a graph\n", + "# units are ms\n", + "graph_time = start.elapsed_time(end)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "12b8151a-46b3-4c99-9945-301f6b628131", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8.94% speedup when using CUDA graphs for this kernel!\n" + ] + } + ], + "source": [ + "# Print out speedup when using CUDA graphs\n", + "percent_speedup = (stream_time - graph_time) / graph_time\n", + "print(f\"{percent_speedup * 100.0:.2f}% speedup when using CUDA graphs for this kernel!\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git 
a/examples/python/CuTeDSL/notebooks/cute_layout_algebra.ipynb b/examples/python/CuTeDSL/notebooks/cute_layout_algebra.ipynb new file mode 100644 index 00000000..035776aa --- /dev/null +++ b/examples/python/CuTeDSL/notebooks/cute_layout_algebra.ipynb @@ -0,0 +1,1001 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using CuTe Layout Algebra With Python DSL\n", + "\n", + "Referencing the [01_layout.md](https://github.com/NVIDIA/cutlass/blob/main/media/docs/cpp/cute/01_layout.md) and [02_layout_algebra.md](https://github.com/NVIDIA/cutlass/blob/main/media/docs/cpp/cute/02_layout_algebra.md) documentation from CuTe C++, we summarize:\n", + "\n", + "In CuTe, a `Layout`:\n", + "- is defined by a pair of `Shape` and `Stride`,\n", + "- maps coordinates space(s) to an index space,\n", + "- supports both static (compile-time) and dynamic (runtime) values.\n", + "\n", + "CuTe also provides a powerful set of operations—the *Layout Algebra*—for combining and manipulating layouts, including:\n", + "- Layout composition: Functional composition of layouts,\n", + "- Layout \"divide\": Splitting a layout into two component layouts,\n", + "- Layout \"product\": Reproducing a layout according to another layout.\n", + "\n", + "In this notebook, we will demonstrate:\n", + "1. How to use CuTe’s key layout algebra operations with the Python DSL.\n", + "2. How static and dynamic layouts behave when printed or manipulated within the Python DSL.\n", + "\n", + "We use examples from [02_layout_algebra.md](https://github.com/NVIDIA/cutlass/blob/main/media/docs/cpp/cute/02_layout_algebra.md) which we recommend to the reader for additional details." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import cutlass\n", + "import cutlass.cute as cute" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Layout Algebra Operations\n", + "\n", + "These operations form the foundation of CuTe's layout manipulation capabilities, enabling:\n", + "- Efficient data tiling and partitioning,\n", + "- Separation of thread and data layouts with a canonical type to represent both,\n", + "- Native description and manipulation of hierarchical tensors of threads and data crucial for tensor core programs,\n", + "- Mixed static/dynamic layout transformations,\n", + "- Seamless integration of layout algebra with tensor operations,\n", + "- Expression of complex MMA and copies as canonical loops." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Coalesce\n", + "\n", + "The `coalesce` operation simplifies a layout by flattening and combining modes when possible, without changing its size or behavior as a function on the integers.\n", + "\n", + "It ensures the post-conditions:\n", + "- Preserve size: cute.size(layout) == cute.size(result),\n", + "- Flattened: depth(result) <= 1,\n", + "- Preserve functional: For all i, 0 <= i < cute.size(layout), layout(i) == result(i).\n", + "\n", + "#### Examples\n", + "\n", + "- Basic Coalesce Example :" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Original: (2,(1,6)):(1,(?,2))\n", + ">>> Coalesced: 12:1\n", + ">?? Original: (2,(1,6)):(1,(6,2))\n", + ">?? 
Coalesced: 12:1\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def coalesce_example():\n", + " \"\"\"\n", + " Demonstrates coalesce operation flattening and combining modes\n", + " \"\"\"\n", + " layout = cute.make_layout((2, (1, 6)), stride=(1, (cutlass.Int32(6), 2))) # Dynamic stride\n", + " result = cute.coalesce(layout)\n", + "\n", + " print(\">>> Original:\", layout)\n", + " cute.printf(\">?? Original: {}\", layout)\n", + " print(\">>> Coalesced:\", result)\n", + " cute.printf(\">?? Coalesced: {}\", result)\n", + "\n", + "coalesce_example()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Original: ((2,(3,4)),(3,2),1):((4,(8,24)),(2,6),12)\n", + ">>> Coalesced: (24,6):(4,2)\n", + ">>> Checking post-conditions:\n", + ">>> 1. Checking size remains the same after the coalesce operation:\n", + "Original size: 144, Coalesced size: 144\n", + ">>> 2. Checking depth of coalesced layout <= 1:\n", + "Depth of coalesced layout: 1\n", + ">>> 3. 
Checking layout functionality remains the same after the coalesce operation:\n", + "Index 0: original 0, coalesced 0\n", + "Index 1: original 4, coalesced 4\n", + "Index 2: original 8, coalesced 8\n", + "Index 3: original 12, coalesced 12\n", + "Index 4: original 16, coalesced 16\n", + "Index 5: original 20, coalesced 20\n", + "Index 6: original 24, coalesced 24\n", + "Index 7: original 28, coalesced 28\n", + "Index 8: original 32, coalesced 32\n", + "Index 9: original 36, coalesced 36\n", + "Index 10: original 40, coalesced 40\n", + "Index 11: original 44, coalesced 44\n", + "Index 12: original 48, coalesced 48\n", + "Index 13: original 52, coalesced 52\n", + "Index 14: original 56, coalesced 56\n", + "Index 15: original 60, coalesced 60\n", + "Index 16: original 64, coalesced 64\n", + "Index 17: original 68, coalesced 68\n", + "Index 18: original 72, coalesced 72\n", + "Index 19: original 76, coalesced 76\n", + "Index 20: original 80, coalesced 80\n", + "Index 21: original 84, coalesced 84\n", + "Index 22: original 88, coalesced 88\n", + "Index 23: original 92, coalesced 92\n", + "Index 24: original 2, coalesced 2\n", + "Index 25: original 6, coalesced 6\n", + "Index 26: original 10, coalesced 10\n", + "Index 27: original 14, coalesced 14\n", + "Index 28: original 18, coalesced 18\n", + "Index 29: original 22, coalesced 22\n", + "Index 30: original 26, coalesced 26\n", + "Index 31: original 30, coalesced 30\n", + "Index 32: original 34, coalesced 34\n", + "Index 33: original 38, coalesced 38\n", + "Index 34: original 42, coalesced 42\n", + "Index 35: original 46, coalesced 46\n", + "Index 36: original 50, coalesced 50\n", + "Index 37: original 54, coalesced 54\n", + "Index 38: original 58, coalesced 58\n", + "Index 39: original 62, coalesced 62\n", + "Index 40: original 66, coalesced 66\n", + "Index 41: original 70, coalesced 70\n", + "Index 42: original 74, coalesced 74\n", + "Index 43: original 78, coalesced 78\n", + "Index 44: original 82, coalesced 82\n", + 
"Index 45: original 86, coalesced 86\n", + "Index 46: original 90, coalesced 90\n", + "Index 47: original 94, coalesced 94\n", + "Index 48: original 4, coalesced 4\n", + "Index 49: original 8, coalesced 8\n", + "Index 50: original 12, coalesced 12\n", + "Index 51: original 16, coalesced 16\n", + "Index 52: original 20, coalesced 20\n", + "Index 53: original 24, coalesced 24\n", + "Index 54: original 28, coalesced 28\n", + "Index 55: original 32, coalesced 32\n", + "Index 56: original 36, coalesced 36\n", + "Index 57: original 40, coalesced 40\n", + "Index 58: original 44, coalesced 44\n", + "Index 59: original 48, coalesced 48\n", + "Index 60: original 52, coalesced 52\n", + "Index 61: original 56, coalesced 56\n", + "Index 62: original 60, coalesced 60\n", + "Index 63: original 64, coalesced 64\n", + "Index 64: original 68, coalesced 68\n", + "Index 65: original 72, coalesced 72\n", + "Index 66: original 76, coalesced 76\n", + "Index 67: original 80, coalesced 80\n", + "Index 68: original 84, coalesced 84\n", + "Index 69: original 88, coalesced 88\n", + "Index 70: original 92, coalesced 92\n", + "Index 71: original 96, coalesced 96\n", + "Index 72: original 6, coalesced 6\n", + "Index 73: original 10, coalesced 10\n", + "Index 74: original 14, coalesced 14\n", + "Index 75: original 18, coalesced 18\n", + "Index 76: original 22, coalesced 22\n", + "Index 77: original 26, coalesced 26\n", + "Index 78: original 30, coalesced 30\n", + "Index 79: original 34, coalesced 34\n", + "Index 80: original 38, coalesced 38\n", + "Index 81: original 42, coalesced 42\n", + "Index 82: original 46, coalesced 46\n", + "Index 83: original 50, coalesced 50\n", + "Index 84: original 54, coalesced 54\n", + "Index 85: original 58, coalesced 58\n", + "Index 86: original 62, coalesced 62\n", + "Index 87: original 66, coalesced 66\n", + "Index 88: original 70, coalesced 70\n", + "Index 89: original 74, coalesced 74\n", + "Index 90: original 78, coalesced 78\n", + "Index 91: original 82, 
coalesced 82\n", + "Index 92: original 86, coalesced 86\n", + "Index 93: original 90, coalesced 90\n", + "Index 94: original 94, coalesced 94\n", + "Index 95: original 98, coalesced 98\n", + "Index 96: original 8, coalesced 8\n", + "Index 97: original 12, coalesced 12\n", + "Index 98: original 16, coalesced 16\n", + "Index 99: original 20, coalesced 20\n", + "Index 100: original 24, coalesced 24\n", + "Index 101: original 28, coalesced 28\n", + "Index 102: original 32, coalesced 32\n", + "Index 103: original 36, coalesced 36\n", + "Index 104: original 40, coalesced 40\n", + "Index 105: original 44, coalesced 44\n", + "Index 106: original 48, coalesced 48\n", + "Index 107: original 52, coalesced 52\n", + "Index 108: original 56, coalesced 56\n", + "Index 109: original 60, coalesced 60\n", + "Index 110: original 64, coalesced 64\n", + "Index 111: original 68, coalesced 68\n", + "Index 112: original 72, coalesced 72\n", + "Index 113: original 76, coalesced 76\n", + "Index 114: original 80, coalesced 80\n", + "Index 115: original 84, coalesced 84\n", + "Index 116: original 88, coalesced 88\n", + "Index 117: original 92, coalesced 92\n", + "Index 118: original 96, coalesced 96\n", + "Index 119: original 100, coalesced 100\n", + "Index 120: original 10, coalesced 10\n", + "Index 121: original 14, coalesced 14\n", + "Index 122: original 18, coalesced 18\n", + "Index 123: original 22, coalesced 22\n", + "Index 124: original 26, coalesced 26\n", + "Index 125: original 30, coalesced 30\n", + "Index 126: original 34, coalesced 34\n", + "Index 127: original 38, coalesced 38\n", + "Index 128: original 42, coalesced 42\n", + "Index 129: original 46, coalesced 46\n", + "Index 130: original 50, coalesced 50\n", + "Index 131: original 54, coalesced 54\n", + "Index 132: original 58, coalesced 58\n", + "Index 133: original 62, coalesced 62\n", + "Index 134: original 66, coalesced 66\n", + "Index 135: original 70, coalesced 70\n", + "Index 136: original 74, coalesced 74\n", + "Index 
137: original 78, coalesced 78\n", + "Index 138: original 82, coalesced 82\n", + "Index 139: original 86, coalesced 86\n", + "Index 140: original 90, coalesced 90\n", + "Index 141: original 94, coalesced 94\n", + "Index 142: original 98, coalesced 98\n", + "Index 143: original 102, coalesced 102\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def coalesce_post_conditions():\n", + " \"\"\"\n", + " Demonstrates coalesce operation's 3 post-conditions:\n", + " 1. size(@a result) == size(@a layout)\n", + " 2. depth(@a result) <= 1\n", + " 3. for all i, 0 <= i < size(@a layout), @a result(i) == @a layout(i)\n", + " \"\"\"\n", + " layout = cute.make_layout(\n", + " ((2, (3, 4)), (3, 2), 1),\n", + " stride=((4, (8, 24)), (2, 6), 12)\n", + " )\n", + " result = cute.coalesce(layout)\n", + "\n", + " print(\">>> Original:\", layout)\n", + " print(\">>> Coalesced:\", result)\n", + "\n", + " print(\">>> Checking post-conditions:\")\n", + " print(\">>> 1. Checking size remains the same after the coalesce operation:\")\n", + " original_size = cute.size(layout)\n", + " coalesced_size = cute.size(result)\n", + " print(f\"Original size: {original_size}, Coalesced size: {coalesced_size}\")\n", + " assert coalesced_size == original_size, \\\n", + " f\"Size mismatch: original {original_size}, coalesced {coalesced_size}\"\n", + " \n", + " print(\">>> 2. Checking depth of coalesced layout <= 1:\")\n", + " depth = cute.depth(result)\n", + " print(f\"Depth of coalesced layout: {depth}\")\n", + " assert depth <= 1, f\"Depth of coalesced layout should be <= 1, got {depth}\"\n", + "\n", + " print(\">>> 3. 
Checking layout functionality remains the same after the coalesce operation:\")\n", + " for i in range(original_size):\n", + " original_value = layout(i)\n", + " coalesced_value = result(i)\n", + " print(f\"Index {i}: original {original_value}, coalesced {coalesced_value}\")\n", + " assert coalesced_value == original_value, \\\n", + " f\"Value mismatch at index {i}: original {original_value}, coalesced {coalesced_value}\"\n", + "\n", + "coalesce_post_conditions()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- By-mode Coalesce Example :" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Original: (2,(1,6)):(1,(6,2))\n", + ">>> Coalesced Result: (2,6):(1,2)\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def bymode_coalesce_example():\n", + " \"\"\"\n", + " Demonstrates by-mode coalescing\n", + " \"\"\"\n", + " layout = cute.make_layout((2, (1, 6)), stride=(1, (6, 2)))\n", + "\n", + " # Coalesce with mode-wise profile (1,1) = coalesce both modes\n", + " result = cute.coalesce(layout, target_profile=(1, 1))\n", + " \n", + " # Print results\n", + " print(\">>> Original: \", layout)\n", + " print(\">>> Coalesced Result: \", result)\n", + "\n", + "bymode_coalesce_example()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. 
Composition\n", + "\n", + "`Composition` of Layout `A` with Layout `B` creates a new layout `R = A ◦ B` where:\n", + "- The shape of `B` is compatible with the shape of `R` so that all coordinates of `B` can also be used as coordinates of `R`,\n", + "- `R(c) = A(B(c))` for all coordinates `c` in `B`'s domain.\n", + "\n", + "Layout composition is very useful for reshaping and reordering layouts.\n", + "\n", + "#### Examples\n", + "\n", + "- Basic Composition Example :" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Layout A: (6,2):(?,2)\n", + ">>> Layout B: (4,3):(3,1)\n", + ">>> Composition R = A ◦ B: ((2,2),3):((?{div=3},2),?)\n", + ">?? Layout A: (6,2):(8,2)\n", + ">?? Layout B: (4,3):(3,1)\n", + ">?? Composition R: ((2,2),3):((24,2),8)\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def composition_example():\n", + " \"\"\"\n", + " Demonstrates basic layout composition R = A ◦ B\n", + " \"\"\"\n", + " A = cute.make_layout((6, 2), stride=(cutlass.Int32(8), 2)) # Dynamic stride\n", + " B = cute.make_layout((4, 3), stride=(3, 1))\n", + " R = cute.composition(A, B)\n", + "\n", + " # Print static and dynamic information\n", + " print(\">>> Layout A:\", A)\n", + " cute.printf(\">?? Layout A: {}\", A)\n", + " print(\">>> Layout B:\", B) \n", + " cute.printf(\">?? Layout B: {}\", B)\n", + " print(\">>> Composition R = A ◦ B:\", R)\n", + " cute.printf(\">?? Composition R: {}\", R)\n", + "\n", + "composition_example()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Comparing Composition with static and dynamic layouts :\n", + "\n", + "In this case, the results may look different but are mathematically the same. The 1s in the shape don't affect the layout as a mathematical function on the integers. 
In the dynamic case, CuTe can not coalesce the dynamic size-1 modes to \"simplify\" the layout because it is not valid to do so for all possible dynamic values that parameter could realize at runtime." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Static composition:\n", + ">>> A_static: (10,2):(16,4)\n", + ">>> B_static: (5,4):(1,5)\n", + ">>> R_static: (5,(2,2)):(16,(80,4))\n", + ">?? Dynamic composition:\n", + ">?? A_dynamic: (10,2):(16,4)\n", + ">?? B_dynamic: (5,4):(1,5)\n", + ">?? R_dynamic: ((5,1),(2,2)):((16,4),(80,4))\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def composition_static_vs_dynamic_layout():\n", + " \"\"\"\n", + " Shows difference between static and dynamic composition results\n", + " \"\"\"\n", + " # Static version - using compile-time values\n", + " A_static = cute.make_layout(\n", + " (10, 2), \n", + " stride=(16, 4)\n", + " )\n", + " B_static = cute.make_layout(\n", + " (5, 4), \n", + " stride=(1, 5)\n", + " )\n", + " R_static = cute.composition(A_static, B_static)\n", + "\n", + " # Static print shows compile-time info\n", + " print(\">>> Static composition:\")\n", + " print(\">>> A_static: \", A_static)\n", + " print(\">>> B_static: \", B_static)\n", + " print(\">>> R_static: \", R_static)\n", + "\n", + " # Dynamic version - using runtime Int32 values\n", + " A_dynamic = cute.make_layout(\n", + " (cutlass.Int32(10), cutlass.Int32(2)),\n", + " stride=(cutlass.Int32(16), cutlass.Int32(4))\n", + " )\n", + " B_dynamic = cute.make_layout(\n", + " (cutlass.Int32(5), cutlass.Int32(4)),\n", + " stride=(cutlass.Int32(1), cutlass.Int32(5))\n", + " )\n", + " R_dynamic = cute.composition(A_dynamic, B_dynamic)\n", + " \n", + " # Dynamic printf shows runtime values\n", + " cute.printf(\">?? Dynamic composition:\")\n", + " cute.printf(\">?? A_dynamic: {}\", A_dynamic)\n", + " cute.printf(\">?? 
B_dynamic: {}\", B_dynamic)\n", + " cute.printf(\">?? R_dynamic: {}\", R_dynamic)\n", + "\n", + "composition_static_vs_dynamic_layout()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- By-mode Composition Example :\n", + "\n", + "By-mode composition allows us to apply composition operations to individual modes of a layout. This is particularly useful when you want to manipulate specific modes layout independently (e.g. rows and columns).\n", + "\n", + "In the context of CuTe, by-mode composition is achieved by using a `Tiler`, which can be a layout or a tuple of layouts. The leaves of the `Tiler` tuple specify how the corresponding mode of the target layout should be composed, allowing for sublayouts to be treated independently." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Layout A: (?,(?,?)):(?,(?,?))\n", + ">>> Tiler: (3, 8)\n", + ">>> By-mode Composition Result: (3,(?,?)):(?,(?,?))\n", + ">?? Layout A: (12,(4,8)):(59,(13,1))\n", + ">?? Tiler: (3,8)\n", + ">?? By-mode Composition Result: (3,(4,2)):(59,(13,1))\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def bymode_composition_example():\n", + " \"\"\"\n", + " Demonstrates by-mode composition using a tiler\n", + " \"\"\"\n", + " # Define the original layout A\n", + " A = cute.make_layout(\n", + " (cutlass.Int32(12), (cutlass.Int32(4), cutlass.Int32(8))), \n", + " stride=(cutlass.Int32(59), (cutlass.Int32(13), cutlass.Int32(1)))\n", + " )\n", + "\n", + " # Define the tiler for by-mode composition\n", + " tiler = (3, 8) # Apply 3:1 to mode-0 and 8:1 to mode-1\n", + "\n", + " # Apply by-mode composition\n", + " result = cute.composition(A, tiler)\n", + "\n", + " # Print static and dynamic information\n", + " print(\">>> Layout A:\", A)\n", + " cute.printf(\">?? Layout A: {}\", A)\n", + " print(\">>> Tiler:\", tiler)\n", + " cute.printf(\">?? 
Tiler: {}\", tiler)\n", + " print(\">>> By-mode Composition Result:\", result)\n", + " cute.printf(\">?? By-mode Composition Result: {}\", result)\n", + "\n", + "bymode_composition_example()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Division (Splitting into Tiles)\n", + "\n", + "The Division operation in CuTe is used to split a layout into tiles, which is particularly useful for partitioning data across threads or memory hierarchies.\n", + "\n", + "#### Examples :\n", + "\n", + "- Logical divide :\n", + "\n", + "When applied to two Layouts, `logical_divide` splits a layout into two modes -- the first mode contains the elements pointed to by the tiler, and the second mode contains the remaining elements." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Layout: (4,2,3):(2,1,8)\n", + ">>> Tiler : 4:2\n", + ">>> Logical Divide Result: ((2,2),(2,3)):((4,1),(2,8))\n", + ">?? Logical Divide Result: ((2,2),(2,3)):((4,1),(2,8))\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def logical_divide_1d_example():\n", + " \"\"\"\n", + " Demonstrates 1D logical divide\n", + " \"\"\"\n", + " # Define the original layout\n", + " layout = cute.make_layout((4, 2, 3), stride=(2, 1, 8)) # (4,2,3):(2,1,8)\n", + " \n", + " # Define the tiler\n", + " tiler = cute.make_layout(4, stride=2) # Apply to layout 4:2\n", + " \n", + " # Apply logical divide\n", + " result = cute.logical_divide(layout, tiler=tiler)\n", + " \n", + " # Print results\n", + " print(\">>> Layout:\", layout)\n", + " print(\">>> Tiler :\", tiler)\n", + " print(\">>> Logical Divide Result:\", result)\n", + " cute.printf(\">?? 
Logical Divide Result: {}\", result)\n", + "\n", + "logical_divide_1d_example()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When applied to a Layout and a `Tiler` tuple, `logical_divide` applies itself to the leaves of the `Tiler`and the corresponding mode of the target Layout. This means that the sublayouts are split independently according to the layouts within the `Tiler`." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Layout: (9,(4,8)):(59,(13,1))\n", + ">>> Tiler : (, )\n", + ">>> Logical Divide Result: ((3,3),((2,4),(2,2))):((177,59),((13,2),(26,1)))\n", + ">?? Logical Divide Result: ((3,3),((2,4),(2,2))):((177,59),((13,2),(26,1)))\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def logical_divide_2d_example():\n", + " \"\"\"\n", + " Demonstrates 2D logical divide :\n", + " Layout Shape : (M, N, L, ...)\n", + " Tiler Shape : \n", + " Result Shape : ((TileM,RestM), (TileN,RestN), L, ...)\n", + " \"\"\"\n", + " # Define the original layout\n", + " layout = cute.make_layout((9, (4, 8)), stride=(59, (13, 1))) # (9,(4,8)):(59,(13,1))\n", + " \n", + " # Define the tiler\n", + " tiler = (cute.make_layout(3, stride=3), # Apply to mode-0 layout 3:3\n", + " cute.make_layout((2, 4), stride=(1, 8))) # Apply to mode-1 layout (2,4):(1,8)\n", + " \n", + " # Apply logical divide\n", + " result = cute.logical_divide(layout, tiler=tiler)\n", + " \n", + " # Print results\n", + " print(\">>> Layout:\", layout)\n", + " print(\">>> Tiler :\", tiler)\n", + " print(\">>> Logical Divide Result:\", result)\n", + " cute.printf(\">?? 
Logical Divide Result: {}\", result)\n", + "\n", + "logical_divide_2d_example()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Zipped, tiled, and flat divide are flavors of `logical_divide` that potentially rearrange modes into more convenient forms.\n", + "\n", + "- Zipped Divide :" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Layout: (9,(4,8)):(59,(13,1))\n", + ">>> Tiler : (, )\n", + ">>> Zipped Divide Result: ((3,(2,4)),(3,(2,2))):((177,(13,2)),(59,(26,1)))\n", + ">?? Zipped Divide Result: ((3,(2,4)),(3,(2,2))):((177,(13,2)),(59,(26,1)))\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def zipped_divide_example():\n", + " \"\"\"\n", + " Demonstrates zipped divide :\n", + " Layout Shape : (M, N, L, ...)\n", + " Tiler Shape : \n", + " Result Shape : ((TileM,TileN), (RestM,RestN,L,...))\n", + " \"\"\"\n", + " # Define the original layout\n", + " layout = cute.make_layout((9, (4, 8)), stride=(59, (13, 1))) # (9,(4,8)):(59,(13,1))\n", + " \n", + " # Define the tiler\n", + " tiler = (cute.make_layout(3, stride=3), # Apply to mode-0 layout 3:3\n", + " cute.make_layout((2, 4), stride=(1, 8))) # Apply to mode-1 layout (2,4):(1,8)\n", + " \n", + " # Apply zipped divide\n", + " result = cute.zipped_divide(layout, tiler=tiler)\n", + " \n", + " # Print results\n", + " print(\">>> Layout:\", layout)\n", + " print(\">>> Tiler :\", tiler)\n", + " print(\">>> Zipped Divide Result:\", result)\n", + " cute.printf(\">?? 
Zipped Divide Result: {}\", result)\n", + "\n", + "zipped_divide_example()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Tiled Divide :" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Layout: (9,(4,8)):(59,(13,1))\n", + ">>> Tiler : (, )\n", + ">>> Tiled Divide Result: ((3,(2,4)),3,(2,2)):((177,(13,2)),59,(26,1))\n", + ">?? Tiled Divide Result: ((3,(2,4)),3,(2,2)):((177,(13,2)),59,(26,1))\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def tiled_divide_example():\n", + " \"\"\"\n", + " Demonstrates tiled divide :\n", + " Layout Shape : (M, N, L, ...)\n", + " Tiler Shape : \n", + " Result Shape : ((TileM,TileN), RestM, RestN, L, ...)\n", + " \"\"\"\n", + " # Define the original layout\n", + " layout = cute.make_layout((9, (4, 8)), stride=(59, (13, 1))) # (9,(4,8)):(59,(13,1))\n", + " \n", + " # Define the tiler\n", + " tiler = (cute.make_layout(3, stride=3), # Apply to mode-0 layout 3:3\n", + " cute.make_layout((2, 4), stride=(1, 8))) # Apply to mode-1 layout (2,4):(1,8)\n", + " \n", + " # Apply tiled divide\n", + " result = cute.tiled_divide(layout, tiler=tiler)\n", + " \n", + " # Print results\n", + " print(\">>> Layout:\", layout)\n", + " print(\">>> Tiler :\", tiler)\n", + " print(\">>> Tiled Divide Result:\", result)\n", + " cute.printf(\">?? Tiled Divide Result: {}\", result)\n", + "\n", + "tiled_divide_example()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Flat Divide :" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Layout: (9,(4,8)):(59,(13,1))\n", + ">>> Tiler : (, )\n", + ">>> Flat Divide Result: (3,(2,4),3,(2,2)):(177,(13,2),59,(26,1))\n", + ">?? 
Flat Divide Result: (3,(2,4),3,(2,2)):(177,(13,2),59,(26,1))\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def flat_divide_example():\n", + " \"\"\"\n", + " Demonstrates flat divide :\n", + " Layout Shape : (M, N, L, ...)\n", + " Tiler Shape : \n", + " Result Shape : (TileM, TileN, RestM, RestN, L, ...)\n", + " \"\"\"\n", + " # Define the original layout\n", + " layout = cute.make_layout((9, (4, 8)), stride=(59, (13, 1))) # (9,(4,8)):(59,(13,1))\n", + " \n", + " # Define the tiler\n", + " tiler = (cute.make_layout(3, stride=3), # Apply to mode-0 layout 3:3\n", + " cute.make_layout((2, 4), stride=(1, 8))) # Apply to mode-1 layout (2,4):(1,8)\n", + " \n", + " # Apply flat divide\n", + " result = cute.flat_divide(layout, tiler=tiler)\n", + " \n", + " # Print results\n", + " print(\">>> Layout:\", layout)\n", + " print(\">>> Tiler :\", tiler)\n", + " print(\">>> Flat Divide Result:\", result)\n", + " cute.printf(\">?? Flat Divide Result: {}\", result)\n", + "\n", + "flat_divide_example()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Product (Reproducing a Tile)\n", + "\n", + "The Product operation in CuTe is used to reproduce one layout according to another layout. It creates a new layout where:\n", + "- The first mode is the original layout A.\n", + "- The second mode is a restrided layout B that points to the origin of a \"unique replication\" of A.\n", + "\n", + "This is particularly useful for repeating layouts of threads across a tile of data for creating \"repeat\" patterns.\n", + "\n", + "#### Examples\n", + "\n", + "- Logical Product :" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Layout: (2,2):(4,1)\n", + ">>> Tiler : 6:1\n", + ">>> Logical Product Result: ((2,2),(2,3)):((4,1),(2,8))\n", + ">?? 
Logical Product Result: ((2,2),(2,3)):((4,1),(2,8))\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def logical_product_1d_example():\n", + " \"\"\"\n", + " Demonstrates 1D logical product\n", + " \"\"\"\n", + " # Define the original layout\n", + " layout = cute.make_layout((2, 2), stride=(4, 1)) # (2,2):(4,1)\n", + " \n", + " # Define the tiler\n", + " tiler = cute.make_layout(6, stride=1) # Apply to layout 6:1\n", + " \n", + " # Apply logical product\n", + " result = cute.logical_product(layout, tiler=tiler)\n", + " \n", + " # Print results\n", + " print(\">>> Layout:\", layout)\n", + " print(\">>> Tiler :\", tiler)\n", + " print(\">>> Logical Product Result:\", result)\n", + " cute.printf(\">?? Logical Product Result: {}\", result)\n", + "\n", + "logical_product_1d_example()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Blocked and Raked Product :\n", + " \n", + " - Blocked Product: Combines the modes of A and B in a block-like fashion, preserving the semantic meaning of the modes by reassociating them after the product.\n", + " - Raked Product: Combines the modes of A and B in an interleaved or \"raked\" fashion, creating a cyclic distribution of the tiles." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Layout: (2,5):(5,1)\n", + ">>> Tiler : (3,4):(1,3)\n", + ">>> Blocked Product Result: ((2,3),(5,4)):((5,10),(1,30))\n", + ">>> Raked Product Result: ((3,2),(4,5)):((10,5),(30,1))\n", + ">?? Blocked Product Result: ((2,3),(5,4)):((5,10),(1,30))\n", + ">?? 
Raked Product Result: ((3,2),(4,5)):((10,5),(30,1))\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def blocked_raked_product_example():\n", + " \"\"\"\n", + " Demonstrates blocked and raked products\n", + " \"\"\"\n", + " # Define the original layout\n", + " layout = cute.make_layout((2, 5), stride=(5, 1))\n", + " \n", + " # Define the tiler\n", + " tiler = cute.make_layout((3, 4), stride=(1, 3))\n", + " \n", + " # Apply blocked product\n", + " blocked_result = cute.blocked_product(layout, tiler=tiler)\n", + "\n", + " # Apply raked product\n", + " raked_result = cute.raked_product(layout, tiler=tiler)\n", + " \n", + " # Print results\n", + " print(\">>> Layout:\", layout)\n", + " print(\">>> Tiler :\", tiler)\n", + " print(\">>> Blocked Product Result:\", blocked_result)\n", + " print(\">>> Raked Product Result:\", raked_result)\n", + " cute.printf(\">?? Blocked Product Result: {}\", blocked_result)\n", + " cute.printf(\">?? Raked Product Result: {}\", raked_result)\n", + "\n", + "blocked_raked_product_example()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Zipped, tiled, and flat product :\n", + " \n", + " - Similar to divide operations, zipped, tiled, and flat product are flavors of `logical_product` that potentially rearrange modes into more convenient forms." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Layout: (2,5):(5,1)\n", + ">>> Tiler : (3,4):(1,3)\n", + ">>> Zipped Product Result: ((2,5),(3,4)):((5,1),(10,30))\n", + ">>> Tiled Product Result: ((2,5),3,4):((5,1),10,30)\n", + ">>> Flat Product Result: (2,5,3,4):(5,1,10,30)\n", + ">?? Zipped Product Result: ((2,5),(3,4)):((5,1),(10,30))\n", + ">?? Tiled Product Result: ((2,5),3,4):((5,1),10,30)\n", + ">?? 
Flat Product Result: (2,5,3,4):(5,1,10,30)\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def zipped_tiled_flat_product_example():\n", + " \"\"\"\n", + " Demonstrates zipped, tiled, and flat products\n", + " Layout Shape : (M, N, L, ...)\n", + " Tiler Shape : \n", + "\n", + " zipped_product : ((M,N), (TileM,TileN,L,...))\n", + " tiled_product : ((M,N), TileM, TileN, L, ...)\n", + " flat_product : (M, N, TileM, TileN, L, ...)\n", + " \"\"\"\n", + " # Define the original layout\n", + " layout = cute.make_layout((2, 5), stride=(5, 1))\n", + " \n", + " # Define the tiler\n", + " tiler = cute.make_layout((3, 4), stride=(1, 3))\n", + "\n", + " # Apply zipped product\n", + " zipped_result = cute.zipped_product(layout, tiler=tiler)\n", + " \n", + " # Apply tiled product\n", + " tiled_result = cute.tiled_product(layout, tiler=tiler)\n", + " \n", + " # Apply flat product\n", + " flat_result = cute.flat_product(layout, tiler=tiler)\n", + "\n", + " # Print results\n", + " print(\">>> Layout:\", layout)\n", + " print(\">>> Tiler :\", tiler)\n", + " print(\">>> Zipped Product Result:\", zipped_result)\n", + " print(\">>> Tiled Product Result:\", tiled_result)\n", + " print(\">>> Flat Product Result:\", flat_result)\n", + " cute.printf(\">?? Zipped Product Result: {}\", zipped_result)\n", + " cute.printf(\">?? Tiled Product Result: {}\", tiled_result)\n", + " cute.printf(\">?? 
Flat Product Result: {}\", flat_result)\n", + "\n", + "zipped_tiled_flat_product_example()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pythondsl_venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/python/CuTeDSL/notebooks/data_types.ipynb b/examples/python/CuTeDSL/notebooks/data_types.ipynb new file mode 100644 index 00000000..e618885d --- /dev/null +++ b/examples/python/CuTeDSL/notebooks/data_types.ipynb @@ -0,0 +1,310 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import List\n", + "\n", + "import cutlass\n", + "import cutlass.cute as cute" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Understanding data structure in CuTe DSL\n", + "\n", + "In most cases, data structures in CuTe DSL work the same as Python data structures with the notable difference that Python data structures in most cases are considered as static data which are interpreted by the DSL compiler embedded inside Python interpreter.\n", + "\n", + "To differentiate between compile-time and runtime values, CuTe DSL introduces primitive types that \n", + "represent dynamic values in JIT-compiled code.\n", + "\n", + "CuTe DSL provides a comprehensive set of primitive numeric types for representing dynamic values at \n", + "runtime. 
These types are formally defined within the CuTe DSL typing system:\n", + "\n", + "### Integer Types\n", + "- `Int8` - 8-bit signed integer\n", + "- `Int16` - 16-bit signed integer \n", + "- `Int32` - 32-bit signed integer\n", + "- `Int64` - 64-bit signed integer\n", + "- `Int128` - 128-bit signed integer\n", + "- `Uint8` - 8-bit unsigned integer\n", + "- `Uint16` - 16-bit unsigned integer\n", + "- `Uint32` - 32-bit unsigned integer\n", + "- `Uint64` - 64-bit unsigned integer\n", + "- `Uint128` - 128-bit unsigned integer\n", + "\n", + "### Floating Point Types\n", + "- `Float16` - 16-bit floating point\n", + "- `Float32` - 32-bit floating point \n", + "- `Float64` - 64-bit floating point\n", + "- `BFloat16` - Brain Floating Point format (16-bit)\n", + "- `TFloat32` - Tensor Float32 format (reduced precision format used in tensor operations)\n", + "- `Float8E4M3` - 8-bit floating point with 4-bit exponent and 3-bit mantissa\n", + "- `Float8E5M2` - 8-bit floating point with 5-bit exponent and 2-bit mantissa\n", + "\n", + "These specialized types are designed to represent dynamic values in CuTe DSL code that will be \n", + "evaluated at runtime, in contrast to Python's built-in numeric types which are evaluated during \n", + "compilation.\n", + "\n", + "### Example usage:\n", + "\n", + "```python\n", + "x = cutlass.Int32(5) # Creates a 32-bit integer\n", + "y = cutlass.Float32(3.14) # Creates a 32-bit float\n", + "\n", + "@cute.jit\n", + "def foo(a: cutlass.Int32): # annotate `a` as 32-bit integer passed to jit function via ABI\n", + " ...\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a(static) = ?\n", + "b(static) = ?\n", + "a(dynamic) = 3.140000\n", + "b(dynamic) = 5\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def bar():\n", + " a = cutlass.Float32(3.14)\n", + " print(\"a(static) =\", a) # prints `a(static) = ?`\n", + " 
cute.printf(\"a(dynamic) = {}\", a) # prints `a(dynamic) = 3.140000`\n", + "\n", + " b = cutlass.Int32(5)\n", + " print(\"b(static) =\", b) # prints `b(static) = ?`\n", + " cute.printf(\"b(dynamic) = {}\", b) # prints `b(dynamic) = 5`\n", + "\n", + "bar()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Type Conversion API\n", + "\n", + "CUTLASS numeric types provide type conversion through the `to()` method available on all Numeric types. This allows you to convert between different numeric data types at runtime.\n", + "\n", + "Syntax:\n", + "\n", + "```python\n", + "new_value = value.to(target_type)\n", + "```\n", + "\n", + "The `to()` method supports conversion between:\n", + "- Integer types (Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64)\n", + "- Floating point types (Float16, Float32, Float64, BFloat16)\n", + "- Mixed integer/floating point conversions\n", + "\n", + "Note that when converting from floating point to integer types, the decimal portion is truncated. When converting between types with different ranges, values may be clamped or lose precision if they exceed the target type's representable range."
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Int32(42) => Float32(42.000000)\n", + "Float32(3.140000) => Int32(3)\n", + "Int32(127) => Int8(127)\n", + "Int32(300) => Int8(44) (truncated due to range limitation)\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def type_conversion():\n", + " # Convert from Int32 to Float32\n", + " x = cutlass.Int32(42)\n", + " y = x.to(cutlass.Float32)\n", + " cute.printf(\"Int32({}) => Float32({})\", x, y)\n", + "\n", + " # Convert from Float32 to Int32\n", + " a = cutlass.Float32(3.14)\n", + " b = a.to(cutlass.Int32)\n", + " cute.printf(\"Float32({}) => Int32({})\", a, b)\n", + "\n", + " # Convert from Int32 to Int8\n", + " c = cutlass.Int32(127)\n", + " d = c.to(cutlass.Int8)\n", + " cute.printf(\"Int32({}) => Int8({})\", c, d)\n", + "\n", + " # Convert from Int32 to Int8 with value exceeding Int8 range\n", + " e = cutlass.Int32(300)\n", + " f = e.to(cutlass.Int8)\n", + " cute.printf(\"Int32({}) => Int8({}) (truncated due to range limitation)\", e, f)\n", + "\n", + "type_conversion()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Operator Overloading\n", + "\n", + "CUTLASS numeric types support Python's built-in operators, allowing you to write natural mathematical expressions. 
The operators work with both CUTLASS numeric types and Python native numeric types.\n", + "\n", + "Supported operators include:\n", + "- Arithmetic: `+`, `-`, `*`, `/`, `//`, `%`, `**`\n", + "- Comparison: `<`, `<=`, `==`, `!=`, `>=`, `>`\n", + "- Bitwise: `&`, `|`, `^`, `<<`, `>>`\n", + "- Unary: `-` (negation), `~` (bitwise NOT)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a: Int32(10), b: Int32(3)\n", + "x: Float32(5.500000)\n", + "\n", + "a + b = 13\n", + "x * 2 = 11.000000\n", + "a + x = 15.500000 (Int32 + Float32 promotes to Float32)\n", + "a / b = 3.333333\n", + "x / 2.0 = 2.750000\n", + "a > b = 1\n", + "a & b = 2\n", + "-a = -10\n", + "~a = -11\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def operator_demo():\n", + " # Arithmetic operators\n", + " a = cutlass.Int32(10)\n", + " b = cutlass.Int32(3)\n", + " cute.printf(\"a: Int32({}), b: Int32({})\", a, b)\n", + "\n", + " x = cutlass.Float32(5.5)\n", + " cute.printf(\"x: Float32({})\", x)\n", + "\n", + " cute.printf(\"\")\n", + "\n", + " sum_result = a + b\n", + " cute.printf(\"a + b = {}\", sum_result)\n", + "\n", + " y = x * 2 # Multiplying with Python native type\n", + " cute.printf(\"x * 2 = {}\", y)\n", + "\n", + " # Mixed type arithmetic (Int32 + Float32) that integer is converted into float32\n", + " mixed_result = a + x\n", + " cute.printf(\"a + x = {} (Int32 + Float32 promotes to Float32)\", mixed_result)\n", + "\n", + " # Division with Int32 (note: integer division)\n", + " div_result = a / b\n", + " cute.printf(\"a / b = {}\", div_result)\n", + "\n", + " # Float division\n", + " float_div = x / cutlass.Float32(2.0)\n", + " cute.printf(\"x / 2.0 = {}\", float_div)\n", + "\n", + " # Comparison operators\n", + " is_greater = a > b\n", + " cute.printf(\"a > b = {}\", is_greater)\n", + "\n", + " # Bitwise operators\n", + " bit_and = a & b\n", + " cute.printf(\"a & b = {}\", 
bit_and)\n", + "\n", + " neg_a = -a\n", + " cute.printf(\"-a = {}\", neg_a)\n", + "\n", + " not_a = ~a\n", + " cute.printf(\"~a = {}\", not_a)\n", + "\n", + "operator_demo()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/python/CuTeDSL/notebooks/elementwise_add.ipynb b/examples/python/CuTeDSL/notebooks/elementwise_add.ipynb new file mode 100644 index 00000000..9cebc273 --- /dev/null +++ b/examples/python/CuTeDSL/notebooks/elementwise_add.ipynb @@ -0,0 +1,838 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import torch\n", + "from functools import partial\n", + "\n", + "import cutlass\n", + "import cutlass.cute as cute\n", + "from cutlass.cute.runtime import from_dlpack" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tutorial: Elementwise Add Kernel in CuTe DSL\n", + "\n", + "This tutorial demonstrates how to implement a simple elementwise\n", + "addition kernel using the CuTe DSL (Domain Specific Language).\n", + "\n", + "\n", + "\n", + "Elementwise Addition\n", + "---------------------\n", + "\n", + "Elementwise addition is a fundamental operation in linear algebra.\n", + "Given two tensors of the same shape, the operation performs element-wise\n", + "addition to produce a result tensor of the same shape.\n", + "\n", + "For two 2D tensors :math:`A` and :math:`B` of shape :math:`(M, N)`,\n", + "the elementwise addition operation :math:`C = A + B` is defined as:\n", + "\n", + 
"$\n", + " C_{i,j} = A_{i,j} + B_{i,j}\n", + "$\n", + "\n", + "where:\n", + "\n", + "- $i \\in [0, M-1]$ represents the row index\n", + "- $j \\in [0, N-1]$ represents the column index\n", + "- $A_{i,j}$, $B_{i,j}$, and $C_{i,j}$ are the elements at position $(i,j)$ \n", + " in tensors $A$, $B$, and $C$ respectively\n", + "\n", + "This operation is performed independently for each element position,\n", + "making it highly parallelizable and well-suited for GPU implementation.\n", + "\n", + "Naive Elementwise Add Kernel\n", + "-----------------------------\n", + "\n", + "Let's start with a naive implementation that loads each element from\n", + "$A$ and $B$, adds them, and stores the result back to $C$." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "@cute.kernel\n", + "def naive_elementwise_add_kernel(\n", + " gA: cute.Tensor,\n", + " gB: cute.Tensor,\n", + " gC: cute.Tensor,\n", + "):\n", + " tidx, _, _ = cute.arch.thread_idx()\n", + " bidx, _, _ = cute.arch.block_idx()\n", + " bdim, _, _ = cute.arch.block_dim()\n", + "\n", + " thread_idx = bidx * bdim + tidx\n", + "\n", + " # Map thread index to logical index of input tensor\n", + " m, n = gA.shape\n", + " ni = thread_idx % n\n", + " mi = thread_idx // n\n", + "\n", + " # Map logical index to physical address via tensor layout\n", + " a_val = gA[mi, ni]\n", + " b_val = gB[mi, ni]\n", + "\n", + " # Perform element-wise addition\n", + " gC[mi, ni] = a_val + b_val" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Structure of the Kernel\n", + "\n", + "The naive kernel simply maps each thread to one element with a 1-to-1 mapping.\n", + "In this kernel, we don't use CuTe layout algebra but only use basic\n", + "addressing to index the tensor.\n", + "\n", + "We can launch the kernel with the following JIT function:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + 
"@cute.jit\n", + "def naive_elementwise_add(\n", + " mA: cute.Tensor,\n", + " mB: cute.Tensor,\n", + " mC: cute.Tensor\n", + "):\n", + " num_threads_per_block = 256\n", + "\n", + " m, n = mA.shape\n", + " kernel = naive_elementwise_add_kernel(mA, mB, mC)\n", + " kernel.launch(grid=((m * n) // num_threads_per_block, 1, 1),\n", + " block=(num_threads_per_block, 1, 1))\n", + "\n", + "M, N = 2048, 2048\n", + "\n", + "a = torch.randn(M, N, device=\"cuda\", dtype=torch.float16)\n", + "b = torch.randn(M, N, device=\"cuda\", dtype=torch.float16)\n", + "c = torch.zeros(M, N, device=\"cuda\", dtype=torch.float16)\n", + "\n", + "a_ = from_dlpack(a, assumed_align=16)\n", + "b_ = from_dlpack(b, assumed_align=16)\n", + "c_ = from_dlpack(c, assumed_align=16)\n", + "\n", + "# Compile kernel\n", + "naive_elementwise_add_ = cute.compile(naive_elementwise_add, a_, b_, c_)\n", + "naive_elementwise_add_(a_, b_, c_)\n", + "\n", + "# verify correctness\n", + "torch.testing.assert_close(c, a + b)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Benchmark performance\n", + "\n", + "Here's a utility function to benchmark our kernel implementations:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def benchmark(callable, *, num_warmups, num_iterations):\n", + " start_event = torch.cuda.Event(enable_timing=True)\n", + " end_event = torch.cuda.Event(enable_timing=True)\n", + "\n", + " torch.cuda.synchronize()\n", + "\n", + " for _ in range(num_warmups):\n", + " callable()\n", + "\n", + " start_event.record(stream=torch.cuda.current_stream())\n", + " for _ in range(num_iterations):\n", + " callable()\n", + " end_event.record(stream=torch.cuda.current_stream())\n", + " torch.cuda.synchronize()\n", + "\n", + " elapsed_time = start_event.elapsed_time(end_event)\n", + " avg_time = elapsed_time / num_iterations\n", + "\n", + " print(f\"Average execution time: {avg_time:.4f} ms\")\n", + " 
print(f\"Throughput: {(3 * a.numel() * 2) / (avg_time / 1000) / 1e9:.2f} GB/s\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average execution time: 0.0385 ms\n", + "Throughput: 653.44 GB/s\n" + ] + } + ], + "source": [ + "benchmark(partial(naive_elementwise_add_, a_, b_, c_), num_warmups=5, num_iterations=100)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Performance Analysis\n", + "\n", + "While our naive implementation maps thread indices to contiguous tensor\n", + "dimensions for coalesced memory access, it doesn't have enough\n", + "in-flight load & store operations to hide memory latency.\n", + "\n", + "According to Little's Law:\n", + "\n", + "$ L = \\lambda \\times W $\n", + "\n", + "Where:\n", + "- $L$ is the average number of items in a system\n", + "- $\\lambda$ is the average arrival rate of items (bandwidth)\n", + "- $W$ is the average time an item spends in the system (latency)\n", + "\n", + "For our elementwise addition kernel:\n", + "\n", + "1. $L$: The number of load & store operations in-flight\n", + "2. $\\lambda$ (Bandwidth): Data transfer rate between memory and compute units\n", + "3. $W$ (Latency): Round-trip delay of memory requests\n", + "\n", + "For memory-bound operations like elementwise addition, performance is\n", + "limited by the number of in-flight load & store operations.\n", + "\n", + "## Vectorized Load and Store\n", + "\n", + "To improve performance according to Little's Law, we need to increase the number\n", + "of in-flight requests. 
We can do this by increasing the number of bytes handled\n", + "in each load & store operation per thread through vectorized memory access.\n", + "\n", + "Since Ampere GPUs support up to 128-bit per load/store and each element here is 16-bit (float16),\n", + "we can load 4 contiguous elements (64 bits) per thread in one vectorized operation on contiguous rows.\n", + "CuTe tiling operations make this vectorization straightforward.\n", + "\n", + "Using ``tiled_tensor = cute.zipped_divide(tensor, tiler)``, we can partition the input\n", + "``tensor`` into groups of ``tiler`` blocks. For vectorization, we specify ``tiler``\n", + "as the block of data each thread accesses (4 contiguous elements in the same row, or ``(1,4)``).\n", + "Different threads can then access different blocks by indexing into the 2nd mode of ``tiled_tensor``.\n", + "\n", + "```python\n", + "mA : cute.Tensor # (2048,2048):(2048,1)\n", + "gA = cute.zipped_divide(a, tiler=(1, 4)) # tiled/vectorized => ((1,4),(2048,512)):((0,1),(2048,4))\n", + "```\n", + "\n", + "$\n", + " \\begin{array}{ccccc}\n", + " & ((1,4) & , & (2048,512)) & : ((0,1),(2048,4)) \\\\\n", + " & \\underbrace{\\phantom{(1,4)}}_{tiler} & & \\underbrace{\\phantom{(2048,512)}}_{threads} & \\\\\n", + " & \\text{\\scriptsize per-thread} & & \\text{\\scriptsize num of tiles}\n", + " \\end{array}\n", + "$" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "@cute.kernel\n", + "def vectorized_elementwise_add_kernel(\n", + " gA: cute.Tensor,\n", + " gB: cute.Tensor,\n", + " gC: cute.Tensor,\n", + "):\n", + " tidx, _, _ = cute.arch.thread_idx()\n", + " bidx, _, _ = cute.arch.block_idx()\n", + " bdim, _, _ = cute.arch.block_dim()\n", + "\n", + " thread_idx = bidx * bdim + tidx\n", + "\n", + " # Map thread index to logical index of input tensor\n", + " m, n = gA.shape[1] # thread-domain\n", + " ni = thread_idx % n\n", + " mi = thread_idx // n\n", + "\n", + " # Map logical index to physical address via tensor layout\n", + " a_val = 
gA[(None, (mi, ni))].load()\n", + " b_val = gB[(None, (mi, ni))].load()\n", + " print(f\"[DSL INFO] sliced gA = {gA[(None, (mi, ni))]}\")\n", + " print(f\"[DSL INFO] sliced gB = {gB[(None, (mi, ni))]}\")\n", + "\n", + " # Perform element-wise addition\n", + " gC[(None, (mi, ni))] = a_val + b_val" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This vectorized kernel follows a similar structure to its naive non-vectorized counterpart,\n", + "with one key difference: the tensor slicing pattern. By using `(None, (mi, ni))` as the slice indices,\n", + "we can extract a `(1,4)` sub-tensor from `gA`, `gB` and `gC` like \n", + "\n", + "```python\n", + "gA[(None, (mi, ni))]\n", + "\n", + "```\n", + "\n", + "Then tensor data can be loaded into vector via the `.load()` method.\n", + "\n", + "\n", + "```\n", + " slice\n", + " ((1,4),(2048,512)):((0,1),(2048,4)) ==> ((1,4)):((0,1))\n", + " ^ ^ ^\n", + " | | |\n", + " (None, (mi, ni))\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[DSL INFO] Tiled Tensors:\n", + "[DSL INFO] gA = tensor> o ((1,4),(2048,512)):((0,1),(2048,4))>\n", + "[DSL INFO] gB = tensor> o ((1,4),(2048,512)):((0,1),(2048,4))>\n", + "[DSL INFO] gC = tensor> o ((1,4),(2048,512)):((0,1),(2048,4))>\n", + "[DSL INFO] sliced gA = tensor> o ((1,4)):((0,1))>\n", + "[DSL INFO] sliced gB = tensor> o ((1,4)):((0,1))>\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def vectorized_elementwise_add(\n", + " mA: cute.Tensor,\n", + " mB: cute.Tensor,\n", + " mC: cute.Tensor\n", + "):\n", + " threads_per_block = 256\n", + "\n", + " gA = cute.zipped_divide(mA, (1, 4))\n", + " gB = cute.zipped_divide(mB, (1, 4))\n", + " gC = cute.zipped_divide(mC, (1, 4))\n", + "\n", + " print(f\"[DSL INFO] Tiled Tensors:\")\n", + " print(f\"[DSL INFO] gA = {gA}\")\n", + " print(f\"[DSL INFO] gB = {gB}\")\n", + " print(f\"[DSL INFO] gC = 
{gC}\")\n", + "\n", + " vectorized_elementwise_add_kernel(gA, gB, gC).launch(\n", + " grid=(cute.size(gC, mode=[1]) // threads_per_block, 1, 1),\n", + " block=(threads_per_block, 1, 1),\n", + " )\n", + "\n", + "a = torch.randn(M, N, device=\"cuda\", dtype=torch.float16)\n", + "b = torch.randn(M, N, device=\"cuda\", dtype=torch.float16)\n", + "c = torch.zeros(M, N, device=\"cuda\", dtype=torch.float16)\n", + "\n", + "a_ = from_dlpack(a, assumed_align=16)\n", + "b_ = from_dlpack(b, assumed_align=16)\n", + "c_ = from_dlpack(c, assumed_align=16)\n", + "\n", + "compiled_func = cute.compile(vectorized_elementwise_add, a_, b_, c_)\n", + "compiled_func(a_, b_, c_)\n", + "\n", + "# verify correctness\n", + "torch.testing.assert_close(c, a + b)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average execution time: 0.0202 ms\n", + "Throughput: 1244.98 GB/s\n" + ] + } + ], + "source": [ + "benchmark(partial(compiled_func, a_, b_, c_), num_warmups=5, num_iterations=100)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TV Layout\n", + "\n", + "Both the naive and vectorized kernels follow a common pattern to map thread indices\n", + "to physical addresses:\n", + "\n", + "Step 1: Map thread index to logical M/N coordinates\n", + "\n", + "```python\n", + " mi = thread_idx // n\n", + " ni = thread_idx % n\n", + "```\n", + "\n", + "Step 2: Map logical M/N coordinates to physical addresses using the tensor layout\n", + "\n", + "```python\n", + " a[(None, (mi, ni))].load()\n", + "```\n", + "\n", + "CuTe uses TV layout to represent this mapping from thread index and value index\n", + "(i.e., the 4 elements loaded per thread) to the logical coordinate space of a tensor.\n", + "By configuring different TV layouts, we can experiment with different memory access\n", + "patterns with minimal code changes.\n", + "\n", + "The following example 
demonstrates two levels of tiling: at the thread-block level\n", + "and at the thread level.\n", + "\n", + "For thread-block level tiling, each input & output tensor is first divided\n", + "into a group of ``(TileM, TileN)`` sub-tensors at the host side.\n", + "\n", + "Inside the GPU kernel, we provide the thread-block index to the 2nd mode of the tiled tensor\n", + "(``gA[((None, None), bidx)]``), which returns a thread-block local view of\n", + "a single ``(TileM, TileN)`` sub-tensor.\n", + "\n", + "For thread level tiling, we compose the sub-tensor (which maps from logical coordinates\n", + "to physical addresses) with the TV layout (which maps from thread & value indices to\n", + "logical coordinates). This gives us a tiled sub-tensor that maps from thread & value\n", + "indices directly to physical addresses.\n", + "\n", + "We then provide the thread index to the tiled sub-tensor (``tidfrgA[(tidx, None)]``)\n", + "to get a thread-local view of the data each thread accesses. Note that the thread index\n", + "is now in the 1st mode, as the tiled sub-tensor puts the thread mode before the value mode." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "@cute.kernel\n", + "def elementwise_add_kernel(\n", + " gA: cute.Tensor,\n", + " gB: cute.Tensor,\n", + " gC: cute.Tensor,\n", + " tv_layout: cute.Layout\n", + "):\n", + " tidx, _, _ = cute.arch.thread_idx()\n", + " bidx, _, _ = cute.arch.block_idx()\n", + "\n", + " #--------------------------------\n", + " # slice for thread-block level view\n", + " #--------------------------------\n", + " blk_coord = ((None, None), bidx)\n", + "\n", + " # logical coord -> address\n", + " blkA = gA[blk_coord] # (TileM, TileN) -> physical address\n", + " blkB = gB[blk_coord] # (TileM, TileN) -> physical address\n", + " blkC = gC[blk_coord] # (TileM, TileN) -> physical address\n", + "\n", + " #--------------------------------\n", + " # compose for thread-index & value-index to physical mapping\n", + " #--------------------------------\n", + " # blockA: (TileM, TileN) -> physical address\n", + " # tv_layout: (tid, vid) -> (TileM, TileN)\n", + " # tidfrgA = blkA o tv_layout\n", + " # tidfrgA: (tid, vid) -> physical address\n", + " tidfrgA = cute.composition(blkA, tv_layout)\n", + " tidfrgB = cute.composition(blkB, tv_layout)\n", + " tidfrgC = cute.composition(blkC, tv_layout)\n", + "\n", + " print(f\"Composed with TV layout:\")\n", + " print(f\" tidfrgA: {tidfrgA.type}\")\n", + "\n", + " #--------------------------------\n", + " # slice for thread-level view\n", + " #--------------------------------\n", + " # `None` represent slice of the entire per-thread data\n", + " thr_coord = (tidx, None)\n", + "\n", + " # slice for threads: vid -> address\n", + " thrA = tidfrgA[thr_coord] # (V) -> physical address\n", + " thrB = tidfrgB[thr_coord] # (V) -> physical address\n", + " thrC = tidfrgC[thr_coord] # (V) -> physical address\n", + "\n", + " thrC[None] = thrA.load() + thrB.load()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we take a closer look 
at the layout of zipped divided input tensor `gA`:\n", + "\n", + "```\n", + "Tiled to Thread Block:\n", + "\n", + " ((16,256),(128,8)) : ((2048,1),(32768,256))\n", + " ~~~~~~~~ ~~~~~~ ~~~~~~~~\n", + " | | |\n", + " | | |\n", + " | `------------------------> Number of Thread Blocks\n", + " | |\n", + " | |\n", + " `--------------------'\n", + " |\n", + " V\n", + " Thread Block\n", + " Tile\n", + "\n", + "Sliced to Thread-Block local sub-tensor (a (16, 256) tile): gA[((None, None), bidx)]\n", + "\n", + " (16,256) : (2048,1)\n", + " ~~~~~~ ~~~~~~\n", + " | | Tiled/Composed with TV Layout\n", + " | | \n", + " | | o ((32,4),(8,4)):((128,4),(16,1))\n", + " V V \n", + "~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~ \n", + "((32,4), (8,4)) : ((8,8192),(1,2048))\n", + " | |\n", + " | `--------> per thread fragment\n", + " |\n", + "Thread Block\n", + " Shape\n", + "\n", + "Sliced to Thread local sub-tensor (a (4,8) tile): tidfrgA[(tidx, None)]\n", + "\n", + "```\n", + "\n", + "The host code below shows the construction of the TV layout. By composing\n", + "a thread layout of ``(4,32):(32,1)`` (32 threads read contiguous elements on the row dimension,\n", + "then 4 warps read different rows) with a value layout of ``(4,8):(8,1)`` (each thread reads\n", + "8 contiguous elements on the row dimension across 4 contiguous rows),\n", + "we obtain the TV layout shown in the figure above."
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tiler: (16, 256)\n", + "TV Layout: ((32,4),(8,4)):((128,4),(16,1))\n", + "Tiled Input Tensors:\n", + " gA: !cute.memref, \"((16,256),(128,8)):((2048,1),(32768,256))\">\n", + " gB: !cute.memref, \"((16,256),(128,8)):((2048,1),(32768,256))\">\n", + " gC: !cute.memref, \"((16,256),(128,8)):((2048,1),(32768,256))\">\n", + "Composed with TV layout:\n", + " tidfrgA: !cute.memref, \"((32,4),(8,4)):((8,8192),(1,2048))\">\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def elementwise_add(\n", + " mA: cute.Tensor,\n", + " mB: cute.Tensor,\n", + " mC: cute.Tensor,\n", + "):\n", + " # mA layout: (M, N):(N, 1)\n", + " # TV layout map thread & value index to (16, 256) logical tile\n", + " # - contiguous thread index maps to mode-1 because input layout is contiguous on\n", + " # mode-1 for coalesced load-store\n", + " # - each thread load 8 contiguous element each row and load 4 rows\n", + " thr_layout = cute.make_layout((4, 32), stride=(32, 1))\n", + " val_layout = cute.make_layout((4, 8), stride=(8, 1))\n", + " tiler_mn, tv_layout = cute.make_layout_tv(thr_layout, val_layout)\n", + " print(f\"Tiler: {tiler_mn}\")\n", + " print(f\"TV Layout: {tv_layout}\")\n", + "\n", + " gA = cute.zipped_divide(mA, tiler_mn) # ((TileM, TileN), (RestM, RestN))\n", + " gB = cute.zipped_divide(mB, tiler_mn) # ((TileM, TileN), (RestM, RestN))\n", + " gC = cute.zipped_divide(mC, tiler_mn) # ((TileM, TileN), (RestM, RestN))\n", + "\n", + " print(f\"Tiled Input Tensors:\")\n", + " print(f\" gA: {gA.type}\")\n", + " print(f\" gB: {gB.type}\")\n", + " print(f\" gC: {gC.type}\")\n", + "\n", + " # Launch the kernel asynchronously\n", + " # Async token(s) can also be specified as dependencies\n", + " elementwise_add_kernel(\n", + " gA, gB, gC, tv_layout\n", + " ).launch(\n", + " grid=[cute.size(gC, mode=[1]), 1, 1],\n", + " 
block=[cute.size(tv_layout, mode=[0]), 1, 1],\n", + " )\n", + "\n", + "a = torch.randn(M, N, device=\"cuda\", dtype=torch.float16)\n", + "b = torch.randn(M, N, device=\"cuda\", dtype=torch.float16)\n", + "c = torch.zeros(M, N, device=\"cuda\", dtype=torch.float16)\n", + "\n", + "a_ = from_dlpack(a, assumed_align=16)\n", + "b_ = from_dlpack(b, assumed_align=16)\n", + "c_ = from_dlpack(c, assumed_align=16)\n", + "\n", + "elementwise_add_ = cute.compile(elementwise_add, a_, b_, c_)\n", + "elementwise_add_(a_, b_, c_)\n", + "\n", + "# verify correctness\n", + "torch.testing.assert_close(c, a + b)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average execution time: 0.0222 ms\n", + "Throughput: 1133.58 GB/s\n" + ] + } + ], + "source": [ + "benchmark(partial(elementwise_add_, a_, b_, c_), num_warmups=5, num_iterations=200)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using Lambda Function\n", + "\n", + "CuTe DSL is built on top of Python. It can leverage Python to implement meta-programming to generate flexible kernels.\n", + "E.g. 
we can write kernel template that take custom binary operations to generate kernels for arbitrary binary operations.\n", + "\n", + "\n", + "```python\n", + "@cute.jit\n", + "def elementwise_apply(\n", + " op: cutlass.Constexpr,\n", + " mA: cute.Tensor,\n", + " mB: cute.Tensor,\n", + " mC: cute.Tensor\n", + "):\n", + " ...\n", + "\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tiler: (16, 256)\n", + "TV Layout: ((32,4),(8,4)):((128,4),(16,1))\n", + "Tiled Input Tensors:\n", + " gA: !cute.memref, \"((16,256),(128,8)):((2048,1),(32768,256))\">\n", + " gB: !cute.memref, \"((16,256),(128,8)):((2048,1),(32768,256))\">\n", + " gC: !cute.memref, \"((16,256),(128,8)):((2048,1),(32768,256))\">\n", + "Composed with TV layout:\n", + " tidfrgA: !cute.memref, \"((32,4),(8,4)):((8,8192),(1,2048))\">\n" + ] + } + ], + "source": [ + "@cute.kernel\n", + "def elementwise_apply_kernel(\n", + " op: cutlass.Constexpr, # lambda function must be const expr to generate code at compile time\n", + " gA: cute.Tensor,\n", + " gB: cute.Tensor,\n", + " gC: cute.Tensor,\n", + " tv_layout: cute.Layout\n", + "):\n", + " tidx, _, _ = cute.arch.thread_idx()\n", + " bidx, _, _ = cute.arch.block_idx()\n", + "\n", + " blk_coord = ((None, None), bidx)\n", + "\n", + " # logical coord -> address\n", + " blkA = gA[blk_coord] # (TileM, TileN) -> physical address\n", + " blkB = gB[blk_coord] # (TileM, TileN) -> physical address\n", + " blkC = gC[blk_coord] # (TileM, TileN) -> physical address\n", + "\n", + " tidfrgA = cute.composition(blkA, tv_layout)\n", + " tidfrgB = cute.composition(blkB, tv_layout)\n", + " tidfrgC = cute.composition(blkC, tv_layout)\n", + "\n", + " print(f\"Composed with TV layout:\")\n", + " print(f\" tidfrgA: {tidfrgA.type}\")\n", + "\n", + " thr_coord = (tidx, None)\n", + "\n", + " # slice for threads: vid -> address\n", + " thrA = tidfrgA[thr_coord] # (V) -> 
physical address\n", + " thrB = tidfrgB[thr_coord] # (V) -> physical address\n", + " thrC = tidfrgC[thr_coord] # (V) -> physical address\n", + "\n", + " #--------------------------------\n", + " # apply custom operation\n", + " #--------------------------------\n", + " thrC[None] = op(thrA.load(), thrB.load())\n", + "\n", + "\n", + "@cute.jit\n", + "def elementwise_op(\n", + " op: cutlass.Constexpr,\n", + " mA: cute.Tensor,\n", + " mB: cute.Tensor,\n", + " mC: cute.Tensor,\n", + "):\n", + " # mA layout: (M, N):(N, 1)\n", + " # TV layout map thread & value index to (16, 256) logical tile\n", + " # - contiguous thread index maps to mode-1 because input layout is contiguous on\n", + " # mode-1 for coalesced load-store\n", + " # - each thread load 8 contiguous element each row and load 4 rows\n", + " thr_layout = cute.make_layout((4, 32), stride=(32, 1))\n", + " val_layout = cute.make_layout((4, 8), stride=(8, 1))\n", + " tiler_mn, tv_layout = cute.make_layout_tv(thr_layout, val_layout)\n", + " print(f\"Tiler: {tiler_mn}\")\n", + " print(f\"TV Layout: {tv_layout}\")\n", + "\n", + " gA = cute.zipped_divide(mA, tiler_mn) # ((TileM, TileN), (RestM, RestN))\n", + " gB = cute.zipped_divide(mB, tiler_mn) # ((TileM, TileN), (RestM, RestN))\n", + " gC = cute.zipped_divide(mC, tiler_mn) # ((TileM, TileN), (RestM, RestN))\n", + "\n", + " print(f\"Tiled Input Tensors:\")\n", + " print(f\" gA: {gA.type}\")\n", + " print(f\" gB: {gB.type}\")\n", + " print(f\" gC: {gC.type}\")\n", + "\n", + " # Launch the kernel asynchronously\n", + " # Async token(s) can also be specified as dependencies\n", + " elementwise_apply_kernel(\n", + " op, gA, gB, gC, tv_layout\n", + " ).launch(\n", + " grid=[cute.size(gC, mode=[1]), 1, 1],\n", + " block=[cute.size(tv_layout, mode=[0]), 1, 1],\n", + " )\n", + "\n", + "a = torch.randn(M, N, device=\"cuda\", dtype=torch.float16)\n", + "b = torch.randn(M, N, device=\"cuda\", dtype=torch.float16)\n", + "c = torch.zeros(M, N, device=\"cuda\", 
dtype=torch.float16)\n", + "\n", + "a_ = from_dlpack(a, assumed_align=16)\n", + "b_ = from_dlpack(b, assumed_align=16)\n", + "c_ = from_dlpack(c, assumed_align=16)\n", + "\n", + "from operator import mul\n", + "\n", + "elementwise_op(mul, a_, b_, c_)\n", + "\n", + "# verify correctness\n", + "torch.testing.assert_close(c, mul(a, b))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Custom operators can be more complex. For example, here's a function that performs\n", + "multiplication followed by ReLU:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tiler: (16, 256)\n", + "TV Layout: ((32,4),(8,4)):((128,4),(16,1))\n", + "Tiled Input Tensors:\n", + " gA: !cute.memref, \"((16,256),(128,8)):((2048,1),(32768,256))\">\n", + " gB: !cute.memref, \"((16,256),(128,8)):((2048,1),(32768,256))\">\n", + " gC: !cute.memref, \"((16,256),(128,8)):((2048,1),(32768,256))\">\n", + "Composed with TV layout:\n", + " tidfrgA: !cute.memref, \"((32,4),(8,4)):((8,8192),(1,2048))\">\n" + ] + } + ], + "source": [ + "def mul_relu(a, b):\n", + " tmp = a * b\n", + " return cute.where(tmp > 0, tmp, cute.full_like(tmp, 0))\n", + "\n", + "\n", + "# As we uses cute.where in customized operation, we need to create another relu function\n", + "def mul_relu_ref(a, b):\n", + " tmp = a * b\n", + " return torch.relu(tmp)\n", + "\n", + "\n", + "elementwise_op(mul_relu, a_, b_, c_)\n", + "\n", + "# verify correctness\n", + "torch.testing.assert_close(c, mul_relu_ref(a, b))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + }, + 
"widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/python/CuTeDSL/notebooks/hello_world.ipynb b/examples/python/CuTeDSL/notebooks/hello_world.ipynb new file mode 100644 index 00000000..47719ae6 --- /dev/null +++ b/examples/python/CuTeDSL/notebooks/hello_world.ipynb @@ -0,0 +1,173 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Your First Program with CuTe DSL\n", + "\n", + "## Introduction\n", + "\n", + "Welcome! In this tutorial, we'll write a simple \"Hello World\" program that runs on your GPU using CuTe DSL. This will help you understand the basics of GPU programming with our framework.\n", + "\n", + "### What You'll Learn\n", + "\n", + "- How to write code that runs on both CPU (host) and GPU (device),\n", + "- How to launch a GPU kernel (a function that runs on the GPU),\n", + "- Basic CUDA concepts like threads and thread blocks,\n", + "\n", + "### Step 1: Import Required Libraries\n", + "\n", + "First, let's import the libraries we need:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import cutlass \n", + "import cutlass.cute as cute " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### Step 2: Write Our GPU Kernel\n", + "A GPU kernel is a function that runs on the GPU. 
Here's a simple kernel that prints \"Hello World\".\n", + "Key concepts:\n", + "- `@cute.kernel`: This decorator tells CUTLASS that this function should run on the GPU\n", + "- `cute.arch.thread_idx()`: Gets the ID of the current GPU thread (like a worker's ID number)\n", + "- We only want one thread to print the message (thread 0) to avoid multiple prints" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "@cute.kernel\n", + "def kernel():\n", + " # Get the x component of the thread index (y and z components are unused)\n", + " tidx, _, _ = cute.arch.thread_idx()\n", + " # Only the first thread (thread 0) prints the message\n", + " if tidx == 0:\n", + " cute.printf(\"Hello world\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3: Write Our Host Function\n", + "\n", + "Now we need a function that sets up the GPU and launches our kernel.\n", + "Key concepts:\n", + "- `@cute.jit`: This decorator is for functions that run on the CPU but can launch GPU code\n", + "- We need to initialize CUDA before using the GPU\n", + "- `.launch()` tells CUDA how many blocks, threads, shared memory, etc. 
to use" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "@cute.jit\n", + "def hello_world():\n", + "\n", + " # Print hello world from host code\n", + " cute.printf(\"hello world\")\n", + " \n", + " # Initialize CUDA context for launching a kernel with error checking\n", + " # We make context initialization explicit to allow users to control the context creation \n", + " # and avoid potential issues with multiple contexts\n", + " cutlass.cuda.initialize_cuda_context()\n", + "\n", + " # Launch kernel\n", + " kernel().launch(\n", + " grid=(1, 1, 1), # Single thread block\n", + " block=(32, 1, 1) # One warp (32 threads) per thread block\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4: Run Our Program\n", + "\n", + "There are 2 ways we can run our program:\n", + "\n", + "1. compile and run immediately\n", + "2. separate compilation which allows us to compile the code once and run multiple times\n", + " \n", + "Please note the `Compiling...` for Method 2 prints before the \"Hello world\" of the first kernel. This shows the asynchronous behavior between CPU and GPU prints. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running hello_world()...\n", + "hello world\n", + "Compiling...\n", + "Hello world\n", + "Running compiled version...\n", + "hello world\n" + ] + } + ], + "source": [ + "# Method 1: Just-In-Time (JIT) compilation - compiles and runs the code immediately\n", + "print(\"Running hello_world()...\")\n", + "hello_world()\n", + "\n", + "# Method 2: Compile first (useful if you want to run the same code multiple times)\n", + "print(\"Compiling...\")\n", + "hello_world_compiled = cute.compile(hello_world)\n", + "# Run the pre-compiled version\n", + "print(\"Running compiled version...\")\n", + "hello_world_compiled()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/python/CuTeDSL/notebooks/images/cuda_graphs_image.png b/examples/python/CuTeDSL/notebooks/images/cuda_graphs_image.png new file mode 100644 index 0000000000000000000000000000000000000000..44fc138d319d5820377c987e3ac386913f1c0777 GIT binary patch literal 8586 zcmeHNXH=70myRIl1r!yjBB-Er>Cy#6K#<-$(wh{ecdrGcMj(LFktQwl4vC0#CA0(x zRmu${gb)awgbDi9HQ&svS?inmb${eN=j>DV-p@WOd%sT-qo<>OfrgC+003Ojc=XT! 
z0HA=7`@QF>$J!5in#@9go0RTM2zD6FLby3z<1^{Y+w8wT-zo17XCm5qXIu(OeVMk^gSJZ!GO^(N(6ibq$jP+Mx;`t)P3%IMnD+Z_tg z0OYF>>yGlQ!vbqfU0khwNSjU@lFFjO#m*zHG0nGu%ZT$6X zZcUc+j(Nr#ntzaLcvAXx?p(Uz=#1O*sC4NJ+pEh(O~KZAjs}NBru&A5E}lB(Bh4i& z4E)MW?k2iuk@j*^_9*~>&v?4~XtD?(C4xOeZa&8}FU2*Bx(?sa7>(zII?bP5pqV3qB|w6R6;){^@aH6N-gw&bRnBIa*P z054$GZT~n1fRz^GnmeoV$Qp}o;K>la5O_znCnRD1GvKYdT0lU+FvnT)DOpi+zs_{n zf}Ujk>dXJ#kzSXu%Tjb*dI-?kK!%t#SF+&>K4{$-KuAI$Os;!ZOhh}$`TM}`YS=4$ z5nV?Giac^(HsXf7rH!&%9Soc;z&B|%dQCz}C%3J1d5b|!YJ-W|(cJ!!e{ z?54+n-`!fJc$UaPM+6tU09R{mU^p8t3cvY)rOnye)E&!i8VSVP9|a;?MjtcqWUUffk3ajMzH8VvMb?-pc1#RDQ_Go$ z^6uM|c=}M^CkNTV#(!r;m;*@H#nr`ut6NN(Dk1LpALSbm<89^)5Rx1X4-+vo38L;X zb0;H0aSa`<9y9Iy_h6}MwJroiS$9Y<+`a=|zE`lhI^as~800_)l+ zP+WM`8GH~Ja@;L(du&W{&a|6gwM$AQt?8s5D-c@gk+{b9#f~;`NsdagCd~v;80m>^e%B^Qs(>hZ?nfiw0AYUKCAQawksG zWRdUnvtH6GbnVqKQ2sD|Z&wwsz~u{R`p>V!Gh5w(0Nzp~ z{OdUdN4S*ExX0v|N{iY}6OR*q!zCt*ou2GTFfs)-H80}i$EyA*%c>-BUPN+@lt`ST z$MSU1O^DN}M9Ju+Y|NLKEaI5*TAk;}Sz- zIX@&}m5`4WQ%l6=SAucojwiUR^H|@La5fi~k@-wBIn#C2>#4Cx;f_gjJWD16f*w;c zEh88C`*zYk1@>6jZOOxbONjTKFnhUi-N|iFtE3WU6ZuacHP=8;1Tqa&P@{4A$Yr@N zom*p#EyJkX`6PH37Xvlh^U{jIi~3ioEnF{X9fo!}Go)d|YoeD;vxF-o)IZIO%}u{vT%+{H)uitP7^AO(7!o2vZUF=7eH8xv)?IqZ=9HANVl&*rNe2+EUqrujb-k%zzW<=6Bh5a5#qL1NkP{)NozysFdMMpSKYFypyIDKO->qGnugn* z$(#2}LRiYLPNN2rUNx{~jMFn8v*a)!bh#10H*K9>s^l-hF$4m6u1K4vk#7=FMmPxc zc+~wI>*;PhUESU48Btc2n_I%11~b&}zI=hz8BQOCR^OPPvk_sbZ?xVdHHtm0@yeVp z?tsC@Xa_lm=N80f?+F-tb~&TdeoWUtJCN_F#(TSa;YuovRia8totNZ2CrxHsh}W0| ztm6J$tJ9EVSh5BH7Sj-M7a zHI!8fsj9qEmvB55^aveqT=$)8IT3E)+}?(o`hRT?5E2anf7_5Y4T*r>{dvK&$j(X; z&AS^gC2M=g>f@3!uX4m3ic`-{wOq$5^adfv65DwY^nz;f!7`;FKRs}*UI^kjF>gZB zQkU&^#vs(8RW;f6#7f1UDb8>;G>(L|caUebl}4`lpMZNO-=}P2fX{Z-bCV9+PU0AX zBenswF1N_Z>#bIpVnsW6>>j{$D1kNCFXVg3>y(E%^W2c3hh)@;Q9wazALZ6?N702# z^0ku(e@$ImK@~W&uoKEE?k*cDrCeHcS|(-otqOH%_1=S7B(ltwxTzmXB@KuXUi@NB zF9i2@s5mLLF%(zg$<)5v_?eM$y!z|CgD{k=syWanY6!K*AoDf$ecd7Vazq|Dg%7;7gDuyu)Q)~jhVN<9jg?gAVt!;eDA3A=)vuul_wD9gnk 
ztO_9n;JWe+COEr%-sd$#Hfdhv+{c?1u&3vIRO=fX%Ss_SMQ^;z300QH)}0}*eaYUo z(af8O^a*!m5A+QVF3p}6B1?Ey+pib8L_m^Gt+mBk6K|KtH=E=EtD%*Zm9leYRrG@a zQIQwEg}gSLgn4i5%QBNaeQC#XxMjWPdMuQJ%>-E48RKwC+~k6x^!dCO(Vr;54ZYYM zU3-a{d>-l+?bo;Dy2DxFOO04<39s*#1ljn`>$YF1Aid11-eExDtau#WpY{q_k~2g7 zm4f`>S?c%n`m-YVvm*GjBKWf+__HGTvm*GjBKUu<2!1je&`dIA@{6eed}e$UMrKSF zX#Pvm3rXsPmMmCJTomye+{!HdMj~V}0k|7UY zhKmF8I3*7$Wk*@`evp{LL%<#!_Kr?YR2@yWohLPat+Z-tTF#;XO6)bCt&Vu97C1NB^f94?)2Ooi4*m_-Y*KdK z8(TJSdP?i{xlNu_Clb8^$Z7t{^W*A>=EHYv)3!jC%4jC-QPAb=^6Uvba0dhD^gz?| zxW$G~zmbkGgUo3Mf9z8Esp!2K20KLoBd#cvtq>xRoC9WHE z)}2{7392pDaNqX<(vWz=c?gFC`(`i_>Sj?G%47uy8Yk4`gx_1-DLV84LhyFAB;16s z?;>qhFDL1FHR|hLkM}ODYJ=kT3V-P0dy3?C-#hN;TK%mw5GElxDFgZx#9fMyftJ|~ zVKZda-NWpo2Q-H?-3pfBqQSTpYeQ6)@e;#PWivNUK|D1ScB2$*p{o1HFj7+IM`Rn~ zx?#Z;P!Uuu3Jq0leCnaMN-z+Xk_)kd( zW6F#P*t&tBv7CK1R+suEW6*ABR?`N?){T?2Ard3ZzLvOtmI!PAZC8Z7jHL7-q6VzL zlWf4F1@DSWNe#)<+T(aMg6u}145Sg~+u&1JL7dQ_S6qbm zoQ&nI@Pv>?p<$W}02K%}$?g{;4j z=qS}M(l$W9rc-DS3Twg!elwQZ$t)vGwlWoc(l6pFamOw?!kHP@YHm%lt4=H0&+uJ7 zGEd&=(=FjInqGQi*!0uby4l6V3nQXZ-;fV;)wE_N#TcT z0&r9V$%{=aF$ezcWtD3uy_&jrFNkhi)=9x7LA(KC6^iU_vDOrF1)4_pc=w5L7YyQe z$0MYAr;CVp!mT26f-zq5yU(nCr;z^8>!3HIcWivP!bvxKFdJ5BDQX&+;S|#GWES zfNawQlDNq{?jj=4)Q>q?8uM@G>L0FNEdp{>e2@xmDE~NuCbL@aGYvpwckfF`Y;%-ne;`p~clU<9 z(+^Ygn;qHdd9gWRm2Fb0IkB@ACYLkX7=0z{KK8NMrV~|&s_edp=_vQvSh8bDRk*oj z`1@&bNdYw8V8-C%+B{v+Mm&Yg1mA7z)pI_Tbtm zLt!$J{}`RSb7-olr{Y;ME7-vKrt!qQs!AGhS(lR$eIk!7y#4w}`$I}zTI(?M)~y~+ z4qXX_22SmgT^O%kAQ|rOK>)yrB$bh(sJXbBMzX2+MozgZ12$E5T&jWB|3R$5Z4H~E z`beWEAXC}u$3Uiba2xGFz4+pltHy>(JH7lWM_1jATa4zf>Y-m>T~m66X4G3di?!S)VRn$-Onq^86@RiLcy;fVixyD)M2cvqLZW2 zj~{iU2AkRNki9 z^5hQ0x}GP!!|3?;0koRihNZ1O$+8YKlG9gwyU#LE?Tm&dbPh=ha!vK;v5-rDoZAqY zv`r0IrSj$##T0FcRk~}A46Gc4I2~BcPIWnbg7D>yC)vQI8Xhr}pi@^4e$+k(o3f>T zWOdhb3w{-l_-*(j2eQ`8ArU!O^CEzQ)~p{rLnmaNQB}kx(2#GHY#`Vyx)hq0t|a8k)t|wJt@tZguv9NL8Wd(N)YNT5+TzrG{r7~6TLpPLB3jbd2c!{@3 z*}AFv&eZFejPSi1R34c|=5l&VF7yT>EJk_F7D5hTnjA9~8H(_7XRLbJQ{e_mNz#V? 
zY?I=h0X%gP!H(h4*{stp)>+YfCY7rP@r??Q;k&1o?rm@Mq< z%#ibJZddM?+hsF;dZ;x<5;E^s=3}9Ab!XAe8(f_ z9bz2@Nl>Y^0^L>;&rvf;woc$q6y%Pe&8EMz4?kpx&TDk@2ZzFuU2WR7zvEkR6j8P= zDC|WyBZCTk)LUlx&lEb{l!6OPZBoY?(EIl-=oR`@Ze5BTIs(vgn8y#ss55ny204lX zzCt)S&E@?@loGOG%yLUxGFxf|I-dK{3UpsP+X{(>32AG>|o_pS-O{Jn1`K%`F7+PBc$B*K>w@bxFWp))sb zTVayiHn*<;?3FEt3qC7aHdGxxZPKW^#c=xL|L6Yr_ICNdy!QqEdQUD^AWs0KFaP>m bz>gEkEBfuj)~)@(Q?Z7s&O`8nColg4Ecelt literal 0 HcmV?d00001 diff --git a/examples/python/CuTeDSL/notebooks/print.ipynb b/examples/python/CuTeDSL/notebooks/print.ipynb new file mode 100644 index 00000000..64787bb4 --- /dev/null +++ b/examples/python/CuTeDSL/notebooks/print.ipynb @@ -0,0 +1,425 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Printing with CuTe DSL\n", + "\n", + "This notebook demonstrates the different ways to print values in CuTe and explains the important distinction between static (compile-time) and dynamic (runtime) values.\n", + "\n", + "## Key Concepts\n", + "- Static values: Known at compile time\n", + "- Dynamic values: Only known at runtime\n", + "- Different printing methods for different scenarios\n", + "- Layout representation in CuTe\n", + "- Tensor visualization and formatting" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import cutlass\n", + "import cutlass.cute as cute\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Print Example Function\n", + "\n", + "The `print_example` function demonstrates several important concepts:\n", + "\n", + "### 1. Python's `print` vs CuTe's `cute.printf`\n", + "- `print`: Can only show static values at compile time\n", + "- `cute.printf`: Can display both static and dynamic values at runtime\n", + "\n", + "### 2. Value Types\n", + "- `a`: Dynamic `Int32` value (runtime)\n", + "- `b`: Static `Constexpr[int]` value (compile-time)\n", + "\n", + "### 3. 
Layout Printing\n", + "Shows how layouts are represented differently in static vs dynamic contexts:\n", + "- Static context: Unknown values shown as `?`\n", + "- Dynamic context: Actual values displayed" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "@cute.jit\n", + "def print_example(a: cutlass.Int32, b: cutlass.Constexpr[int]):\n", + " \"\"\"\n", + " Demonstrates different printing methods in CuTe and how they handle static vs dynamic values.\n", + "\n", + " This example shows:\n", + " 1. How Python's `print` function works with static values at compile time but can't show dynamic values\n", + " 2. How `cute.printf` can display both static and dynamic values at runtime\n", + " 3. The difference between types in static vs dynamic contexts\n", + " 4. How layouts are represented in both printing methods\n", + "\n", + " Args:\n", + " a: A dynamic Int32 value that will be determined at runtime\n", + " b: A static (compile-time constant) integer value\n", + " \"\"\"\n", + " # Use Python `print` to print static information\n", + " print(\">>>\", b) # => 2\n", + " # `a` is dynamic value\n", + " print(\">>>\", a) # => ?\n", + "\n", + " # Use `cute.printf` to print dynamic information\n", + " cute.printf(\">?? {}\", a) # => 8\n", + " cute.printf(\">?? {}\", b) # => 2\n", + "\n", + " print(\">>>\", type(a)) # => \n", + " print(\">>>\", type(b)) # => \n", + "\n", + " layout = cute.make_layout((a, b))\n", + " print(\">>>\", layout) # => (?,2):(1,?)\n", + " cute.printf(\">?? 
{}\", layout) # => (8,2):(1,8)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compile and Run\n", + "\n", + "**Direct Compilation and Run**\n", + " - `print_example(cutlass.Int32(8), 2)`\n", + " - Compiles and runs in one step will execute both static and dynamic print\n", + " * `>>>` stands for static print\n", + " * `>??` stands for dynamic print" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> 2\n", + ">>> ?\n", + ">>> Int32\n", + ">>> \n", + ">>> (?,2):(1,?)\n", + ">?? 8\n", + ">?? 2\n", + ">?? (8,2):(1,8)\n" + ] + } + ], + "source": [ + "print_example(cutlass.Int32(8), 2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compile Function\n", + "\n", + "When compiles the function with `cute.compile(print_example, cutlass.Int32(8), 2)`, Python interpreter \n", + "traces code and only evaluate static expression and print static information." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> 2\n", + ">>> ?\n", + ">>> Int32\n", + ">>> \n", + ">>> (?,2):(1,?)\n" + ] + } + ], + "source": [ + "print_example_compiled = cute.compile(print_example, cutlass.Int32(8), 2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Call compiled function\n", + "\n", + "Only print out runtime information" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">?? 8\n", + ">?? 2\n", + ">?? 
(8,2):(1,8)\n" + ] + } + ], + "source": [ + "print_example_compiled(cutlass.Int32(8))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Format String Example\n", + "\n", + "The `format_string_example` function shows an important limitation:\n", + "- F-strings in CuTe are evaluated at compile time\n", + "- This means dynamic values won't show their runtime values in f-strings\n", + "- Use `cute.printf` when you need to see runtime values" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Direct run output:\n", + "a: ?, b: 2\n", + "layout: (?,2):(1,?)\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def format_string_example(a: cutlass.Int32, b: cutlass.Constexpr[int]):\n", + " \"\"\"\n", + " Format string is evaluated at compile time.\n", + " \"\"\"\n", + " print(f\"a: {a}, b: {b}\")\n", + "\n", + " layout = cute.make_layout((a, b))\n", + " print(f\"layout: {layout}\")\n", + "\n", + "print(\"Direct run output:\")\n", + "format_string_example(cutlass.Int32(8), 2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Printing Tensor Examples\n", + "\n", + "CuTe provides specialized functionality for printing tensors through the `print_tensor` operation. The `cute.print_tensor` takes the following parameter:\n", + "- `Tensor` (required): A CuTe tensor object that you want to print. The tensor must support load and store operations\n", + "- `verbose` (optional, default=False): A boolean flag that controls the level of detail in the output. When set to True, it will print indices details for each element in the tensor.\n", + "\n", + "Below example code shows the difference between verbose ON and OFF, and how to print a sub range of the given tensor." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from cutlass.cute.runtime import from_dlpack\n", + "\n", + "@cute.jit\n", + "def print_tensor_basic(x : cute.Tensor):\n", + " # Print the tensor\n", + " print(\"Basic output:\")\n", + " cute.print_tensor(x)\n", + " \n", + "@cute.jit\n", + "def print_tensor_verbose(x : cute.Tensor):\n", + " # Print the tensor with verbose mode\n", + " print(\"Verbose output:\")\n", + " cute.print_tensor(x, verbose=True)\n", + "\n", + "@cute.jit\n", + "def print_tensor_slice(x : cute.Tensor, coord : tuple):\n", + " # slice a 2D tensor from the 3D tensor\n", + " sliced_data = cute.slice_(x, coord)\n", + " y = cute.make_fragment(sliced_data.layout, sliced_data.element_type)\n", + " # Convert to TensorSSA format by loading the sliced data into the fragment\n", + " y.store(sliced_data.load())\n", + " print(\"Slice output:\")\n", + " cute.print_tensor(y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The default `cute.print_tensor` will output CuTe tensor with datatype, storage space, CuTe layout information, and print data in torch-style format." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Basic output:\n", + "tensor(raw_ptr(0x000000000a5f1d50: f32, generic, align<4>) o (4,3,2):(6,2,1), data=\n", + " [[[ 0.000000, 2.000000, 4.000000, ],\n", + " [ 6.000000, 8.000000, 10.000000, ],\n", + " [ 12.000000, 14.000000, 16.000000, ],\n", + " [ 18.000000, 20.000000, 22.000000, ]],\n", + "\n", + " [[ 1.000000, 3.000000, 5.000000, ],\n", + " [ 7.000000, 9.000000, 11.000000, ],\n", + " [ 13.000000, 15.000000, 17.000000, ],\n", + " [ 19.000000, 21.000000, 23.000000, ]]])\n" + ] + } + ], + "source": [ + "def tensor_print_example1():\n", + " shape = (4, 3, 2)\n", + " \n", + " # Creates [0,...,23] and reshape to (4, 3, 2)\n", + " data = np.arange(24, dtype=np.float32).reshape(*shape) \n", + " \n", + " print_tensor_basic(from_dlpack(data))\n", + "\n", + "tensor_print_example1()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The verbosed print will show coodination details of each element in the tensor. The below example shows how we index element in a 2D 4x3 tensor space." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Verbose output:\n", + "tensor(raw_ptr(0x000000000a814cc0: f32, generic, align<4>) o (4,3):(3,1), data= (\n", + "\t(0,0)= 0.000000\n", + "\t(0,1)= 1.000000\n", + "\t(0,2)= 2.000000\n", + "\t(1,0)= 3.000000\n", + "\t(1,1)= 4.000000\n", + "\t(1,2)= 5.000000\n", + "\t(2,0)= 6.000000\n", + "\t(2,1)= 7.000000\n", + "\t(2,2)= 8.000000\n", + "\t(3,0)= 9.000000\n", + "\t(3,1)= 10.000000\n", + "\t(3,2)= 11.000000\n", + ")\n" + ] + } + ], + "source": [ + "def tensor_print_example2():\n", + " shape = (4, 3)\n", + " \n", + " # Creates [0,...,11] and reshape to (4, 3)\n", + " data = np.arange(12, dtype=np.float32).reshape(*shape) \n", + " \n", + " print_tensor_verbose(from_dlpack(data))\n", + "\n", + "tensor_print_example2()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To print a subset elements in the given Tensor, we can use cute.slice_ to select a range of the given tensor, load them into register and then print the values with `cute.print_tensor`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Slice output:\n", + "tensor(raw_ptr(0x00007ffeeae1fc60: f32, rmem, align<32>) o (4):(3), data=\n", + " [ 0.000000, ],\n", + " [ 3.000000, ],\n", + " [Slice output:\n", + " 6.000000, ],\n", + " [ 9.000000, ])\n", + "tensor(raw_ptr(0x00007ffeeae1fc60: f32, rmem, align<32>) o (3):(1), data=\n", + " [ 3.000000, ],\n", + " [ 4.000000, ],\n", + " [ 5.000000, ])\n" + ] + } + ], + "source": [ + "def tensor_print_example3():\n", + " shape = (4, 3)\n", + " \n", + " # Creates [0,...,11] and reshape to (4, 3)\n", + " data = np.arange(12, dtype=np.float32).reshape(*shape) \n", + " \n", + " print_tensor_slice(from_dlpack(data), (None, 0))\n", + " print_tensor_slice(from_dlpack(data), (1, None))\n", + "\n", + "tensor_print_example3()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/python/CuTeDSL/notebooks/tensor.ipynb b/examples/python/CuTeDSL/notebooks/tensor.ipynb new file mode 100644 index 00000000..80b9cff1 --- /dev/null +++ b/examples/python/CuTeDSL/notebooks/tensor.ipynb @@ -0,0 +1,390 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import cutlass\n", + "import cutlass.cute as cute" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tensor\n", + "\n", + "A tensor in CuTe is created through the composition of two key components:\n", + "\n", + "1. 
An **Engine** (E) - A random-access, pointer-like object that supports:\n", + " - Offset operation: `e + d → e` (offset engine by elements of a layout's codomain)\n", + " - Dereference operation: `*e → v` (dereference engine to produce value)\n", + "\n", + "2. A **Layout** (L) - Defines the mapping from coordinates to offsets\n", + "\n", + "A tensor is formally defined as the composition of an engine E with a layout L, expressed as `T = E ∘ L`. When evaluating a tensor at coordinate c, it:\n", + "\n", + "1. Maps the coordinate c to the codomain using the layout\n", + "2. Offsets the engine accordingly\n", + "3. Dereferences the result to obtain the tensor's value\n", + "\n", + "This can be expressed mathematically as:\n", + "\n", + "```\n", + "T(c) = (E ∘ L)(c) = *(E + L(c))\n", + "```\n", + "\n", + "## Example Usage\n", + "\n", + "Here's a simple example of creating a tensor using pointer and layout `(8,5):(5,1)` and fill with ones:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "@cute.jit\n", + "def create_tensor_from_ptr(ptr: cute.Pointer):\n", + " layout = cute.make_layout((8, 5), stride=(5, 1))\n", + " tensor = cute.make_tensor(ptr, layout)\n", + " tensor.fill(1)\n", + " cute.print_tensor(tensor)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This creates a tensor where:\n", + "- The engine is a pointer\n", + "- The layout with shape `(8, 5)` and stride `(5, 1)`\n", + "- The resulting tensor can be evaluated using coordinates defined by the layout\n", + "\n", + "We can test this by allocating buffer with torch and run test with pointer to torch tensor" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor(raw_ptr(0x000000000736b0c0: f32, generic, align<4>) o (8,5):(5,1), data=\n", + " [[ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ],\n", + " [ 
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ],\n", + " [ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ],\n", + " ...\n", + " [ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ],\n", + " [ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ],\n", + " [ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ]])\n" + ] + } + ], + "source": [ + "import torch\n", + "\n", + "from cutlass.torch import dtype as torch_dtype\n", + "import cutlass.cute.runtime as cute_rt\n", + "\n", + "a = torch.randn(8, 5, dtype=torch_dtype(cutlass.Float32))\n", + "ptr_a = cute_rt.make_ptr(cutlass.Float32, a.data_ptr())\n", + "\n", + "create_tensor_from_ptr(ptr_a)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DLPACK support \n", + "\n", + "CuTe DSL is designed to support dlpack protocol natively. This offers easy integration with frameworks \n", + "supporting DLPack, e.g. torch, numpy, jax, tensorflow, etc.\n", + "\n", + "For more information, please refer to DLPACK project: https://github.com/dmlc/dlpack\n", + "\n", + "Calling `from_dlpack` can convert any tensor or ndarray object supporting `__dlpack__` and `__dlpack_device__`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from cutlass.cute.runtime import from_dlpack\n", + "\n", + "@cute.jit\n", + "def print_tensor_dlpack(src: cute.Tensor):\n", + " print(src)\n", + " cute.print_tensor(src)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor o (8,5):(5,1)>\n", + "tensor(raw_ptr(0x0000000007559340: f32, generic, align<4>) o (8,5):(5,1), data=\n", + " [[-1.151769, 1.019397, -0.371175, -0.717776, 0.502176, ],\n", + " [ 0.114282, 0.900084, 0.320770, 1.564574, -0.632329, ],\n", + " [-0.570140, 0.178112, -0.423079, 1.936198, 0.003355, ],\n", + " ...\n", + " [-2.425393, -0.275528, 1.267157, -0.811101, -0.985456, ],\n", + " [ 
0.777889, -2.114074, 0.357184, -0.321312, -0.938138, ],\n", + " [ 1.959564, 1.797602, 0.116901, 0.306198, -1.837295, ]])\n" + ] + } + ], + "source": [ + "a = torch.randn(8, 5, dtype=torch_dtype(cutlass.Float32))\n", + "\n", + "print_tensor_dlpack(from_dlpack(a))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor o (8,8):(8,1)>\n", + "tensor(raw_ptr(0x0000000007979da0: f32, generic, align<4>) o (8,8):(8,1), data=\n", + " [[ 0.122739, -0.605744, -1.442022, ..., -0.356501, -0.993329, -0.091110, ],\n", + " [ 0.278448, 0.318482, -0.276867, ..., 1.542181, -1.701539, -0.309454, ],\n", + " [ 0.563565, -0.753936, 0.131214, ..., 0.437912, -0.482277, -0.051540, ],\n", + " ...\n", + " [-1.974096, -0.177881, 0.426807, ..., -1.579115, -0.304974, 0.451164, ],\n", + " [ 0.149851, -0.704689, -0.295063, ..., -0.653001, 0.008871, 0.903916, ],\n", + " [ 1.188619, 1.519662, 1.270734, ..., 0.404082, 0.173200, 0.093476, ]])\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "a = np.random.randn(8, 8).astype(np.float32)\n", + "\n", + "print_tensor_dlpack(from_dlpack(a))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tensor Evaluation Methods\n", + "\n", + "Tensors support two primary methods of evaluation:\n", + "\n", + "### 1. Full Evaluation\n", + "When applying the tensor evaluation with a complete coordinate c, it computes the offset, applies it to the engine, \n", + "and dereferences it to return the stored value. This is the straightforward case where you want to access \n", + "a specific element of the tensor.\n", + "\n", + "### 2. 
Partial Evaluation (Slicing)\n", + "When evaluating with an incomplete coordinate c = c' ⊕ c* (where c* represents the unspecified portion), \n", + "the result is a new tensor which is a slice of the original tensor with its engine offset to account for \n", + "the coordinates that were provided. This operation can be expressed as:\n", + "\n", + "```\n", + "T(c) = (E ∘ L)(c) = (E + L(c')) ∘ L(c*) = T'(c*)\n", + "```\n", + "\n", + "Slicing effectively reduces the dimensionality of the tensor, creating a sub-tensor that can be \n", + "further evaluated or manipulated." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a[2] = 10.000000 (equivalent to a[(2,0)])\n", + "a[9] = 6.000000 (equivalent to a[(1,1)])\n", + "a[2,0] = 10.000000\n", + "a[2,4] = 14.000000\n", + "a[(2,4)] = 14.000000\n", + "a[2,3] = 100.000000\n", + "a[(2,4)] = 101.000000\n", + "tensor([[ 0., 1., 2., 3., 4.],\n", + " [ 5., 6., 7., 8., 9.],\n", + " [ 10., 11., 12., 100., 101.],\n", + " [ 15., 16., 17., 18., 19.],\n", + " [ 20., 21., 22., 23., 24.],\n", + " [ 25., 26., 27., 28., 29.],\n", + " [ 30., 31., 32., 33., 34.],\n", + " [ 35., 36., 37., 38., 39.]])\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def tensor_access_item(a: cute.Tensor):\n", + " # access data using linear index\n", + " cute.printf(\"a[2] = {} (equivalent to a[{}])\", a[2],\n", + " cute.make_identity_tensor(a.layout.shape)[2])\n", + " cute.printf(\"a[9] = {} (equivalent to a[{}])\", a[9],\n", + " cute.make_identity_tensor(a.layout.shape)[9])\n", + "\n", + " # access data using n-d coordinates, following two are equivalent\n", + " cute.printf(\"a[2,0] = {}\", a[2, 0])\n", + " cute.printf(\"a[2,4] = {}\", a[2, 4])\n", + " cute.printf(\"a[(2,4)] = {}\", a[2, 4])\n", + "\n", + " # assign value to tensor@(2,4)\n", + " a[2,3] = 100.0\n", + " a[2,4] = 101.0\n", + " cute.printf(\"a[2,3] = {}\", a[2,3])\n", + " 
cute.printf(\"a[(2,4)] = {}\", a[(2,4)])\n", + "\n", + "@cute.kernel\n", + "def print_tensor_gpu(ptr: cute.Pointer):\n", + " layout = cute.make_layout((8, 5), stride=(5, 1))\n", + " tensor = cute.make_tensor(ptr, layout)\n", + "\n", + " tidx, _, _ = cute.arch.thread_idx()\n", + "\n", + " if tidx == 0:\n", + " cute.print_tensor(tensor)\n", + "\n", + "\n", + "# Create a tensor with sequential data using torch\n", + "data = torch.arange(0, 8*5, dtype=torch.float32).reshape(8, 5)\n", + "tensor_access_item(from_dlpack(data))\n", + "\n", + "print(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tensor as memory view\n", + "\n", + "In CUDA programming, different memory spaces have different characteristics in terms of access speed, scope, and lifetime:\n", + "\n", + "- **generic**: Default memory space that can refer to any other memory space.\n", + "- **global memory (gmem)**: Accessible by all threads across all blocks, but has higher latency.\n", + "- **shared memory (smem)**: Accessible by all threads within a block, with much lower latency than global memory.\n", + "- **register memory (rmem)**: Thread-private memory with the lowest latency, but limited capacity.\n", + "- **tensor memory (tmem)**: Specialized memory introduced in NVIDIA Blackwell architecture for tensor operations.\n", + "\n", + "When creating tensors in CuTe, you can specify the memory space to optimize performance based on your access patterns.\n", + "\n", + "For more information on CUDA memory spaces, see the [CUDA Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#memory-hierarchy).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Coordinate Tensor\n", + "\n", + "A coordinate tensor is a special type of tensor that maps coordinates to coordinates rather than to values. 
\n", + "The key distinction is that while regular tensors map coordinates to some value type (like numbers), \n", + "coordinate tensors map coordinates to other coordinates.\n", + "\n", + "For example, given a shape (4,4), a coordinate tensor using row-major layout would appear as:\n", + "\n", + "\\begin{bmatrix} \n", + "(0,0) & (0,1) & (0,2) & (0,3) \\\\\n", + "(1,0) & (1,1) & (1,2) & (1,3) \\\\\n", + "(2,0) & (2,1) & (2,2) & (2,3) \\\\\n", + "(3,0) & (3,1) & (3,2) & (3,3)\n", + "\\end{bmatrix}\n", + "\n", + "The same shape with a column-major layout would appear as:\n", + "\n", + "\\begin{bmatrix}\n", + "(0,0) & (1,0) & (2,0) & (3,0) \\\\\n", + "(0,1) & (1,1) & (2,1) & (3,1) \\\\\n", + "(0,2) & (1,2) & (2,2) & (3,2) \\\\\n", + "(0,3) & (1,3) & (2,3) & (3,3)\n", + "\\end{bmatrix}\n", + "\n", + "The key points about coordinate tensors are:\n", + "- Each element in the tensor is itself a coordinate tuple (i,j) rather than a scalar value\n", + "- The coordinates map to themselves - so position (1,2) contains the coordinate (1,2)\n", + "- The layout (row-major vs column-major) determines how these coordinate tuples are arranged in memory\n", + "\n", + "For example, coordinate tensors can be created using the `make_identity_tensor` utility:\n", + "\n", + "```python\n", + "coord_tensor = make_identity_tensor(layout.shape())\n", + "```\n", + "\n", + "This creates a tensor that maps each coordinate to itself, providing a reference point for understanding how other layouts transform these coordinates." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor<(0,0) o (8,4):(1@0,1@1)>\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def print_tensor_coord(a: cute.Tensor):\n", + " coord_tensor = cute.make_identity_tensor(a.layout.shape)\n", + " print(coord_tensor)\n", + "\n", + "a = torch.randn(8,4, dtype=torch_dtype(cutlass.Float32))\n", + "print_tensor_coord(from_dlpack(a))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/python/CuTeDSL/notebooks/tensorssa.ipynb b/examples/python/CuTeDSL/notebooks/tensorssa.ipynb new file mode 100644 index 00000000..8d83e02e --- /dev/null +++ b/examples/python/CuTeDSL/notebooks/tensorssa.ipynb @@ -0,0 +1,558 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import cutlass\n", + "import cutlass.cute as cute\n", + "from cutlass.cute.runtime import from_dlpack\n", + "\n", + "import numpy as np\n", + "import torch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction to the TensorSSA in CuTe DSL\n", + "\n", + "This tutorial introduces what is the `TensorSSA` and why we need it. 
We also give some examples to show how to use `TensorSSA`.\n", + "\n", + "## What is TensorSSA\n", + "\n", + "`TensorSSA` is a Python class that represents a tensor value in Static Single Assignment (SSA) form within the CuTe DSL. You can think of it as a tensor residing in a (simulated) register.\n", + "\n", + "## Why TensorSSA\n", + "\n", + "`TensorSSA` encapsulates the underlying MLIR tensor value into an object that's easier to manipulate in Python. By overloading numerous Python operators (like `+`, `-`, `*`, `/`, `[]`, etc.), it allows users to express tensor computations (primarily element-wise operations and reductions) in a more Pythonic way. These element-wise operations are then translated into optimized vectorization instructions.\n", + "\n", + "It's part of the CuTe DSL, serving as a bridge between the user-described computational logic and the lower-level MLIR IR, particularly for representing and manipulating register-level data.\n", + "\n", + "## When to use TensorSSA\n", + "\n", + "`TensorSSA` is primarily used in the following scenarios:\n", + "\n", + "### Load from memory and store to memory" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a_vec: tensor_value o (3, 4)>\n", + "b_vec: tensor_value o (3, 4)>\n", + "tensor(raw_ptr(0x0000000006cff170: f32, generic, align<4>) o (3,4):(4,1), data=\n", + " [[ 2.000000, 2.000000, 2.000000, 2.000000, ],\n", + " [ 2.000000, 2.000000, 2.000000, 2.000000, ],\n", + " [ 2.000000, 2.000000, 2.000000, 2.000000, ]])\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def load_and_store(res: cute.Tensor, a: cute.Tensor, b: cute.Tensor):\n", + " \"\"\"\n", + " Load data from memory and store the result to memory.\n", + "\n", + " :param res: The destination tensor to store the result.\n", + " :param a: The source tensor to be loaded.\n", + " :param b: The source tensor to be loaded.\n", + " \"\"\"\n", + " 
a_vec = a.load()\n", + " print(f\"a_vec: {a_vec}\") # prints `a_vec: vector<12xf32> o (3, 4)`\n", + " b_vec = b.load()\n", + " print(f\"b_vec: {b_vec}\") # prints `b_vec: vector<12xf32> o (3, 4)`\n", + " res.store(a_vec + b_vec)\n", + " cute.print_tensor(res)\n", + "\n", + "a = np.ones(12).reshape((3, 4)).astype(np.float32)\n", + "b = np.ones(12).reshape((3, 4)).astype(np.float32)\n", + "c = np.zeros(12).reshape((3, 4)).astype(np.float32)\n", + "load_and_store(from_dlpack(c), from_dlpack(a), from_dlpack(b))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Register-Level Tensor Operations\n", + "\n", + "When writing kernel logic, various computations, transformations, slicing, etc., are performed on data loaded into registers." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_value o (4, 2, 3)> -> tensor_value o (4, 3)>\n", + "tensor(raw_ptr(0x00000000071acaf0: f32, generic, align<4>) o (4,3):(3,1), data=\n", + " [[ 3.000000, 4.000000, 5.000000, ],\n", + " [ 9.000000, 10.000000, 11.000000, ],\n", + " [ 15.000000, 16.000000, 17.000000, ],\n", + " [ 21.000000, 22.000000, 23.000000, ]])\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def apply_slice(src: cute.Tensor, dst: cute.Tensor, indices: cutlass.Constexpr):\n", + " \"\"\"\n", + " Apply slice operation on the src tensor and store the result to the dst tensor.\n", + "\n", + " :param src: The source tensor to be sliced.\n", + " :param dst: The destination tensor to store the result.\n", + " :param indices: The indices to slice the source tensor.\n", + " \"\"\"\n", + " src_vec = src.load()\n", + " dst_vec = src_vec[indices]\n", + " print(f\"{src_vec} -> {dst_vec}\")\n", + " if isinstance(dst_vec, cute.TensorSSA):\n", + " dst.store(dst_vec)\n", + " cute.print_tensor(dst)\n", + " else:\n", + " dst[0] = dst_vec\n", + " cute.print_tensor(dst)\n", + "\n", + "def 
slice_1():\n", + " src_shape = (4, 2, 3)\n", + " dst_shape = (4, 3)\n", + " indices = (None, 1, None)\n", + "\n", + " \"\"\"\n", + " a:\n", + " [[[ 0. 1. 2.]\n", + " [ 3. 4. 5.]]\n", + "\n", + " [[ 6. 7. 8.]\n", + " [ 9. 10. 11.]]\n", + "\n", + " [[12. 13. 14.]\n", + " [15. 16. 17.]]\n", + "\n", + " [[18. 19. 20.]\n", + " [21. 22. 23.]]]\n", + " \"\"\"\n", + " a = np.arange(np.prod(src_shape)).reshape(*src_shape).astype(np.float32)\n", + " dst = np.random.randn(*dst_shape).astype(np.float32)\n", + " apply_slice(from_dlpack(a), from_dlpack(dst), indices)\n", + "\n", + "slice_1()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_value o (4, 2, 3)> -> ?\n", + "tensor(raw_ptr(0x00000000013cbbe0: f32, generic, align<4>) o (1):(1), data=\n", + " [ 10.000000, ])\n" + ] + } + ], + "source": [ + "def slice_2():\n", + " src_shape = (4, 2, 3)\n", + " dst_shape = (1,)\n", + " indices = 10\n", + " a = np.arange(np.prod(src_shape)).reshape(*src_shape).astype(np.float32)\n", + " dst = np.random.randn(*dst_shape).astype(np.float32)\n", + " apply_slice(from_dlpack(a), from_dlpack(dst), indices)\n", + "\n", + "slice_2()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Arithmetic Operations\n", + "\n", + "As we mentioned earlier, there're many tensor operations whose operands are `TensorSSA`. And they are all element-wise operations. We give some examples below.\n", + "\n", + "### Binary Operations\n", + "\n", + "For binary operations, the LHS operand is `TensorSSA` and the RHS operand can be either `TensorSSA` or `Numeric`. When the RHS is `Numeric`, it will be broadcast to a `TensorSSA`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor(raw_ptr(0x00000000074f0e70: f32, generic, align<4>) o (3):(1), data=\n", + " [ 3.000000, ],\n", + " [ 3.000000, ],\n", + " [ 3.000000, ])\n", + "tensor(raw_ptr(0x00000000074f0e70: f32, generic, align<4>) o (3):(1), data=\n", + " [-1.000000, ],\n", + " [-1.000000, ],\n", + " [-1.000000, ])\n", + "tensor(raw_ptr(0x00000000074f0e70: f32, generic, align<4>) o (3):(1), data=\n", + " [ 2.000000, ],\n", + " [ 2.000000, ],\n", + " [ 2.000000, ])\n", + "tensor(raw_ptr(0x00000000074f0e70: f32, generic, align<4>) o (3):(1), data=\n", + " [ 0.500000, ],\n", + " [ 0.500000, ],\n", + " [ 0.500000, ])\n", + "tensor(raw_ptr(0x00000000074f0e70: f32, generic, align<4>) o (3):(1), data=\n", + " [ 0.000000, ],\n", + " [ 0.000000, ],\n", + " [ 0.000000, ])\n", + "tensor(raw_ptr(0x00000000074f0e70: f32, generic, align<4>) o (3):(1), data=\n", + " [ 1.000000, ],\n", + " [ 1.000000, ],\n", + " [ 1.000000, ])\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def binary_op_1(res: cute.Tensor, a: cute.Tensor, b: cute.Tensor):\n", + " a_vec = a.load()\n", + " b_vec = b.load()\n", + "\n", + " add_res = a_vec + b_vec\n", + " res.store(add_res)\n", + " cute.print_tensor(res) # prints [3.000000, 3.000000, 3.000000]\n", + "\n", + " sub_res = a_vec - b_vec\n", + " res.store(sub_res)\n", + " cute.print_tensor(res) # prints [-1.000000, -1.000000, -1.000000]\n", + "\n", + " mul_res = a_vec * b_vec\n", + " res.store(mul_res)\n", + " cute.print_tensor(res) # prints [2.000000, 2.000000, 2.000000]\n", + "\n", + " div_res = a_vec / b_vec\n", + " res.store(div_res)\n", + " cute.print_tensor(res) # prints [0.500000, 0.500000, 0.500000]\n", + "\n", + " floor_div_res = a_vec // b_vec\n", + " res.store(floor_div_res)\n", + " cute.print_tensor(res) # prints [0.000000, 0.000000, 0.000000]\n", + "\n", + " mod_res = a_vec % b_vec\n", + " 
res.store(mod_res)\n", + " cute.print_tensor(res) # prints [1.000000, 1.000000, 1.000000]\n", + "\n", + "\n", + "a = np.empty((3,), dtype=np.float32)\n", + "a.fill(1.0)\n", + "b = np.empty((3,), dtype=np.float32)\n", + "b.fill(2.0)\n", + "res = np.empty((3,), dtype=np.float32)\n", + "binary_op_1(from_dlpack(res), from_dlpack(a), from_dlpack(b))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor(raw_ptr(0x0000000007828ed0: f32, generic, align<4>) o (3):(1), data=\n", + " [ 3.000000, ],\n", + " [ 3.000000, ],\n", + " [ 3.000000, ])\n", + "tensor(raw_ptr(0x0000000007828ed0: f32, generic, align<4>) o (3):(1), data=\n", + " [-1.000000, ],\n", + " [-1.000000, ],\n", + " [-1.000000, ])\n", + "tensor(raw_ptr(0x0000000007828ed0: f32, generic, align<4>) o (3):(1), data=\n", + " [ 2.000000, ],\n", + " [ 2.000000, ],\n", + " [ 2.000000, ])\n", + "tensor(raw_ptr(0x0000000007828ed0: f32, generic, align<4>) o (3):(1), data=\n", + " [ 0.500000, ],\n", + " [ 0.500000, ],\n", + " [ 0.500000, ])\n", + "tensor(raw_ptr(0x0000000007828ed0: f32, generic, align<4>) o (3):(1), data=\n", + " [ 0.000000, ],\n", + " [ 0.000000, ],\n", + " [ 0.000000, ])\n", + "tensor(raw_ptr(0x0000000007828ed0: f32, generic, align<4>) o (3):(1), data=\n", + " [ 1.000000, ],\n", + " [ 1.000000, ],\n", + " [ 1.000000, ])\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def binary_op_2(res: cute.Tensor, a: cute.Tensor, c: cutlass.Constexpr):\n", + " a_vec = a.load()\n", + "\n", + " add_res = a_vec + c\n", + " res.store(add_res)\n", + " cute.print_tensor(res) # prints [3.000000, 3.000000, 3.000000]\n", + "\n", + " sub_res = a_vec - c\n", + " res.store(sub_res)\n", + " cute.print_tensor(res) # prints [-1.000000, -1.000000, -1.000000]\n", + "\n", + " mul_res = a_vec * c\n", + " res.store(mul_res)\n", + " cute.print_tensor(res) # prints [2.000000, 2.000000, 2.000000]\n", + "\n", + " div_res = a_vec / 
c\n", + " res.store(div_res)\n", + " cute.print_tensor(res) # prints [0.500000, 0.500000, 0.500000]\n", + "\n", + " floor_div_res = a_vec // c\n", + " res.store(floor_div_res)\n", + " cute.print_tensor(res) # prints [0.000000, 0.000000, 0.000000]\n", + "\n", + " mod_res = a_vec % c\n", + " res.store(mod_res)\n", + " cute.print_tensor(res) # prints [1.000000, 1.000000, 1.000000]\n", + "\n", + "a = np.empty((3,), dtype=np.float32)\n", + "a.fill(1.0)\n", + "c = 2.0\n", + "res = np.empty((3,), dtype=np.float32)\n", + "binary_op_2(from_dlpack(res), from_dlpack(a), c)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[False True False]\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def binary_op_3(res: cute.Tensor, a: cute.Tensor, b: cute.Tensor):\n", + " a_vec = a.load()\n", + " b_vec = b.load()\n", + "\n", + " gt_res = a_vec > b_vec\n", + " res.store(gt_res)\n", + "\n", + " \"\"\"\n", + " ge_res = a_ >= b_ # [False, True, False]\n", + " lt_res = a_ < b_ # [True, False, True]\n", + " le_res = a_ <= b_ # [True, False, True]\n", + " eq_res = a_ == b_ # [False, False, False]\n", + " \"\"\"\n", + "\n", + "a = np.array([1, 2, 3], dtype=np.float32)\n", + "b = np.array([2, 1, 4], dtype=np.float32)\n", + "res = np.empty((3,), dtype=np.bool_)\n", + "binary_op_3(from_dlpack(res), from_dlpack(a), from_dlpack(b))\n", + "print(res) # prints [False, True, False]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[3 0 7]\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def binary_op_4(res: cute.Tensor, a: cute.Tensor, b: cute.Tensor):\n", + " a_vec = a.load()\n", + " b_vec = b.load()\n", + "\n", + " xor_res = a_vec ^ b_vec\n", + " res.store(xor_res)\n", + "\n", + " # or_res = a_vec | b_vec\n", + " # res.store(or_res) # prints [3, 2, 7]\n", + "\n", + " # and_res = 
a_vec & b_vec\n", + " # res.store(and_res) # prints [0, 2, 0]\n", + "\n", + "a = np.array([1, 2, 3], dtype=np.int32)\n", + "b = np.array([2, 2, 4], dtype=np.int32)\n", + "res = np.empty((3,), dtype=np.int32)\n", + "binary_op_4(from_dlpack(res), from_dlpack(a), from_dlpack(b))\n", + "print(res) # prints [3, 0, 7]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Unary Operations" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor(raw_ptr(0x0000000007fbd180: f32, generic, align<4>) o (3):(1), data=\n", + " [ 2.000000, ],\n", + " [ 2.000000, ],\n", + " [ 2.000000, ])\n", + "tensor(raw_ptr(0x0000000007fbd180: f32, generic, align<4>) o (3):(1), data=\n", + " [-0.756802, ],\n", + " [-0.756802, ],\n", + " [-0.756802, ])\n", + "tensor(raw_ptr(0x0000000007fbd180: f32, generic, align<4>) o (3):(1), data=\n", + " [ 16.000000, ],\n", + " [ 16.000000, ],\n", + " [ 16.000000, ])\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def unary_op_1(res: cute.Tensor, a: cute.Tensor):\n", + " a_vec = a.load()\n", + "\n", + " sqrt_res = cute.math.sqrt(a_vec)\n", + " res.store(sqrt_res)\n", + " cute.print_tensor(res) # prints [2.000000, 2.000000, 2.000000]\n", + "\n", + " sin_res = cute.math.sin(a_vec)\n", + " res.store(sin_res)\n", + " cute.print_tensor(res) # prints [-0.756802, -0.756802, -0.756802]\n", + "\n", + " exp2_res = cute.math.exp2(a_vec)\n", + " res.store(exp2_res)\n", + " cute.print_tensor(res) # prints [16.000000, 16.000000, 16.000000]\n", + "\n", + "a = np.array([4.0, 4.0, 4.0], dtype=np.float32)\n", + "res = np.empty((3,), dtype=np.float32)\n", + "unary_op_1(from_dlpack(res), from_dlpack(a))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reduction Operation\n", + "\n", + "The `TensorSSA`'s `reduce` method applies a specified reduction operation (`ReductionOp.ADD`, `ReductionOp.MUL`, 
`ReductionOp.MAX`, `ReductionOp.MIN`) starting with an initial value, and performs this reduction along the dimensions specified by the `reduction_profile.`. The result is typically a new `TensorSSA` with reduced dimensions or a scalar value if reduces across all axes." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "21.000000\n", + "tensor(raw_ptr(0x00007ffd1ea2bca0: f32, rmem, align<32>) o (2):(1), data=\n", + " [ 6.000000, ],\n", + " [ 15.000000, ])\n", + "tensor(raw_ptr(0x00007ffd1ea2bcc0: f32, rmem, align<32>) o (3):(1), data=\n", + " [ 6.000000, ],\n", + " [ 8.000000, ],\n", + " [ 10.000000, ])\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def reduction_op(a: cute.Tensor):\n", + " \"\"\"\n", + " Apply reduction operation on the src tensor.\n", + "\n", + " :param src: The source tensor to be reduced.\n", + " \"\"\"\n", + " a_vec = a.load()\n", + " red_res = a_vec.reduce(\n", + " cute.ReductionOp.ADD,\n", + " 0.0,\n", + " reduction_profile=0\n", + " )\n", + " cute.printf(red_res) # prints 21.000000\n", + "\n", + " red_res = a_vec.reduce(\n", + " cute.ReductionOp.ADD,\n", + " 0.0,\n", + " reduction_profile=(None, 1)\n", + " )\n", + " # We can't print the TensorSSA directly at this point, so we store it to a new Tensor and print it.\n", + " res = cute.make_fragment(red_res.shape, cutlass.Float32)\n", + " res.store(red_res)\n", + " cute.print_tensor(res) # prints [6.000000, 15.000000]\n", + "\n", + " red_res = a_vec.reduce(\n", + " cute.ReductionOp.ADD,\n", + " 1.0,\n", + " reduction_profile=(1, None)\n", + " )\n", + " res = cute.make_fragment(red_res.shape, cutlass.Float32)\n", + " res.store(red_res)\n", + " cute.print_tensor(res) # prints [6.000000, 8.000000, 10.000000]\n", + "\n", + "\n", + "a = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)\n", + "reduction_op(from_dlpack(a))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": 
"Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/python/00_basic_gemm.ipynb b/examples/python/deprecated/00_basic_gemm.ipynb similarity index 100% rename from examples/python/00_basic_gemm.ipynb rename to examples/python/deprecated/00_basic_gemm.ipynb diff --git a/examples/python/01_epilogue.ipynb b/examples/python/deprecated/01_epilogue.ipynb similarity index 100% rename from examples/python/01_epilogue.ipynb rename to examples/python/deprecated/01_epilogue.ipynb diff --git a/examples/python/02_pytorch_extension_grouped_gemm.ipynb b/examples/python/deprecated/02_pytorch_extension_grouped_gemm.ipynb similarity index 100% rename from examples/python/02_pytorch_extension_grouped_gemm.ipynb rename to examples/python/deprecated/02_pytorch_extension_grouped_gemm.ipynb diff --git a/examples/python/03_basic_conv2d.ipynb b/examples/python/deprecated/03_basic_conv2d.ipynb similarity index 100% rename from examples/python/03_basic_conv2d.ipynb rename to examples/python/deprecated/03_basic_conv2d.ipynb diff --git a/examples/python/04_epilogue_visitor.ipynb b/examples/python/deprecated/04_epilogue_visitor.ipynb similarity index 100% rename from examples/python/04_epilogue_visitor.ipynb rename to examples/python/deprecated/04_epilogue_visitor.ipynb diff --git a/examples/python/README.md b/examples/python/deprecated/README.md similarity index 100% rename from examples/python/README.md rename to examples/python/deprecated/README.md diff --git a/include/cute/arch/config.hpp b/include/cute/arch/config.hpp index 2383b4e6..8ec8ffb2 100644 --- a/include/cute/arch/config.hpp +++ b/include/cute/arch/config.hpp @@ -93,6 +93,7 @@ # define 
CUTE_ARCH_STSM_SM90_ENABLED #endif + #if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101A_ENABLED)) # define CUTE_ARCH_TCGEN05_S8_MMA_ENABLED #endif diff --git a/include/cute/arch/mma_sm100_umma.hpp b/include/cute/arch/mma_sm100_umma.hpp index 4b6d7f86..f754e266 100644 --- a/include/cute/arch/mma_sm100_umma.hpp +++ b/include/cute/arch/mma_sm100_umma.hpp @@ -1394,7 +1394,11 @@ struct SM100_MMA_MXF4_SS "{\n\t" ".reg .pred p;\n\t" "setp.ne.b32 p, %4, 0;\n\t" +#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9) + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block16 [%0], %1, %2, %3, [%5], [%6], p; \n\t" +#else "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%5], [%6], p; \n\t" +#endif "}\n" : : "r"(tmem_c), "l"(desc_a), "l"(desc_b), "r"(uint32_t(idescE>>32)), "r"(scaleC), @@ -1411,7 +1415,11 @@ struct SM100_MMA_MXF4_SS "{\n\t" ".reg .pred p;\n\t" "setp.ne.b32 p, %4, 0;\n\t" +#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9) + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.block32 [%0], %1, %2, %3, [%5], [%6], p; \n\t" +#else "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%5], [%6], p; \n\t" +#endif "}\n" : : "r"(tmem_c), "l"(desc_a), "l"(desc_b), "r"(uint32_t(idescE>>32)), "r"(scaleC), @@ -1457,7 +1465,11 @@ struct SM100_MMA_MXF4NVF4_SS_SPARSE "{\n\t" ".reg .pred p;\n\t" "setp.ne.b32 p, %4, 0;\n\t" +#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9) + "tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block16 [%0], %1, %2, [%7], %3, [%5], [%6], p; \n\t" +#else "tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, [%7], %3, [%5], [%6], p; \n\t" +#endif "}\n" : : "r"(tmem_c), "l"(desc_a), "l"(desc_b), "r"(uint32_t(idescE>>32)), "r"(scaleC), @@ -1475,7 +1487,11 @@ struct SM100_MMA_MXF4NVF4_SS_SPARSE 
"{\n\t" ".reg .pred p;\n\t" "setp.ne.b32 p, %4, 0;\n\t" +#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9) + "tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.block32 [%0], %1, %2, [%7], %3, [%5], [%6], p; \n\t" +#else "tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, [%7], %3, [%5], [%6], p; \n\t" +#endif "}\n" : : "r"(tmem_c), "l"(desc_a), "l"(desc_b), "r"(uint32_t(idescE>>32)), "r"(scaleC), @@ -1520,7 +1536,11 @@ struct SM100_MMA_MXF4_2x1SM_SS "{\n\t" ".reg .pred p;\n\t" "setp.ne.b32 p, %4, 0;\n\t" +#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9) + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block16 [%0], %1, %2, %3, [%5], [%6], p; \n\t" +#else "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%5], [%6], p; \n\t" +#endif "}\n" : : "r"(tmem_c), "l"(desc_a), "l"(desc_b), "r"(uint32_t(idescE>>32)), "r"(scaleC), @@ -1537,7 +1557,11 @@ struct SM100_MMA_MXF4_2x1SM_SS "{\n\t" ".reg .pred p;\n\t" "setp.ne.b32 p, %4, 0;\n\t" +#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9) + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.block32 [%0], %1, %2, %3, [%5], [%6], p; \n\t" +#else "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%5], [%6], p; \n\t" +#endif "}\n" : : "r"(tmem_c), "l"(desc_a), "l"(desc_b), "r"(uint32_t(idescE>>32)), "r"(scaleC), @@ -1582,7 +1606,11 @@ struct SM100_MMA_MXF4NVF4_2x1SM_SS_SPARSE "{\n\t" ".reg .pred p;\n\t" "setp.ne.b32 p, %4, 0;\n\t" +#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9) + "tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block16 [%0], %1, %2, [%7], %3, [%5], [%6], p; \n\t" +#else "tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, [%7], %3, [%5], [%6], p; \n\t" +#endif "}\n" : : "r"(tmem_c), "l"(desc_a), 
"l"(desc_b), "r"(uint32_t(idescE>>32)), "r"(scaleC), @@ -1600,7 +1628,11 @@ struct SM100_MMA_MXF4NVF4_2x1SM_SS_SPARSE "{\n\t" ".reg .pred p;\n\t" "setp.ne.b32 p, %4, 0;\n\t" +#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9) + "tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.block32 [%0], %1, %2, [%7], %3, [%5], [%6], p; \n\t" +#else "tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, [%7], %3, [%5], [%6], p; \n\t" +#endif "}\n" : : "r"(tmem_c), "l"(desc_a), "l"(desc_b), "r"(uint32_t(idescE>>32)), "r"(scaleC), diff --git a/include/cute/atom/copy_traits_sm100_tma.hpp b/include/cute/atom/copy_traits_sm100_tma.hpp index 851db289..0212db11 100644 --- a/include/cute/atom/copy_traits_sm100_tma.hpp +++ b/include/cute/atom/copy_traits_sm100_tma.hpp @@ -104,7 +104,7 @@ struct Copy_Traits auto get_tma_tensor(GShape const& g_shape) const { static_assert(is_congruent::value); - return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_)); + return make_coord_tensor(make_layout(g_shape, aux_params_.g_stride_)); } // Don't try to execute a copy with SM100_TMA_2SM_LOAD before calling .with() @@ -192,7 +192,7 @@ struct Copy_Traits auto get_tma_tensor(GShape const& g_shape) const { static_assert(is_congruent::value); - return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_)); + return make_coord_tensor(make_layout(g_shape, aux_params_.g_stride_)); } // Don't try to execute a copy with SM100_TMA_2SM_LOAD_MULTICAST_OP before calling .with() diff --git a/include/cute/atom/copy_traits_sm90_tma.hpp b/include/cute/atom/copy_traits_sm90_tma.hpp index ad668cee..209a8448 100644 --- a/include/cute/atom/copy_traits_sm90_tma.hpp +++ b/include/cute/atom/copy_traits_sm90_tma.hpp @@ -146,7 +146,7 @@ struct Copy_Traits auto get_tma_tensor(GShape const& g_shape) const { static_assert(is_congruent::value); - return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_)); + return 
make_coord_tensor(make_layout(g_shape, aux_params_.g_stride_)); } // Don't try to execute a copy with SM90_TMA_LOAD before calling .with() @@ -276,7 +276,7 @@ struct Copy_Traits auto get_tma_tensor(GShape const& g_shape) const { static_assert(is_congruent::value); - return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_)); + return make_coord_tensor(make_layout(g_shape, aux_params_.g_stride_)); } // Don't try to execute a copy with SM90_TMA_LOAD_MULTICAST before calling .with() @@ -350,7 +350,7 @@ struct Copy_Traits auto get_tma_tensor(GShape const& g_shape) const { static_assert(is_congruent::value); - return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_)); + return make_coord_tensor(make_layout(g_shape, aux_params_.g_stride_)); } // Construct new TMA_STORE with (unsafe) swapped out TMA descriptor ptr (for grouped gemm/ptr array gemm) @@ -463,7 +463,7 @@ struct Copy_Traits auto get_tma_tensor(GShape const& g_shape) const { static_assert(is_congruent::value); - return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_)); + return make_coord_tensor(make_layout(g_shape, aux_params_.g_stride_)); } template diff --git a/include/cute/tensor_impl.hpp b/include/cute/tensor_impl.hpp index 0d914488..e65ad419 100644 --- a/include/cute/tensor_impl.hpp +++ b/include/cute/tensor_impl.hpp @@ -474,14 +474,14 @@ make_fragment_like(Tensor const& tensor) } // -// make_counting_tensor +// make_coord_tensor // Make a tensor from a layout by binding it to a counting iter with 0-offset of the same profile as the codomain. 
// template ::value)> CUTE_HOST_DEVICE constexpr auto -make_counting_tensor(Layout const& layout) +make_coord_tensor(Layout const& layout) { return make_tensor(make_inttuple_iter(coprofile(layout)), layout); } @@ -496,7 +496,7 @@ CUTE_HOST_DEVICE constexpr auto make_identity_tensor(Shape const& shape) { - return make_counting_tensor(make_identity_layout(shape)); + return make_coord_tensor(make_identity_layout(shape)); } // diff --git a/include/cutlass/arch/config.h b/include/cutlass/arch/config.h index e5daf829..60be8d72 100644 --- a/include/cutlass/arch/config.h +++ b/include/cutlass/arch/config.h @@ -105,10 +105,8 @@ ///////////////////////////////////////////////////////////////////////////////////////////////// - - // SM101 and SM101a -#if !CUTLASS_CLANG_CUDA && (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 8)) +#if !CUTLASS_CLANG_CUDA && (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 8) #define CUTLASS_ARCH_MMA_SM101_SUPPORTED 1 #if (!defined(CUTLASS_ARCH_MMA_SM101_ENABLED) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 1010) #define CUTLASS_ARCH_MMA_SM101_ENABLED 1 @@ -118,7 +116,7 @@ #endif // SM101f - #if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9)) + #if !CUTLASS_CLANG_CUDA && (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9) #define CUTLASS_ARCH_MMA_SM101F_SUPPORTED 1 #endif @@ -130,6 +128,8 @@ ///////////////////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// + // SM120 and SM120a #if !CUTLASS_CLANG_CUDA && (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 8)) #define CUTLASS_ARCH_MMA_SM120_SUPPORTED 1 diff --git a/include/cutlass/epilogue/collective/builders/sm120_builder.inl b/include/cutlass/epilogue/collective/builders/sm120_builder.inl index e1c1bff8..80e84e9a 100644 --- 
a/include/cutlass/epilogue/collective/builders/sm120_builder.inl +++ b/include/cutlass/epilogue/collective/builders/sm120_builder.inl @@ -284,6 +284,46 @@ struct CallbacksBuilder< >; }; +// Overload CallbacksBuilder to pick the correct copy atoms for PtrArray epilogue fusions +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + int NumEpilogueWarpgroups, + class FusionOp, + class TileShape_MNK, + class EpilogueTile_MN, + class AccLoadOp, + class ElementAccumulator +> +struct CallbacksBuilder< + Sm120PtrArrayTmaWarpSpecialized, + FusionOp, + TileShape_MNK, + EpilogueTile_MN, + ElementAccumulator, + AccLoadOp, + cute::enable_if_t<(FusionOp::IsAuxOutSupported ^ FusionOp::IsAuxInSupported) // only one aux tensor + && not cute::is_subbyte_v> +> { + using GmemStrideTypeAux = gemm::TagToStrideC_t; + using SmemLayoutAtomAux = decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom< + GmemStrideTypeAux, typename FusionOp::ElementAux, EpilogueTile_MN>()); + + using CopyOpR2S = decltype(detail::sm120_get_smem_store_op_for_accumulator()); + + using CopyOpS2R = decltype(detail::sm120_get_smem_load_op_for_source()); + + using SmemCopyOpAux = cute::conditional_t; + + using Callbacks = fusion::FusionCallbacks< + Sm120PtrArrayTmaWarpSpecialized, + FusionOp, TileShape_MNK, EpilogueTile_MN, + SmemLayoutAtomAux, SmemCopyOpAux + >; +}; // Helper for building TMA warp-specialized collective epilogues, specialized by // the fusion operation performed and the dispatch policy to use. 
diff --git a/include/cutlass/epilogue/collective/builders/sm90_builder.inl b/include/cutlass/epilogue/collective/builders/sm90_builder.inl index 9cb03fdc..ef0d7c4b 100644 --- a/include/cutlass/epilogue/collective/builders/sm90_builder.inl +++ b/include/cutlass/epilogue/collective/builders/sm90_builder.inl @@ -115,12 +115,13 @@ sm90_compute_tile_shape_or_override() { if constexpr (cute::is_same_v) { auto epi_tile = [&] () { if constexpr (detail::sm90_is_cooperative_v) { + auto tile_m = cute::min(_128{}, size<0>(TileShape_MNK{})); auto tile_n = cute::gcd(cute::min(_32{}, size<1>(TileShape_MNK{})), size<1>(TileShape_MNK{})); return make_shape(tile_m, tile_n); } else if constexpr (detail::sm90_is_warp_specialized_v) { - constexpr int N_perf = sizeof_bits_v == 8 ? 64 : 32; + constexpr int N_perf = (sizeof_bits_v == 8) && (size<1>(TileShape_MNK{}) % 64 == 0) ? 64 : 32; auto tile_m = cute::min(_64{}, size<0>(TileShape_MNK{})); auto tile_n = cute::gcd(cute::min(Int{}, size<1>(TileShape_MNK{})), size<1>(TileShape_MNK{})); return make_shape(tile_m, tile_n); @@ -194,9 +195,9 @@ struct CallbacksBuilder< using SmemLayoutAtomAux = decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom< GmemStrideTypeAux, typename FusionOp::ElementAux, EpilogueTile_MN>()); using CopyOpR2S = decltype(detail::sm90_get_smem_store_op_for_accumulator< - GmemStrideTypeAux, typename FusionOp::ElementAux>()); + GmemStrideTypeAux, typename FusionOp::ElementAux, EpilogueTile_MN>()); using CopyOpS2R = decltype(detail::sm90_get_smem_load_op_for_source< - GmemStrideTypeAux, typename FusionOp::ElementAux>()); + GmemStrideTypeAux, typename FusionOp::ElementAux, EpilogueTile_MN>()); using SmemCopyOpAux = cute::conditional_t; using Callbacks = fusion::FusionCallbacks< @@ -234,9 +235,9 @@ struct CallbacksBuilder< using SmemLayoutAtomAux = decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom< GmemStrideTypeAux, typename FusionOp::ElementAux, EpilogueTile_MN>()); using CopyOpR2S = 
decltype(detail::sm90_get_smem_store_op_for_accumulator< - GmemStrideTypeAux, typename FusionOp::ElementAux>()); + GmemStrideTypeAux, typename FusionOp::ElementAux, EpilogueTile_MN>()); using CopyOpS2R = decltype(detail::sm90_get_smem_load_op_for_source< - GmemStrideTypeAux, typename FusionOp::ElementAux>()); + GmemStrideTypeAux, typename FusionOp::ElementAux, EpilogueTile_MN>()); using SmemCopyOpAux = cute::conditional_t; using Callbacks = fusion::FusionCallbacks< @@ -316,7 +317,17 @@ struct Sm90TmaBuilderImpl { >; // Get the smallest tiled copy we can use to retile the accumulators - using CopyAtomC = Copy_Atom; + // using CopyAtomC = Copy_Atom; + using CopyAtomC = cute::conditional_t< + size<1>(EpilogueTile_MN{}) % 16 == 0, + Copy_Atom, + cute::conditional_t< + size<1>(EpilogueTile_MN{}) % 8 == 0, + Copy_Atom, + void + > + >; + static_assert(!cute::is_same_v, "CopyAtomC can't be void, divisiblity check for EpilogueTile_MN failed"); // Get register to register tiled copy that happen before shared memory store. // Apply void as no register transform op needed currently. 
using CopyOpR2R = void; @@ -343,10 +354,10 @@ struct Sm90TmaBuilderImpl { FusionCallbacks, CopyOpG2S, decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom()), - decltype(detail::sm90_get_smem_load_op_for_source()), + decltype(detail::sm90_get_smem_load_op_for_source()), CopyOpS2G, decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom()), - decltype(detail::sm90_get_smem_store_op_for_accumulator()), + decltype(detail::sm90_get_smem_store_op_for_accumulator()), CopyAtomC, CopyOpR2R >; @@ -404,7 +415,7 @@ struct AuxLoadDescriptor { >() ); using CopyOpS2R = - decltype(detail::sm90_get_smem_load_op_for_source()); + decltype(detail::sm90_get_smem_load_op_for_source()); }; // Get Stride, SmemLayout, and CopyOpS2R for AuxStore node @@ -425,7 +436,7 @@ struct AuxStoreDescriptor { >() ); using CopyOpR2S = - decltype(detail::sm90_get_smem_store_op_for_accumulator()); + decltype(detail::sm90_get_smem_store_op_for_accumulator()); }; } // namespace detail @@ -745,7 +756,7 @@ private: using SmemLayoutAtomAux = decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom< GmemStrideTypeAux, typename Schedule::ElementT, EpilogueTile_MN>()); using SmemCopyOpAux = decltype(detail::sm90_get_smem_store_op_for_accumulator< - GmemStrideTypeAux, typename Schedule::ElementT>()); + GmemStrideTypeAux, typename Schedule::ElementT, EpilogueTile_MN>()); using FusionOperationAux = fusion::LinCombPerRowBiasEltActAux< GmemLayoutTagD, Schedule::template ActivationFunctor, ElementD, ElementCompute, typename Schedule::ElementT, typename Schedule::ElementBias, ElementC_, ElementCompute @@ -769,7 +780,17 @@ private: using GmemStrideTypeD = gemm::TagToStrideC_t; // Get the smallest tiled copy we can use to retile the accumulators - using CopyAtomC = Copy_Atom; + using CopyAtomC = cute::conditional_t< + size<1>(EpilogueTile_MN{}) % 16 == 0, + Copy_Atom, + cute::conditional_t< + size<1>(EpilogueTile_MN{}) % 8 == 0, + Copy_Atom, + void + > + >; + static_assert(!cute::is_same_v, "CopyAtomC can't 
be void, divisiblity check for EpilogueTile_MN failed"); + // Get register to register tiled copy that happen before shared memory store. // Apply void as no register transform op needed. using CopyOpR2R = void; @@ -788,10 +809,10 @@ public: cute::conditional_t, SM90_TMA_LOAD, decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom()), - decltype(detail::sm90_get_smem_load_op_for_source()), + decltype(detail::sm90_get_smem_load_op_for_source()), SM90_TMA_STORE, decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom()), - decltype(detail::sm90_get_smem_store_op_for_accumulator()), + decltype(detail::sm90_get_smem_store_op_for_accumulator()), CopyAtomC, CopyOpR2R >; diff --git a/include/cutlass/epilogue/collective/builders/sm90_common.inl b/include/cutlass/epilogue/collective/builders/sm90_common.inl index a6affcfc..c0a90396 100644 --- a/include/cutlass/epilogue/collective/builders/sm90_common.inl +++ b/include/cutlass/epilogue/collective/builders/sm90_common.inl @@ -37,16 +37,26 @@ namespace cutlass::epilogue::collective::detail { /////////////////////////////////////////////////////////////////////////////// // Selects the largest vectorized smem store atom available -template +template constexpr auto sm90_get_smem_store_op_for_accumulator() { using namespace cute; if constexpr (sizeof(ElementD) == 2 && size<0>(GmemStrideTypeD{}) == 1) { - return SM90_U16x8_STSM_T{}; + if constexpr (size<1>(EpilogueTile_MN{}) % 16 == 0) { + return SM90_U16x8_STSM_T{}; + } + else if constexpr (size<1>(EpilogueTile_MN{}) % 8 == 0) { + return SM90_U16x4_STSM_T{}; + } } else if constexpr (sizeof(ElementD) == 2 && size<1>(GmemStrideTypeD{}) == 1) { - return SM90_U32x4_STSM_N{}; + if constexpr (size<1>(EpilogueTile_MN{}) % 16 == 0) { + return SM90_U32x4_STSM_N{}; + } + else if constexpr (size<1>(EpilogueTile_MN{}) % 8 == 0) { + return SM90_U32x2_STSM_N{}; + } } else { // auto-vectorizing store @@ -55,20 +65,26 @@ sm90_get_smem_store_op_for_accumulator() { } // Selects the 
largest vectorized smem load atom available -template +template constexpr auto sm90_get_smem_load_op_for_source() { using namespace cute; // Reuse the logic from smem store selector - using SmemStoreOp = decltype(sm90_get_smem_store_op_for_accumulator()); + using SmemStoreOp = decltype(sm90_get_smem_store_op_for_accumulator()); if constexpr (cute::is_same_v) { return SM75_U16x8_LDSM_T{}; } + else if constexpr (cute::is_same_v) { + return SM75_U16x4_LDSM_T{}; + } else if constexpr (cute::is_same_v) { return SM75_U32x4_LDSM_N{}; } + else if constexpr (cute::is_same_v) { + return SM75_U32x2_LDSM_N{}; + } else { // auto-vectorizing load return AutoVectorizingCopyWithAssumedAlignment<128>{}; diff --git a/include/cutlass/epilogue/collective/default_epilogue.hpp b/include/cutlass/epilogue/collective/default_epilogue.hpp index 0d019b1c..ed34bc10 100644 --- a/include/cutlass/epilogue/collective/default_epilogue.hpp +++ b/include/cutlass/epilogue/collective/default_epilogue.hpp @@ -215,8 +215,8 @@ public: Tensor cD_mn = local_tile(mD_crd, take<0,2>(blk_shape_MNK), make_coord(m_coord, n_coord)); // (BLK_M,BLK_N) Tensor tCcD_mn = thr_mma.partition_C(cD_mn); // (VEC,THR_M,THR_N) // Relative coordinate tensors (static) - Tensor cD = make_counting_tensor(cD_mn.layout()); // (BLK_M,BLK_N) - Tensor tCcD = make_counting_tensor(tCcD_mn.layout()); // (VEC,THR_M,THR_N) + Tensor cD = make_coord_tensor(cD_mn.layout()); // (BLK_M,BLK_N) + Tensor tCcD = make_coord_tensor(tCcD_mn.layout()); // (VEC,THR_M,THR_N) // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate auto residue_cD = shape_MN - cD_mn(_0{}); // (m,n) auto residue_tCcD = shape_MN - tCcD_mn(_0{}); // (m,n) diff --git a/include/cutlass/epilogue/collective/sm100_epilogue_array_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm100_epilogue_array_tma_warpspecialized.hpp index 0ed7d6b9..1f0a915d 100644 --- 
a/include/cutlass/epilogue/collective/sm100_epilogue_array_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/collective/sm100_epilogue_array_tma_warpspecialized.hpp @@ -286,12 +286,8 @@ public: void* workspace) { // These tensor shapes (only applicable for grouped gemm) and pointers are only used to create tensormap/tma desc. // These will be replaced with correct values before the initial tma load. - auto init_shape = repeat_like(append<4>(typename ProblemShape::UnderlyingProblemShape{}, 1), int32_t(1)); - // These tensor shapes (only applicable for grouped gemm) and pointers are only used to create tensormap/tma desc. - // These will be replaced with correct values before the initial tma load. - constexpr int tma_alignment_bits = 128; - auto init_M = tma_alignment_bits; - auto init_N = tma_alignment_bits; + auto init_M = int32_t(size<0>(CtaTileShape{})); + auto init_N = int32_t(size<1>(CtaTileShape{})); auto init_L = 1; InternalStrideC stride_c; @@ -745,8 +741,8 @@ public: Tensor cD_mn = local_tile(mD_crd, take<0,2>(cta_tile_mnk), make_coord(m_coord, n_coord)); // (CTA_M,CTA_N) Tensor tTR_cD_mn = thread_t2r.partition_D(flat_divide(cD_mn, EpilogueTile{})); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) // Relative coordinate tensors (static) - Tensor cD = make_counting_tensor(cD_mn.layout()); // (CTA_M,CTA_N) - Tensor tTR_cD = make_counting_tensor(tTR_cD_mn.layout()); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + Tensor cD = make_coord_tensor(cD_mn.layout()); // (CTA_M,CTA_N) + Tensor tTR_cD = make_coord_tensor(tTR_cD_mn.layout()); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate auto residue_cD = make_coord(M,N) - cD_mn(_0{}); // (m,n) auto residue_tTR_cD = make_coord(M,N) - tTR_cD_mn(_0{}); // (m,n) @@ -786,8 +782,8 @@ public: [[maybe_unused]] bool reverse_epi_n = ReuseTmem && acc_pipe_consumer_state.phase() == 0; static_assert(not (ReuseTmem && AccumulatorPipeline::Stages != 1), "Tmem 
reuse requires 1 accumulator stage"); - // Predication for TMA store (one warp issues TMA store) - bool issue_tma_store = warp_idx == 0; + // Predication for TMA store (a single thread from one warp issues TMA store) + bool issue_tma_store = (warp_idx == 0) && cute::elect_one_sync(); // In the reuse smem configuration we have StagesC smem buffers and at most StagesD committed TMA stores in flight. // The TMA store pipeline producer acquire returns when at most StagesD-1 committed stores are in-flight, so we can @@ -1118,8 +1114,8 @@ public: Tensor cD_mn = local_tile(mD_crd, take<0,2>(cta_tile_mnk), make_coord(m_coord, n_coord)); // (CTA_M,CTA_N) Tensor tTR_cD_mn = thread_t2r.partition_D(flat_divide(cD_mn, EpilogueTile{})); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) // Relative coordinate tensors (static) - Tensor cD = make_counting_tensor(cD_mn.layout()); // (CTA_M,CTA_N) - Tensor tTR_cD = make_counting_tensor(tTR_cD_mn.layout()); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + Tensor cD = make_coord_tensor(cD_mn.layout()); // (CTA_M,CTA_N) + Tensor tTR_cD = make_coord_tensor(tTR_cD_mn.layout()); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate auto residue_cD = make_coord(M,N) - cD_mn(_0{}); // (m,n) auto residue_tTR_cD = make_coord(M,N) - tTR_cD_mn(_0{}); // (m,n) diff --git a/include/cutlass/epilogue/collective/sm100_epilogue_nosmem.hpp b/include/cutlass/epilogue/collective/sm100_epilogue_nosmem.hpp index c2172798..e32cdfa4 100644 --- a/include/cutlass/epilogue/collective/sm100_epilogue_nosmem.hpp +++ b/include/cutlass/epilogue/collective/sm100_epilogue_nosmem.hpp @@ -606,8 +606,8 @@ public: // Construct the EVT consumer callbacks auto residue_cD = make_coord(M,N) - cD(_0{}); auto residue_tTR_cD = make_coord(M,N) - tTR_cD(_0{}); - Tensor cD_ = make_counting_tensor(cD.layout()); - Tensor tTR_cD_ = make_counting_tensor(tTR_cD.layout()); + Tensor cD_ = make_coord_tensor(cD.layout()); + Tensor 
tTR_cD_ = make_coord_tensor(tTR_cD.layout()); constexpr bool RefSrc = false; Tensor mC = make_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), params.dC); diff --git a/include/cutlass/epilogue/collective/sm100_epilogue_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm100_epilogue_tma_warpspecialized.hpp index 3f445bf5..412a4b7b 100644 --- a/include/cutlass/epilogue/collective/sm100_epilogue_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/collective/sm100_epilogue_tma_warpspecialized.hpp @@ -695,8 +695,8 @@ public: Tensor cD_mn = local_tile(mD_crd, take<0,2>(cta_tile_mnk), make_coord(m_coord, n_coord)); // (CTA_M,CTA_N) Tensor tTR_cD_mn = thread_t2r.partition_D(flat_divide(cD_mn, EpilogueTile{})); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) // Relative coordinate tensors (static) - Tensor cD = make_counting_tensor(cD_mn.layout()); // (CTA_M,CTA_N) - Tensor tTR_cD = make_counting_tensor(tTR_cD_mn.layout()); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + Tensor cD = make_coord_tensor(cD_mn.layout()); // (CTA_M,CTA_N) + Tensor tTR_cD = make_coord_tensor(tTR_cD_mn.layout()); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate auto residue_cD = make_coord(M,N) - cD_mn(_0{}); // (m,n) auto residue_tTR_cD = make_coord(M,N) - tTR_cD_mn(_0{}); // (m,n) @@ -1065,8 +1065,8 @@ public: Tensor cD_mn = local_tile(mD_crd, take<0,2>(cta_tile_mnk), make_coord(m_coord, n_coord)); // (CTA_M,CTA_N) Tensor tTR_cD_mn = thread_t2r.partition_D(flat_divide(cD_mn, EpilogueTile{})); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) // Relative coordinate tensors (static) - Tensor cD = make_counting_tensor(cD_mn.layout()); // (CTA_M,CTA_N) - Tensor tTR_cD = make_counting_tensor(tTR_cD_mn.layout()); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + Tensor cD = make_coord_tensor(cD_mn.layout()); // (CTA_M,CTA_N) + Tensor tTR_cD = make_coord_tensor(tTR_cD_mn.layout()); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) // Subtract the global "bottom 
right" corner from the local "top left" corner to get the max relative coordinate auto residue_cD = make_coord(M,N) - cD_mn(_0{}); // (m,n) auto residue_tTR_cD = make_coord(M,N) - tTR_cD_mn(_0{}); // (m,n) diff --git a/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp index 41c95f16..af53a1c6 100644 --- a/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp @@ -304,11 +304,9 @@ public: [[maybe_unused]] void* workspace) { // These tensor shapes (only applicable for grouped gemm) and pointers are only used to create tensormap/tma desc. // These will be replaced with correct values before the initial tma load. - auto init_shape = repeat_like(append<4>(typename ProblemShape::UnderlyingProblemShape{}, 1), int32_t(1)); - constexpr int tma_alignment_bits = 128; - auto init_M = tma_alignment_bits; - auto init_N = tma_alignment_bits; - auto init_L = get<3>(init_shape); + auto init_M = int32_t(size<0>(CtaTileMNK{})); + auto init_N = int32_t(size<1>(CtaTileMNK{})); + auto init_L = 1; static_assert(!is_im2col_C and !is_im2col_D, "Im2Col not supported on C or D"); @@ -324,8 +322,6 @@ public: auto problem_shape_MNKL = append<4>(problem_shape.get_host_problem_shape(0), 1); init_M = get<0>(problem_shape_MNKL); init_N = get<1>(problem_shape_MNKL); - init_L = get<3>(problem_shape_MNKL); - stride_c = args.dC; stride_d = args.dD; } @@ -755,8 +751,8 @@ public: Tensor cD_mn = local_tile(mD_crd, take<0,2>(CtaTileMNK{}), make_coord(m_coord, n_coord)); // (CTA_M,CTA_N) Tensor tRS_cD_mn = thread_r2s.partition_S(flat_divide(cD_mn, EpilogueTile{})); // (R2S,R2S_M,R2S_N,EPI_M,EPI_N) // Relative coordinate tensors (static) - Tensor cD = make_counting_tensor(cD_mn.layout()); // (CTA_M,CTA_N) - Tensor tRS_cD = make_counting_tensor(tRS_cD_mn.layout()); // 
(R2S,R2S_M,R2S_N,EPI_M,EPI_N) + Tensor cD = make_coord_tensor(cD_mn.layout()); // (CTA_M,CTA_N) + Tensor tRS_cD = make_coord_tensor(tRS_cD_mn.layout()); // (R2S,R2S_M,R2S_N,EPI_M,EPI_N) // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate auto residue_cD = make_coord(M,N) - cD_mn(_0{}); // (m,n) auto residue_tRS_cD = make_coord(M,N) - tRS_cD_mn(_0{}); // (m,n) @@ -803,8 +799,8 @@ public: // to ensure visibility of smem reads/writes to threads or TMA unit auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; - // Predication for TMA store (one warp issues TMA store) - bool issue_tma_store = (thread_idx / NumThreadsPerWarp) == 0; + // Predication for TMA store (a single thread from one warp issues TMA store) + bool issue_tma_store = ((thread_idx / NumThreadsPerWarp) == 0) && cute::elect_one_sync(); // In the reuse smem configuration we have StagesC smem buffers and at most StagesD committed TMA stores in flight. 
// The TMA store pipeline producer acquire returns when at most StagesD-1 committed stores are in-flight, so we can diff --git a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp index f244fafa..062b9a8b 100644 --- a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp @@ -662,8 +662,8 @@ public: } }(); // Relative coordinate tensors (static) - Tensor cD = make_counting_tensor(cD_mn.layout()); // (CTA_M,CTA_N) - Tensor tRS_cD = make_counting_tensor(tRS_cD_mn.layout()); // (R2S,R2S_M,R2S_N,EPI_M,EPI_N) + Tensor cD = make_coord_tensor(cD_mn.layout()); // (CTA_M,CTA_N) + Tensor tRS_cD = make_coord_tensor(tRS_cD_mn.layout()); // (R2S,R2S_M,R2S_N,EPI_M,EPI_N) // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate auto residue_cD = make_coord(M,N) - cD_mn(_0{}); // (m,n) auto residue_tRS_cD = make_coord(M,N) - tRS_cD_mn(_0{}); // (m,n) diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h index c3abfdff..265a75ee 100644 --- a/include/cutlass/epilogue/thread/activation.h +++ b/include/cutlass/epilogue/thread/activation.h @@ -496,13 +496,29 @@ template <> struct HardSwish { using T = float; static const bool kIsHeavy = false; + static constexpr float kOneSixth = 0.16666667f; CUTLASS_HOST_DEVICE T operator()(T const &x) const { minimum mn; maximum mx; T relu6 = mn(mx(x + T(3), T(0)), T(6)); - return x * relu6 * 0.16666667f; + return x * relu6 * kOneSixth; + } +}; + +template <> +struct HardSwish { + using T = cutlass::half_t; + static const bool kIsHeavy = false; + static constexpr float kOneSixth = 0.16666667f; + + CUTLASS_HOST_DEVICE + T operator()(T const &x) const { + minimum mn; + maximum mx; + T relu6 = mn(mx(x + T(3), T(0)), T(6)); + return x * relu6 * 
T(kOneSixth); } }; @@ -524,22 +540,6 @@ struct HardSwish > { } }; -template -struct HardSwish > { - using T = half_t; - static const bool kIsHeavy = false; - - CUTLASS_HOST_DEVICE - Array operator()(Array const &value) const { - minimum > mn; - maximum > mx; - multiplies > mul; - plus > add; - - return mul(mul(mn(mx(add(value, T(3)), T(0)), T(6)), value), T(0.16666667f)); - } -}; - template using ScaledHardSwish = Scale>; diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h index 5a9a9888..628a8077 100644 --- a/include/cutlass/functional.h +++ b/include/cutlass/functional.h @@ -722,7 +722,7 @@ struct has_unqualified_conj : cutlass::platform::false_type template struct has_unqualified_conj< T, - decltype(conj(cutlass::platform::declval()), void()) + decltype(static_cast(conj(cutlass::platform::declval())), void()) > : cutlass::platform::true_type {}; diff --git a/include/cutlass/gemm/collective/builders/sm100_blockwise_umma_builder.inl b/include/cutlass/gemm/collective/builders/sm100_blockwise_umma_builder.inl index dc99eb3d..8617e883 100644 --- a/include/cutlass/gemm/collective/builders/sm100_blockwise_umma_builder.inl +++ b/include/cutlass/gemm/collective/builders/sm100_blockwise_umma_builder.inl @@ -129,7 +129,10 @@ auto sm100_make_simt_gmem_tiled_copy_SFA() { using ScaleCopyTypeA = cute::uint_byte_t; using SmemScalingCopyAtomA = Copy_Atom, Element>; constexpr int ElementsPerSFACopy = static_cast(sizeof(ScaleCopyTypeA) / sizeof(Element)); - return make_tiled_copy(SmemScalingCopyAtomA{}, Layout>{}, Layout>>{}); + return make_tiled_copy( + SmemScalingCopyAtomA{}, + Layout>{}, // 32 threads + Layout, Int>>, Stride>>{}); } else { using SmemScalingCopyAtomA = Copy_Atom, Element>; @@ -138,9 +141,8 @@ auto sm100_make_simt_gmem_tiled_copy_SFA() { } else { // we expect scale Ks per tile to be small - constexpr int LeadingScalesPerTileSFA = ScaleKsPerTile; using SmemScalingCopyAtomA = Copy_Atom, Element>; - return make_tiled_copy(SmemScalingCopyAtomA{}, 
Layout>>{}, Layout>{}); + return make_tiled_copy(SmemScalingCopyAtomA{}, Layout>{}, Layout>{}); } } @@ -161,7 +163,10 @@ auto sm100_make_simt_gmem_tiled_copy_SFB() { using ScaleCopyTypeB = cute::uint_byte_t; using SmemScalingCopyAtomB = Copy_Atom, Element>; constexpr int ElementsPerSFBCopy = static_cast(sizeof(ScaleCopyTypeB) / sizeof(Element)); - return make_tiled_copy(SmemScalingCopyAtomB{}, Layout>{}, Layout>>{}); + return make_tiled_copy( + SmemScalingCopyAtomB{}, + Layout>{}, // 32 threads + Layout, Int>>, Stride>>{}); } else { using SmemScalingCopyAtomB = Copy_Atom, Element>; @@ -170,9 +175,8 @@ auto sm100_make_simt_gmem_tiled_copy_SFB() { } else { // we expect scale Ks per tile to be small - constexpr int LeadingScalesPerTileSFB = ScaleKsPerTile; using SmemScalingCopyAtomB = Copy_Atom, Element>; - return make_tiled_copy(SmemScalingCopyAtomB{}, Layout>>{}, Layout>{}); + return make_tiled_copy(SmemScalingCopyAtomB{}, Layout>{}, Layout>{}); } } diff --git a/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl b/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl index b6c489da..c75af3ac 100644 --- a/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl +++ b/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl @@ -260,7 +260,9 @@ struct CollectiveBuilder< GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0; - static constexpr int KernelSmemCarveout = static_cast(TensorMapStorage); + static constexpr size_t SchedulerPipelineStorage = cute::is_pointer_v> ? 
+ sizeof(cutlass::PipelineDetail::PipelineAsyncSharedStorage<8>) : 0; + static constexpr int KernelSmemCarveout = static_cast(TensorMapStorage + SchedulerPipelineStorage); static constexpr int Sm90ReducedSmemCapacityBytes = detail::sm90_smem_capacity_bytes - KernelSmemCarveout; static constexpr int PipelineStages = detail::compute_stage_count_or_override(TensorMapStorage); + static constexpr size_t SchedulerPipelineStorage = cute::is_pointer_v> ? + sizeof(cutlass::PipelineDetail::PipelineAsyncSharedStorage<8>) : 0; + static constexpr int KernelSmemCarveout = static_cast(TensorMapStorage + SchedulerPipelineStorage); static constexpr int Sm90ReducedSmemCapacityBytes = detail::sm90_smem_capacity_bytes - KernelSmemCarveout; static constexpr int PipelineStages = IsMixedInput ? @@ -570,7 +574,9 @@ struct CollectiveBuilder< GmmaMajorB, ElementB, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0; - static constexpr int KernelSmemCarveout = static_cast(TensorMapStorage); + static constexpr size_t SchedulerPipelineStorage = cute::is_pointer_v> ? + sizeof(cutlass::PipelineDetail::PipelineAsyncSharedStorage<8>) : 0; + static constexpr int KernelSmemCarveout = static_cast(TensorMapStorage + SchedulerPipelineStorage); static constexpr int Sm90ReducedSmemCapacityBytes = detail::sm90_smem_capacity_bytes - KernelSmemCarveout; static constexpr int PipelineStages = detail::compute_stage_count_or_override(TensorMapStorage) + static_cast(TileSchedulerCarveout); + static constexpr size_t SchedulerPipelineStorage = cute::is_pointer_v> ? 
+ sizeof(cutlass::PipelineDetail::PipelineAsyncSharedStorage<8>) : 0; + static constexpr int KernelSmemCarveout = static_cast(TensorMapStorage + SchedulerPipelineStorage); static constexpr int ScaleMsPerTile = size<0>(TileShape_MNK{}) / ScaleGranularityM; static constexpr int ScaleNsPerTile = size<1>(TileShape_MNK{}) / ScaleGranularityN; diff --git a/include/cutlass/gemm/collective/sm100_blockscaled_mma_array_warpspecialized.hpp b/include/cutlass/gemm/collective/sm100_blockscaled_mma_array_warpspecialized.hpp index bdc877bd..b51d1256 100644 --- a/include/cutlass/gemm/collective/sm100_blockscaled_mma_array_warpspecialized.hpp +++ b/include/cutlass/gemm/collective/sm100_blockscaled_mma_array_warpspecialized.hpp @@ -427,10 +427,9 @@ struct CollectiveMma< cutlass::KernelHardwareInfo const& hw_info = cutlass::KernelHardwareInfo{}) { // These tensor shapes (only applicable for grouped gemm) and pointers are only used to create tensormap/tma desc. // These will be replaced with correct values before the initial tma load. 
- constexpr int tma_alignment_bits = 128; - auto init_M = tma_alignment_bits; - auto init_N = tma_alignment_bits; - auto init_K = tma_alignment_bits; + auto init_M = int32_t(size<0>(TileShape{})); + auto init_N = int32_t(size<1>(TileShape{})); + auto init_K = int32_t(size<2>(TileShape{})); auto init_L = 1; // Tensor pointers will be fixed before the first access diff --git a/include/cutlass/gemm/collective/sm100_mma_array_warpspecialized_blockwise_scaling.hpp b/include/cutlass/gemm/collective/sm100_mma_array_warpspecialized_blockwise_scaling.hpp index bb05b52a..8fc171e8 100644 --- a/include/cutlass/gemm/collective/sm100_mma_array_warpspecialized_blockwise_scaling.hpp +++ b/include/cutlass/gemm/collective/sm100_mma_array_warpspecialized_blockwise_scaling.hpp @@ -190,8 +190,14 @@ struct CollectiveMma< using TransformB = TransformB_; using ArchTag = typename DispatchPolicy::ArchTag; - static constexpr int AlignmentSFA = GmemTiledCopySFA::AtomNumVal::value * sizeof(typename GmemTiledCopySFA::ValType) / sizeof(ElementAccumulator); - static constexpr int AlignmentSFB = GmemTiledCopySFB::AtomNumVal::value * sizeof(typename GmemTiledCopySFB::ValType) / sizeof(ElementAccumulator); + static constexpr int CopyAlignmentSFA = GmemTiledCopySFA::AtomNumVal::value * sizeof(typename GmemTiledCopySFA::ValType) / sizeof(ElementAccumulator); + static constexpr int CopyAlignmentSFB = GmemTiledCopySFB::AtomNumVal::value * sizeof(typename GmemTiledCopySFB::ValType) / sizeof(ElementAccumulator); + + static constexpr int AlignmentSFA = CopyAlignmentSFA * (GmemTiledCopySFA::AtomNumVal::value > 1 ? + (size<0,1>(InternalLayoutSFA{}.stride()) == 1 ? ScaleGranularityM : ScaleGranularityK) : 1); + static constexpr int AlignmentSFB = CopyAlignmentSFB * (GmemTiledCopySFB::AtomNumVal::value > 1 ? + (size<0,1>(InternalLayoutSFB{}.stride()) == 1 ? 
ScaleGranularityN : ScaleGranularityK) : 1); + using MainloopABPipeline = cutlass::PipelineTmaUmmaAsync< DispatchPolicy::Stages, @@ -522,8 +528,8 @@ struct CollectiveMma< auto [M,N,K,L] = problem_shape_MNKL; implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(M,K,L), InternalStrideA{}); implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(N,K,L), InternalStrideB{}); - implementable_sf = implementable_sf && cutlass::detail::check_alignment(ScaleConfig::tile_atom_to_shape_SFA(problem_shape_MNKL)); - implementable_sf = implementable_sf && cutlass::detail::check_alignment(ScaleConfig::tile_atom_to_shape_SFB(problem_shape_MNKL)); + implementable_sf = implementable_sf && cutlass::detail::check_alignment(ScaleConfig::tile_atom_to_shape_SFA(problem_shape_MNKL)); + implementable_sf = implementable_sf && cutlass::detail::check_alignment(ScaleConfig::tile_atom_to_shape_SFB(problem_shape_MNKL)); if (!implementable_sf) { CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for Scale Factors.\n"); } diff --git a/include/cutlass/gemm/collective/sm100_mma_warpspecialized_blockwise_scaling.hpp b/include/cutlass/gemm/collective/sm100_mma_warpspecialized_blockwise_scaling.hpp index c8a5367a..d86c58be 100644 --- a/include/cutlass/gemm/collective/sm100_mma_warpspecialized_blockwise_scaling.hpp +++ b/include/cutlass/gemm/collective/sm100_mma_warpspecialized_blockwise_scaling.hpp @@ -201,8 +201,14 @@ struct CollectiveMma< AtomThrShapeMNK>; using AccumulatorPipelineState = typename AccumulatorPipeline::PipelineState; - static constexpr int AlignmentSFA = GmemTiledCopySFA::AtomNumVal::value * sizeof(typename GmemTiledCopySFA::ValType) / sizeof(ElementAccumulator); - static constexpr int AlignmentSFB = GmemTiledCopySFB::AtomNumVal::value * sizeof(typename GmemTiledCopySFB::ValType) / sizeof(ElementAccumulator); + static constexpr int CopyAlignmentSFA = GmemTiledCopySFA::AtomNumVal::value 
* sizeof(typename GmemTiledCopySFA::ValType) / sizeof(ElementAccumulator); + static constexpr int CopyAlignmentSFB = GmemTiledCopySFB::AtomNumVal::value * sizeof(typename GmemTiledCopySFB::ValType) / sizeof(ElementAccumulator); + + static constexpr int AlignmentSFA = CopyAlignmentSFA * (GmemTiledCopySFA::AtomNumVal::value > 1 ? + (size<0,1>(LayoutSFA{}.stride()) == 1 ? ScaleGranularityM : ScaleGranularityK) : 1); + static constexpr int AlignmentSFB = CopyAlignmentSFB * (GmemTiledCopySFB::AtomNumVal::value > 1 ? + (size<0,1>(LayoutSFB{}.stride()) == 1 ? ScaleGranularityN : ScaleGranularityK) : 1); + // Two arrivals per thread in the warp (1 arrival and 1 arrival through cp.async.mbarrier) static constexpr int NumMainloopSFProducerThreadEvents = 64; @@ -566,8 +572,8 @@ struct CollectiveMma< CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); } - bool implementable_sf = cutlass::detail::check_alignment(args.layout_SFA); - implementable_sf = implementable_sf && cutlass::detail::check_alignment(args.layout_SFB); + bool implementable_sf = cutlass::detail::check_alignment(args.layout_SFA); + implementable_sf = implementable_sf && cutlass::detail::check_alignment(args.layout_SFB); if (!implementable_sf) { CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for Scale Factors.\n"); diff --git a/include/cutlass/gemm/collective/sm100_mma_warpspecialized_mixed_input.hpp b/include/cutlass/gemm/collective/sm100_mma_warpspecialized_mixed_input.hpp deleted file mode 100644 index f8d1a00a..00000000 --- a/include/cutlass/gemm/collective/sm100_mma_warpspecialized_mixed_input.hpp +++ /dev/null @@ -1,824 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
- * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - **************************************************************************************************/ - - - - -#pragma once -#include - -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/dispatch_policy.hpp" -#include "cutlass/pipeline/pipeline.hpp" -#include "cutlass/numeric_conversion.h" -#include "cutlass/detail/sm100_tmem_helper.hpp" -#include "cutlass/detail/cluster.hpp" - -#include "cute/algorithm/functional.hpp" -#include "cute/arch/cluster_sm90.hpp" -#include "cute/atom/mma_atom.hpp" -#include "cute/atom/copy_atom.hpp" -#include "cute/algorithm/gemm.hpp" -#include "cute/tensor_predicate.hpp" -#include "cute/arch/mma_sm100.hpp" -#include "cutlass/trace.h" -#include "cutlass/kernel_hardware_info.hpp" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::gemm::collective { -using namespace cute; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -// WarpSpecialized Mainloop for Mixed Input Kernels -template < - int Load2TransformPipelineStageCount_, - int Transform2MmaPipelineStageCount_, - int SchedulerPipelineStageCount_, - int AccumulatorPipelineStageCount_, - class ClusterShape, - class TileShape_, - class ElementA_, - class StrideA_, - class ElementB_, - class StrideB_, - class TiledMma_, - class GmemTiledCopyA_, - class SmemLayoutAtomsA_, - class CopyAtomsA_, - class TransformA_, - class GmemTiledCopyB_, - class SmemLayoutAtomsB_, - class CopyAtomsB_, - class TransformB_> -struct CollectiveMma< - MainloopSm100TmaUmmaWarpSpecializedMixedInput< - Load2TransformPipelineStageCount_, - Transform2MmaPipelineStageCount_, - SchedulerPipelineStageCount_, - AccumulatorPipelineStageCount_, - ClusterShape>, - TileShape_, - ElementA_, - StrideA_, - ElementB_, - StrideB_, - TiledMma_, - GmemTiledCopyA_, - SmemLayoutAtomsA_, - CopyAtomsA_, - TransformA_, - GmemTiledCopyB_, - SmemLayoutAtomsB_, - CopyAtomsB_, - 
TransformB_> -{ - // - // Type Aliases - // - - // Determine MMA type: MMA_1SM vs MMA_2SM - using AtomThrShapeMNK = Shape(typename TiledMma_::ThrLayoutVMNK{})), _1, _1>; - using DispatchPolicy = MainloopSm100TmaUmmaWarpSpecializedMixedInput< - Load2TransformPipelineStageCount_, - Transform2MmaPipelineStageCount_, - SchedulerPipelineStageCount_, - AccumulatorPipelineStageCount_, - ClusterShape>; - using TileShape = TileShape_; - using TiledMma = TiledMma_; - static constexpr bool IsDynamicCluster = not cute::is_static_v; - using CtaShape_MNK = decltype(shape_div(TileShape{}, AtomThrShapeMNK{})); - - // Define A and B block shapes for reduced size TMA_LOADs - using CtaShapeA_MK = decltype(partition_shape_A(TiledMma{}, make_shape(size<0>(TileShape{}), size<2>(TileShape{})))); - using CtaShapeB_NK = decltype(partition_shape_B(TiledMma{}, make_shape(size<1>(TileShape{}), size<2>(TileShape{})))); - - using ElementA = ElementA_; - using StrideA = StrideA_; - using ElementAMma = typename TiledMma::ValTypeA; - - static constexpr int IsSubbyteA = cute::sizeof_bits_v < 8; - using TmaElementA = cute::conditional_t; - - using ElementB = ElementB_; - using StrideB = StrideB_; - using ElementBMma = typename TiledMma::ValTypeB; - - using StrideScale = cute::Stride, int64_t, int64_t>; - using NonVoidStrideScale = cute::conditional_t< - cute::is_void_v, cute::Stride<_1, int64_t, int64_t>, StrideScale>; - - using ElementAccumulator = typename TiledMma::ValTypeC; - using GmemTiledCopyA = GmemTiledCopyA_; - using GmemTiledCopyB = GmemTiledCopyB_; - using SmemLayoutAtomsA = SmemLayoutAtomsA_; - using SmemLayoutAtomsB = SmemLayoutAtomsB_; - using CopyAtomsA = CopyAtomsA_; - using CopyAtomsB = CopyAtomsB_; - using TransformA = TransformA_; - using TransformB = TransformB_; - using ArchTag = typename DispatchPolicy::ArchTag; - - static_assert(sizeof(ElementA) < 2, "Matrix to be scaled should be provided in A otherwise input is not supported"); - static_assert(cute::is_same_v || 
cute::is_same_v || cute::is_same_v, "Compute type A should be cutlass::bfloat16_t or cutlass::half_t or cutlass::float_e4m3_t"); - - using Load2TransformPipeline = cutlass::PipelineTmaTransformAsync< - DispatchPolicy::Load2TransformPipelineStageCount, - AtomThrShapeMNK>; - using Load2TransformPipelineState = typename Load2TransformPipeline::PipelineState; - - using Transform2MmaPipeline = cutlass::PipelineUmmaConsumerAsync< - DispatchPolicy::Transform2MmaPipelineStageCount, - AtomThrShapeMNK>; - using Transform2MmaPipelineState = typename Transform2MmaPipeline::PipelineState; - - using Mma2AccumPipeline = cutlass::PipelineUmmaAsync< - DispatchPolicy::Schedule::AccumulatorPipelineStageCount, - AtomThrShapeMNK>; - using Mma2AccumPipelineState = typename Mma2AccumPipeline::PipelineState; - - // Thread Counts - static constexpr uint32_t NumAccumThreads = 128; //Maintains compatibility with input_transform kernel - static constexpr uint32_t NumTransformationThreads = 128; - - // Get the Algorithm parameters - constexpr static int AccumulatorPipelineStageCount = DispatchPolicy::Schedule::AccumulatorPipelineStageCount; - constexpr static int StagesPerTile = size<2>(CtaShapeA_MK{}); - - using SmemLayoutAtomA = typename SmemLayoutAtomsA::InputLayoutAtom; - using SmemLayoutAtomACompute = typename SmemLayoutAtomsA::ComputeLayoutAtom; - using SmemLayoutAtomB = typename SmemLayoutAtomsB::InputLayoutAtom; - using SmemLayoutAtomBCompute = typename SmemLayoutAtomsB::ComputeLayoutAtom; - - using InputCopyAtomA = typename CopyAtomsA::InputCopyAtom; - using ComputeCopyAtomA = typename CopyAtomsA::ComputeCopyAtom; - using InputCopyAtomB = typename CopyAtomsB::InputCopyAtom; - using ComputeCopyAtomB = typename CopyAtomsB::ComputeCopyAtom; - - static_assert(rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); - static_assert(((size<0,0>(CtaShapeA_MK{}) * size<1>(CtaShapeA_MK{})) % size<0>(SmemLayoutAtomACompute{})) == 0, "SmemLayoutAtomCompute must evenly divide tile 
shape."); - static_assert(((size<0,1>(CtaShapeA_MK{}) * size<2>(CtaShapeA_MK{})) % size<1>(SmemLayoutAtomACompute{})) == 0, "SmemLayoutAtomCompute must evenly divide tile shape."); - - static_assert(rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); - static_assert(((size<0,0>(CtaShapeB_NK{}) * size<1>(CtaShapeB_NK{})) % size<0>(SmemLayoutAtomBCompute{})) == 0, "SmemLayoutAtomCompute must evenly divide tile shape."); - static_assert(((size<0,1>(CtaShapeB_NK{}) * size<2>(CtaShapeB_NK{})) % size<1>(SmemLayoutAtomBCompute{})) == 0, "SmemLayoutAtomCompute must evenly divide tile shape."); - - // Tile along K mode first before tiling over MN. PIPE mode last as usual. - // This maximizes TMA boxes due to better smem-K vectorization, reducing total issued TMAs. - using SmemLayoutA = decltype(UMMA::tile_to_mma_shape( - SmemLayoutAtomA{}, - append(CtaShapeA_MK{}, Int{}), - (cute::conditional_t(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}))); - - using SmemLayoutACompute = decltype(UMMA::tile_to_mma_shape( - SmemLayoutAtomACompute{}, - append(CtaShapeA_MK{}, Int{}), - (cute::conditional_t(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}))); - - using SmemLayoutB = decltype(UMMA::tile_to_mma_shape( - SmemLayoutAtomB{}, - append(CtaShapeB_NK{}, Int{}), - (cute::conditional_t(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}))); - - static_assert(DispatchPolicy::Load2TransformPipelineStageCount >= 2 && DispatchPolicy::Load2TransformPipelineStageCount >= 2, - "Specialization requires Stages set to value 2 or more."); - static_assert((cute::is_base_of::value || - cute::is_base_of::value ) && - cute::is_base_of::value, - "MMA atom must A operand from SMEM or TMEM and B operand from SMEM for this mainloop."); - static_assert((cute::is_same_v || cute::is_same_v), - "GmemTiledCopyA - invalid TMA copy atom specified."); - static_assert((cute::is_same_v || cute::is_same_v), - "GmemTiledCopyB - invalid TMA copy atom specified."); - - struct PipelineStorage { - using Load2TransformPipelineStorage = 
typename Load2TransformPipeline::SharedStorage; - alignas(16) Load2TransformPipelineStorage load2transform_pipeline; - using Transform2MmaPipelineStorage = typename Transform2MmaPipeline::SharedStorage; - alignas(16) Transform2MmaPipelineStorage transform2mma_pipeline; - using Mma2AccumPipelineStorage = typename Mma2AccumPipeline::SharedStorage; - alignas(16) Mma2AccumPipelineStorage mma2accum_pipeline; - }; - - struct SharedStorage { - struct TensorStorage : cute::aligned_struct<128, _0> { - - struct TensorStorageUntransformed { - cute::ArrayEngine> smem_A; - cute::ArrayEngine> smem_B; - }; - - struct TensorStorageTransformedAinSmem { - alignas(1024) cute::ArrayEngine> smem_ACompute; - alignas(1024) cute::ArrayEngine> smem_BCompute; - }; - - union TensorStorageTransformedAinTmem { - alignas(1024) cute::ArrayEngine smem_ACompute; // No smem_ACompute - alignas(1024) cute::ArrayEngine> smem_BCompute; - }; - - using TensorStorageTransformed = cute::conditional_t< - cute::is_base_of::value, - TensorStorageTransformedAinSmem, - TensorStorageTransformedAinTmem>; - - TensorStorageUntransformed input; - TensorStorageTransformed compute; - } tensors; - - PipelineStorage pipeline; - }; - using TensorStorage = typename SharedStorage::TensorStorage; - - // Different from other GEMM kernels, both CTAs should be aware of loads. 
Both CTAs will work on - // loaded input A and B matrices to convert the data type - static constexpr uint32_t TmaTransactionBytes = - cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * size<2>(SmemLayoutA{}) * static_cast(sizeof_bits::value))+ - cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * size<2>(SmemLayoutB{}) * static_cast(sizeof_bits::value)); - - // Host side kernel arguments - struct Arguments { - ElementA const* ptr_A{nullptr}; - StrideA dA{}; - ElementB const* ptr_B{nullptr}; - StrideB dB{}; - }; - - // Device side kernel params - struct Params { - using ClusterLayout_VMNK = decltype(tiled_divide(make_layout(conditional_return(make_shape(uint32_t(0), uint32_t(0), Int<1>{}), ClusterShape{})), - make_tile(typename TiledMma::AtomThrID{}))); - - using TMA_A = decltype(make_tma_atom_A_sm100( - GmemTiledCopyA{}, - make_tensor(static_cast(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}), - SmemLayoutA{}(_,_,_,cute::Int<0>{}), - TileShape{}, - TiledMma{}, - ClusterLayout_VMNK{}) - ); - using TMA_B = decltype(make_tma_atom_B_sm100( - GmemTiledCopyB{}, - make_tensor(static_cast(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}), - SmemLayoutB{}(_,_,_,cute::Int<0>{}), - TileShape{}, - TiledMma{}, - ClusterLayout_VMNK{}) - ); - TMA_A tma_load_a; - TMA_B tma_load_b; - TMA_A tma_load_a_fallback; - TMA_B tma_load_b_fallback; - dim3 cluster_shape_fallback; - }; - - CUTLASS_DEVICE - CollectiveMma(Params const& params, ClusterShape cluster_shape, uint32_t block_rank_in_cluster) - : cluster_shape_(cluster_shape) - , block_rank_in_cluster_(block_rank_in_cluster) { - if constexpr (IsDynamicCluster) { - const bool is_fallback_cluster = (cute::size<0>(cluster_shape_) == params.cluster_shape_fallback.x && - cute::size<1>(cluster_shape_) == params.cluster_shape_fallback.y); - observed_tma_load_a_ = is_fallback_cluster ? ¶ms.tma_load_a_fallback : ¶ms.tma_load_a; - observed_tma_load_b_ = is_fallback_cluster ? 
¶ms.tma_load_b_fallback : ¶ms.tma_load_b; - } - else { - observed_tma_load_a_ = ¶ms.tma_load_a; - observed_tma_load_b_ = ¶ms.tma_load_b; - } - } - - template - static constexpr Params - to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cutlass::KernelHardwareInfo const& hw_info = cutlass::KernelHardwareInfo{}) { - (void) workspace; - - // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK) - auto problem_shape_MNKL = append<4>(problem_shape, 1); - auto [M,N,K,L] = problem_shape_MNKL; - - Tensor tensor_a = make_tensor(args.ptr_A, make_layout(make_shape(M,K,L), args.dA)); - Tensor tensor_b = make_tensor(args.ptr_B, make_layout(make_shape(N,K,L), args.dB)); - - auto cluster_shape = cutlass::detail::select_cluster_shape(ClusterShape{}, hw_info.cluster_shape); - // Cluster layout for TMA construction - auto cluster_layout_vmnk = tiled_divide(make_layout(cluster_shape), make_tile(typename TiledMma::AtomThrID{})); - - auto cluster_shape_fallback = cutlass::detail::select_cluster_shape(ClusterShape{}, hw_info.cluster_shape_fallback); - // Cluster layout for TMA construction - auto cluster_layout_vmnk_fallback = tiled_divide(make_layout(cluster_shape_fallback), make_tile(typename TiledMma::AtomThrID{})); - - typename Params::TMA_A tma_load_a = make_tma_atom_A_sm100( - GmemTiledCopyA{}, - tensor_a, - SmemLayoutA{}(_,_,_,cute::Int<0>{}), - TileShape{}, - TiledMma{}, - cluster_layout_vmnk); - - typename Params::TMA_B tma_load_b = make_tma_atom_B_sm100( - GmemTiledCopyB{}, - tensor_b, - SmemLayoutB{}(_,_,_,cute::Int<0>{}), - TileShape{}, - TiledMma{}, - cluster_layout_vmnk); - - typename Params::TMA_A tma_load_a_fallback = make_tma_atom_A_sm100( - GmemTiledCopyA{}, - tensor_a, - SmemLayoutA{}(_,_,_,cute::Int<0>{}), - TileShape{}, - TiledMma{}, - cluster_layout_vmnk_fallback); - - typename Params::TMA_B tma_load_b_fallback = make_tma_atom_B_sm100( - GmemTiledCopyB{}, - tensor_b, - 
SmemLayoutB{}(_,_,_,cute::Int<0>{}), - TileShape{}, - TiledMma{}, - cluster_layout_vmnk_fallback); - - return { - tma_load_a, - tma_load_b, - tma_load_a_fallback, - tma_load_b_fallback, - hw_info.cluster_shape_fallback - }; - } - - template - static bool - can_implement( - ProblemShape const& problem_shape, - [[maybe_unused]] Arguments const& args) { - constexpr int tma_alignment_bits = 128; - auto problem_shape_MNKL = append<4>(problem_shape, 1); - auto [M,N,K,L] = problem_shape_MNKL; - - bool implementable = true; - constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits::value; - implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(M,K,L), StrideA{}); - constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits::value; - implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(N,K,L), StrideB{}); - - if (!implementable) { - CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); - } - return implementable; - } - - /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance - CUTLASS_DEVICE static void - prefetch_tma_descriptors(Params const& params) { - if constexpr (IsDynamicCluster) { - dim3 cs = cute::cluster_shape(); - const bool is_fallback_cluster = (cs.x == params.cluster_shape_fallback.x && cs.y == params.cluster_shape_fallback.y); - if (is_fallback_cluster) { - cute::prefetch_tma_descriptor(params.tma_load_a_fallback.get_tma_descriptor()); - cute::prefetch_tma_descriptor(params.tma_load_b_fallback.get_tma_descriptor()); - } - else { - cute::prefetch_tma_descriptor(params.tma_load_a.get_tma_descriptor()); - cute::prefetch_tma_descriptor(params.tma_load_b.get_tma_descriptor()); - } - } - else { - cute::prefetch_tma_descriptor(params.tma_load_a.get_tma_descriptor()); - cute::prefetch_tma_descriptor(params.tma_load_b.get_tma_descriptor()); - } - } - - /// Construct A 
Single Stage's Accumulator Shape - CUTLASS_DEVICE auto - partition_accumulator_shape() { - auto acc_shape = partition_shape_C(TiledMma{}, take<0,2>(TileShape{})); // ((MMA_TILE_M,MMA_TILE_N),MMA_M,MMA_N) - - return acc_shape; - } - - /// Produce the inputs to the transform threads by loading inputs from gmem -> smem - template < - class GTensorA, class GTensorB, - class GTensorPartitionedA, class GTensorPartitionedB, - class STensorA, class STensorB, - class TileCoordMNKL, - class KTileIterator - > - CUTLASS_DEVICE auto - load( - Params const& params, - Load2TransformPipeline pipeline, - Load2TransformPipelineState load2xform_pipeline_state, - cute::tuple const& load_inputs, - TileCoordMNKL const& cta_coord_mnkl, - KTileIterator k_tile_iter, int k_tile_count) { - - auto [unused_gA, unused_gB, - tAgA_mkl, tBgB_nkl, tAsA, tBsB, - mcast_mask_a, mcast_mask_b] = load_inputs; - - // slice out the work coord from tiled tensors - Tensor tAgA = tAgA_mkl(_, get<0>(cta_coord_mnkl) / size(typename TiledMma::AtomThrID{}), _, get<3>(cta_coord_mnkl)); - Tensor tBgB = tBgB_nkl(_, get<1>(cta_coord_mnkl), _, get<3>(cta_coord_mnkl)); - - uint32_t skip_wait = (k_tile_count <= 0); - auto pipeline_flag = pipeline.producer_try_acquire(load2xform_pipeline_state, skip_wait); - - // Issue the Mainloop loads - CUTLASS_PRAGMA_NO_UNROLL - for ( ; k_tile_count > 0; --k_tile_count) { - // LOCK mainloop_load2xform_pipeline_state for _writing_ - pipeline.producer_acquire(load2xform_pipeline_state, pipeline_flag); - int write_stage = load2xform_pipeline_state.index(); - - using BarrierType = typename Load2TransformPipeline::ProducerBarrierType; - BarrierType* tma_barrier = pipeline.producer_get_barrier(load2xform_pipeline_state); - - // Advance mainloop_pipe - ++load2xform_pipeline_state; - - skip_wait = (k_tile_count <= 1); - pipeline_flag = pipeline.producer_try_acquire(load2xform_pipeline_state, skip_wait); - - copy(observed_tma_load_a_->with(*tma_barrier, mcast_mask_a), tAgA(_,*k_tile_iter), 
tAsA(_,write_stage)); - copy(observed_tma_load_b_->with(*tma_barrier, mcast_mask_b), tBgB(_,*k_tile_iter), tBsB(_,write_stage)); - - ++k_tile_iter; - } - - return cute::make_tuple(load2xform_pipeline_state, k_tile_iter); - - } - - /// Set up the data needed by this collective for load. - /// Returned tuple must contain at least two elements, with the first two elements being: - /// gA_mkl - The tiled tensor for input A - /// gB_nkl - The tiled tensor for input B - // Other inputs needed for load(): partitioned AB tensors for gmem and smem, and mcast masks - template - CUTLASS_DEVICE auto - load_init( - ProblemShape_MNKL const& problem_shape_MNKL, - Params const& params, - TensorStorage& shared_storage) const { - auto [gA_mkl, gB_nkl] = tile_input_tensors(params, problem_shape_MNKL); - - ThrMMA cta_mma = TiledMma{}.get_slice(blockIdx.x % size(typename TiledMma::AtomThrID{})); - - Tensor tCgA_mkl = cta_mma.partition_A(gA_mkl); // (MMA, MMA_M, MMA_K, m, k, l) - Tensor tCgB_nkl = cta_mma.partition_B(gB_nkl); // (MMA, MMA_N, MMA_K, n, k, l) - - Tensor sA = make_tensor(make_smem_ptr(shared_storage.input.smem_A.begin()), SmemLayoutA{}); // (MMA,MMA_M,MMA_K,PIPE) - Tensor sB = make_tensor(make_smem_ptr(shared_storage.input.smem_B.begin()), SmemLayoutB{}); // (MMA,MMA_N,MMA_K,PIPE) - - // Define the CTA-in-cluster Layout and Coord - Layout cta_layout_mnk = make_layout(cluster_shape_); - Layout cta_layout_vmnk = tiled_divide(cta_layout_mnk, make_tile(typename TiledMma::AtomThrID{})); - auto cta_coord_vmnk = cta_layout_vmnk.get_flat_coord(block_rank_in_cluster_); - - // Project the cta_layout for tma_a along the n-modes - auto [tAgA_mkl, tAsA] = tma_partition(*observed_tma_load_a_, - get<2>(cta_coord_vmnk), make_layout(size<2>(cta_layout_vmnk)), - group_modes<0,3>(sA), group_modes<0,3>(tCgA_mkl)); - - // Project the cta_layout for tma_b along the m-modes - auto [tBgB_nkl, tBsB] = tma_partition(*observed_tma_load_b_, - get<1>(cta_coord_vmnk), 
make_layout(size<1>(cta_layout_vmnk)), - group_modes<0,3>(sB), group_modes<0,3>(tCgB_nkl)); - - // TMA Multicast Masks - uint16_t mcast_mask_a = create_tma_multicast_mask<2>(cta_layout_vmnk, cta_coord_vmnk); - uint16_t mcast_mask_b = create_tma_multicast_mask<1>(cta_layout_vmnk, cta_coord_vmnk); - - - return cute::make_tuple( - gA_mkl, gB_nkl, // for scheduler - tAgA_mkl, tBgB_nkl, tAsA, tBsB, // for input tensor values - mcast_mask_a, mcast_mask_b); // multicast masks - } - - template< - class KTileIterator, class Accumulator, - class GTensorA, class DstCopyA, class SrcTensorA, class DstTensorA, - class GTensorB - > - CUTLASS_DEVICE auto - transform( - Load2TransformPipeline load2transform_pipeline, - Load2TransformPipelineState load2transform_pipeline_consumer_state, - Transform2MmaPipeline transform2mma_pipeline, - Transform2MmaPipelineState transform2mma_pipeline_producer_state, - Accumulator accumulators, - cute::tuple input_operands, - KTileIterator k_tile_iter, int k_tile_count) { - - cutlass::arch::NamedBarrier transform_bar(NumTransformationThreads, cutlass::arch::ReservedNamedBarriers::TransformBarrier); - - // tAsA : (Copy,#Copy),MMA_Rest,MMA_M_Rest,MMA_K_Rest, SmemStages (In SMEM) - // tAdA : (Copy,#Copy),MMA_Rest,MMA_M_Rest,MMA_K_Rest, NumComputeMtxs, SmemStages (In SMEM or TMEM) - // tBsB : (Copy,#Copy),MMA_Rest,MMA_N_Rest,MMA_K_Rest, SmemStages (In SMEM) - // tBsB : (Copy,#Copy),MMA_Rest,MMA_N_Rest,MMA_K_Rest, NumComputeMtxs, SmemStages (In SMEM) - auto [unused_tAgA, dst_copy_A, tAsA, tAsACompute, - unused_tBgB] = input_operands; - - // Create the tensors in registers - auto tArA = make_tensor(tAsA(_,_,_,_,0).shape()); - auto tArACompute = make_tensor(tAsA(_,_,_,_,0).shape()); - - auto tArA_x2 = recast>(tArA); - auto tArACompute_x2 = recast>(tArACompute); - - - uint32_t skip_wait = (k_tile_count <= 0); - auto load2transform_flag = load2transform_pipeline.consumer_try_wait(load2transform_pipeline_consumer_state, skip_wait); - auto transform2mma_flag = 
transform2mma_pipeline.producer_try_acquire(transform2mma_pipeline_producer_state, skip_wait); - - CUTLASS_PRAGMA_NO_UNROLL - for ( ; k_tile_count > 0; --k_tile_count) { - - load2transform_pipeline.consumer_wait(load2transform_pipeline_consumer_state, load2transform_flag); - transform2mma_pipeline.producer_acquire(transform2mma_pipeline_producer_state, transform2mma_flag); - - int load2transform_consumer_index = load2transform_pipeline_consumer_state.index(); // read stage - int transform2mma_producer_index = transform2mma_pipeline_producer_state.index(); //write stage - - auto curr_load2transform_pipeline_consumer_state = load2transform_pipeline_consumer_state; - auto curr_transform2mma_pipeline_producer_state = transform2mma_pipeline_producer_state; - - // Copy the input A matrix from SMEM - copy(AutoVectorizingCopy{}, tAsA(_,_,_,_,load2transform_consumer_index), tArA); - //Transform Input A stored in registers - cute::transform(tArA_x2, tArACompute_x2, cutlass::NumericArrayConverter::convert); - //Transformed A stored in TMEM - copy(dst_copy_A, tArACompute, tAsACompute(_,_,_,_,transform2mma_producer_index)); - - // Loads from SMEM are done. 
Signal the mainloop load as early as possible - transform_bar.sync(); - load2transform_pipeline.consumer_release(curr_load2transform_pipeline_consumer_state); - - // fence for SMEM writes - cutlass::arch::fence_view_async_shared(); - if constexpr (is_tmem::value) { - // fence for TMEM writes if A operand is coming from TMEM - cutlass::arch::fence_view_async_tmem_store(); - } - - // Let the MMA know we are done transforming - transform2mma_pipeline.producer_commit(curr_transform2mma_pipeline_producer_state); - - // Next pipeline stage - ++load2transform_pipeline_consumer_state; - ++transform2mma_pipeline_producer_state; - - skip_wait = (k_tile_count <= 1); - // Peek the next pipeline stage's barriers - load2transform_flag = load2transform_pipeline.consumer_try_wait(load2transform_pipeline_consumer_state, skip_wait); - transform2mma_flag = transform2mma_pipeline.producer_try_acquire(transform2mma_pipeline_producer_state, skip_wait); - } - return cute::make_tuple(load2transform_pipeline_consumer_state, transform2mma_pipeline_producer_state); - } - - template - CUTLASS_DEVICE auto - transform_init( - Params const& params, - ProblemShape_MNKL const& problem_shape_MNKL, - Accumulator accumulators, - TensorStorage& shared_storage) { - - auto [gA_mkl, gB_nkl] = tile_input_tensors(params, problem_shape_MNKL); - - Tensor sA_orig = make_tensor(make_smem_ptr(shared_storage.input.smem_A.begin()), SmemLayoutA{}); - Tensor sA = as_position_independent_swizzle_tensor(sA_orig); //tCsA - Tensor sACompute = make_tensor(make_smem_ptr(shared_storage.compute.smem_ACompute.begin()), SmemLayoutACompute{}); //tCsACompute - - // Map input, compute, and fragment tensors to - // Copy strategies and partitioned tensors. These will become the input - // operands of the transform function. 
Depending on MMA atom type, the - // operands can reside in SMEM or TMEM - auto setup_copy_ops = [&] ( - auto tensor_input, - auto input_copy_atom, - auto tensor_compute, - auto make_fragment, - auto compute_copy_atom) constexpr { - - auto fragment_compute = make_fragment(tensor_compute); //tCrA(Compute) - if constexpr (cute::is_tmem>::value) { - // For M=128 with 2CTA MMA atoms, the TMEM tensor for A has a duplicated allocation. - // Instead of allocation a 64x16 TMEM tensor, we have a 128x16 allocation - // See: TmemAllocMode::Duplicated. - Tensor tensor_input2x = [&] () constexpr { - if constexpr (decltype(size<0,0>(fragment_compute) == Int<128>{} && size<0,0>(tensor_input) == Int<64>{})::value) { - return make_tensor(tensor_input.data(), - logical_product(tensor_input.layout(), - make_tile(make_tile(Layout<_2,_0>{},_),_,_,_))); // ((128,16),m,k,PIPE) - } - else { - return tensor_input; - } - }(); //tCsA_2x - - fragment_compute.data() = accumulators.data().get() + cutlass::detail::find_tmem_tensor_col_offset(accumulators); //tCrA.data() - auto reg2tmem_tiled_copy = make_tmem_copy(compute_copy_atom, fragment_compute(_,_,0,0)); - auto thr_reg2tmem_tiled_copy = reg2tmem_tiled_copy.get_slice(threadIdx.x % NumTransformationThreads); - auto partitioned_tensor_input = thr_reg2tmem_tiled_copy.partition_S(tensor_input2x); - auto partitioned_tensor_compute = thr_reg2tmem_tiled_copy.partition_D(fragment_compute); - return cute::make_tuple(reg2tmem_tiled_copy, partitioned_tensor_input, partitioned_tensor_compute); - } - else { - auto tensor_compute_ind_sw = as_position_independent_swizzle_tensor(tensor_compute); - auto reg2smem_tiled_copy = make_cotiled_copy(compute_copy_atom, Layout, Stride< _8,_1>>{}, - tensor_compute(_,_,0,0).layout()); - - auto thr_reg2smem_tiled_copy = reg2smem_tiled_copy.get_slice(threadIdx.x % NumTransformationThreads); - auto partitioned_tensor_input = thr_reg2smem_tiled_copy.partition_S(tensor_input); - auto partitioned_tensor_compute = 
thr_reg2smem_tiled_copy.partition_D(tensor_compute_ind_sw); - - return cute::make_tuple(AutoVectorizingCopy{}, partitioned_tensor_input, partitioned_tensor_compute); - } - }; - - auto [dst_copy_A, tAsA, tAsACompute] = - setup_copy_ops(sA, InputCopyAtomA{}, sACompute, [&](auto &arg) {return TiledMma::make_fragment_A(arg);}, ComputeCopyAtomA{}); - - return cute::make_tuple(gA_mkl, dst_copy_A, tAsA, tAsACompute, - gB_nkl); - } - - /// Perform a collective-scoped matrix multiply-accumulate - /// Consumer Perspective - template < - class FrgEngine, class FrgLayout, - class TensorA, class TensorB - > - CUTLASS_DEVICE auto - mma( - Transform2MmaPipeline transform2mma_pipeline, - Transform2MmaPipelineState transform2mma_pipeline_consumer_state, - Mma2AccumPipeline mma2accum_pipeline, - Mma2AccumPipelineState mma2accum_pipeline_producer_state, - cute::Tensor const& accumulators, - cute::tuple const& input_operands, - int k_tile_count - ) { - TiledMma tiled_mma; - - auto curr_transform2mma_pipeline_consumer_state = transform2mma_pipeline_consumer_state; - auto next_transform2mma_pipeline_consumer_state = transform2mma_pipeline_consumer_state; - - uint32_t skip_wait = (k_tile_count <= 0); - auto transform2mma_flag = transform2mma_pipeline.consumer_try_wait(next_transform2mma_pipeline_consumer_state, skip_wait); - ++next_transform2mma_pipeline_consumer_state; - - - // tCrA : (MMA), MMA_M, MMA_K, SmemStage (In SMEM or TMEM) - // We use SMEM stages to match #buffers in Load <-> Convert - // tCrB : (MMA), MMA_N, MMA_K, SmemStages (In SMEM) - auto const [tCrA, tCrB] = input_operands; - - int remaining_accum_promotions = k_tile_count; - uint32_t mma2accum_skip_wait = (remaining_accum_promotions <= 0); - auto mma2accum_flag = mma2accum_pipeline.producer_try_acquire(mma2accum_pipeline_producer_state, mma2accum_skip_wait); - mma2accum_pipeline.producer_acquire(mma2accum_pipeline_producer_state, mma2accum_flag); - auto curr_mma2accum_pipeline_producer_state = 
mma2accum_pipeline_producer_state; - ++mma2accum_pipeline_producer_state; - - // No accumulator addition to the k_tile initially - tiled_mma.accumulate_ = UMMA::ScaleOut::Zero; - - CUTLASS_PRAGMA_NO_UNROLL - for ( ; k_tile_count > 0; --k_tile_count) { - - transform2mma_pipeline.consumer_wait(curr_transform2mma_pipeline_consumer_state, transform2mma_flag); - - int transform2mma_pipeline_consumer_state_index = curr_transform2mma_pipeline_consumer_state.index(); //read_stage - int mma2accum_pipeline_producer_state_index = curr_mma2accum_pipeline_producer_state.index(); //write_stage - - CUTLASS_PRAGMA_UNROLL - for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { - - auto tCtC = accumulators(_,_,_,mma2accum_pipeline_producer_state_index); - - auto tCrA0 = tCrA(_,_,_,transform2mma_pipeline_consumer_state_index); - auto tCrB0 = tCrB(_,_,_,transform2mma_pipeline_consumer_state_index); - - cute::gemm(tiled_mma, tCrA0(_,_,k_block), tCrB0(_,_,k_block), tCtC); // A[0]*B[0] - tiled_mma.accumulate_ = UMMA::ScaleOut::One; - - } - - transform2mma_pipeline.consumer_release(curr_transform2mma_pipeline_consumer_state); - - skip_wait = (k_tile_count <= 1); - transform2mma_flag = transform2mma_pipeline.consumer_try_wait(next_transform2mma_pipeline_consumer_state, skip_wait); - - curr_transform2mma_pipeline_consumer_state = next_transform2mma_pipeline_consumer_state; - ++next_transform2mma_pipeline_consumer_state; - } - - mma2accum_pipeline.producer_commit(curr_mma2accum_pipeline_producer_state); - - return cute::make_tuple(curr_transform2mma_pipeline_consumer_state, mma2accum_pipeline_producer_state); - } - - template - CUTLASS_DEVICE auto - mma_init(cute::Tensor const& accumulators, TensorStorage& shared_storage) const { - TiledMma tiled_mma; - - auto get_tCrA = [&] () constexpr { - if constexpr (cute::is_base_of::value) { - Tensor sACompute = make_tensor(make_smem_ptr(shared_storage.compute.smem_ACompute.begin()), SmemLayoutACompute{}); - return 
tiled_mma.make_fragment_A(sACompute); - } - else { - auto tCrA = tiled_mma.make_fragment_A(shape(SmemLayoutACompute{})); - tCrA.data() = accumulators.data().get() + cutlass::detail::find_tmem_tensor_col_offset(accumulators); - return tCrA; - } - }; - - Tensor tCrA = get_tCrA(); - Tensor sB = make_tensor(make_smem_ptr(shared_storage.input.smem_B.begin()), SmemLayoutB{}); - Tensor tCrB = tiled_mma.make_fragment_B(sB); - return cute::make_tuple(tCrA, tCrB); - } - - template - CUTLASS_DEVICE auto - accum_init(cute::Tensor const& accumulators, TmemCopyAtom tmem_cp_atom, EpilogueTile epilogue_tile) { - return accumulators; - } - -private: - template - CUTLASS_DEVICE - constexpr auto - tile_input_tensors(Params const& params, ProblemShape_MNKL const& problem_shape_MNKL) const { - using X = cute::Underscore; - // Separate out problem shape for convenience - auto [M,N,K,L] = problem_shape_MNKL; - - // Represent the full tensors -- get these from TMA - Tensor mA_mkl = observed_tma_load_a_->get_tma_tensor(make_shape(M,K,L)); - Tensor mB_nkl = observed_tma_load_b_->get_tma_tensor(make_shape(N,K,L)); - - // Tile the tensors and defer the slice - Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{}); - Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{}); - - return cute::make_tuple(gA_mkl, gB_nkl); - } - - typename Params::TMA_A const* observed_tma_load_a_ = nullptr; - typename Params::TMA_B const* observed_tma_load_b_ = nullptr; - - ClusterShape cluster_shape_; - uint32_t block_rank_in_cluster_; -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace cutlass::gemm::collective - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/collective/sm120_blockscaled_mma_array_tma.hpp b/include/cutlass/gemm/collective/sm120_blockscaled_mma_array_tma.hpp index e7f43fd5..fc7bc988 
100755 --- a/include/cutlass/gemm/collective/sm120_blockscaled_mma_array_tma.hpp +++ b/include/cutlass/gemm/collective/sm120_blockscaled_mma_array_tma.hpp @@ -353,13 +353,12 @@ struct CollectiveMma< (void) workspace; // These tensor shapes (only applicable for grouped gemm) and pointers are only used to create tensormap/tma desc. // These will be replaced with correct values before the initial tma load. - auto init_shape = repeat_like(typename ProblemShape::UnderlyingProblemShape{}, int32_t(1)); - constexpr int tma_alignment_bits = 128; - auto init_M = tma_alignment_bits; - auto init_N = tma_alignment_bits; - auto init_K = tma_alignment_bits; + auto init_M = int32_t(size<0>(TileShape{})); + auto init_N = int32_t(size<1>(TileShape{})); + auto init_K = int32_t(size<2>(TileShape{})); + auto init_L = 1; + // Batches/Groups are managed by using appropriate pointers to input matrices - const uint32_t init_L = 1; TmaInternalElementA const* ptr_A_first_batch = nullptr; TmaInternalElementB const* ptr_B_first_batch = nullptr; ElementSF const* ptr_SFA_first_batch = nullptr; @@ -1058,11 +1057,11 @@ struct CollectiveMma< Tensor tensor_sfb = make_tensor(ptr_SF, mainloop_params.layout_SFB[next_group]); - cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_a, tensor_a, + cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_a, tensor_a, prob_shape_A, prob_stride_A); cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_sfa, tensor_sfa, prob_shape_SFA, prob_stride_SFA); - cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_b, tensor_b, + cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_b, tensor_b, prob_shape_B, prob_stride_B); cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_sfb, tensor_sfb, prob_shape_SFB, prob_stride_SFB); @@ -1091,7 +1090,7 @@ struct CollectiveMma< prob_stride_B); cute::tma_descriptor_replace_dims_strides_in_shared_mem(shared_tensormaps.smem_tensormap_SFB, prob_shape_SFB, - 
prob_stride_SFB); + prob_stride_SFB); } // The entire warp must call this function collectively (that is, the instructions are aligned) @@ -1122,6 +1121,10 @@ struct CollectiveMma< tensormaps_cp_fence_release ( TensorMapStorage& shared_tensormaps, cute::tuple const& input_tensormaps) { + if (cute::elect_one_sync()) { + cute::tma_desc_commit_group(); + cute::tma_desc_wait_group(); + } // Entire warp must do this (i.e. it's aligned) tma_descriptor_cp_fence_release(get<0>(input_tensormaps), shared_tensormaps.smem_tensormap_A); tma_descriptor_cp_fence_release(get<1>(input_tensormaps), shared_tensormaps.smem_tensormap_B); diff --git a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input.hpp b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input.hpp index 25a68671..dc30ae56 100644 --- a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input.hpp +++ b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input.hpp @@ -1330,6 +1330,10 @@ public: tensormaps_cp_fence_release ( TensorMapStorage& shared_tensormaps, cute::tuple const& input_tensormaps) { + if (cute::elect_one_sync()) { + cute::tma_desc_commit_group(); + cute::tma_desc_wait_group(); + } // Entire warp must do this (i.e. 
it's aligned) tma_descriptor_cp_fence_release(get<0>(input_tensormaps), shared_tensormaps.smem_tensormap_A); tma_descriptor_cp_fence_release(get<1>(input_tensormaps), shared_tensormaps.smem_tensormap_B); diff --git a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp index 8b06d58b..da16d118 100644 --- a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp +++ b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp @@ -529,10 +529,9 @@ struct CollectiveMma< // Prologue GMMAs int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); - assert(k_tile_count >= 1); tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; warpgroup_fence_operand(accum); - { + if (k_tile_count > 0) { // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); pipeline.consumer_wait(smem_pipe_read, barrier_token); @@ -739,6 +738,10 @@ struct CollectiveMma< tensormaps_cp_fence_release ( TensorMapStorage& shared_tensormaps, cute::tuple const& input_tensormaps) { + if (cute::elect_one_sync()) { + cute::tma_desc_commit_group(); + cute::tma_desc_wait_group(); + } // Entire warp must do this (i.e. 
it's aligned) tma_descriptor_cp_fence_release(get<0>(input_tensormaps), shared_tensormaps.smem_tensormap_A); tma_descriptor_cp_fence_release(get<1>(input_tensormaps), shared_tensormaps.smem_tensormap_B); diff --git a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized_fp8.hpp b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized_fp8.hpp index 676382ad..53348dff 100644 --- a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized_fp8.hpp +++ b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized_fp8.hpp @@ -747,6 +747,10 @@ struct CollectiveMma< tensormaps_cp_fence_release ( TensorMapStorage& shared_tensormaps, cute::tuple const& input_tensormaps) { + if (cute::elect_one_sync()) { + cute::tma_desc_commit_group(); + cute::tma_desc_wait_group(); + } // Entire warp must do this (i.e. it's aligned) tma_descriptor_cp_fence_release(get<0>(input_tensormaps), shared_tensormaps.smem_tensormap_A); tma_descriptor_cp_fence_release(get<1>(input_tensormaps), shared_tensormaps.smem_tensormap_B); diff --git a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp index 27c03af4..6cec1862 100644 --- a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp +++ b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp @@ -135,6 +135,10 @@ struct CollectiveMma< static constexpr int ScalePromotionInterval = ScaleGranularityK / size<2>(typename TiledMma::AtomShape_MNK{}); static_assert(ScalePromotionInterval % 4 == 0, "ScalePromotionInterval must be a multiple of 4."); + static_assert(ScalePromotionInterval >= size<2>(TileShape{}) / tile_size<2>(TiledMma{}), + "ScalePromotionInterval must be greater than or equal to the number of stages of the MMA atom."); + 
static_assert(ScalePromotionInterval % (size<2>(TileShape{}) / tile_size<2>(TiledMma{})) == 0, + "ScalePromotionInterval must be a multiple of the number of stages of the MMA atom."); static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM; static constexpr int ScaleNsPerTile = size<1>(TileShape{}) / ScaleGranularityN; @@ -811,31 +815,37 @@ struct CollectiveMma< Tensor tCrSFB = make_tensor_like(tCsSFB(_, _, _, _0{})); // (MMA,MMA_M,MMA_N) // Prologue GMMAs - int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); - assert(k_tile_count >= 1); tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + + // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) + auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); + pipeline.consumer_wait(smem_pipe_read, barrier_token); + // fence_operand(); GmmaFP8Accumulation accumulation(accum, ScalePromotionInterval, size<2>(tCrA)); - { - // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) - auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); - pipeline.consumer_wait(smem_pipe_read, barrier_token); - if constexpr (ScalePromotionInterval != 4) { - if (accumulation.prepare_if_needed()) { - tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; - } - } - else { - // Always zero out the accumulator for finest granularity - tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; - } + warpgroup_fence_operand(accumulation()); + + { int read_stage = smem_pipe_read.index(); // Load per block scale values from shared memory to registers copy(tCsSFA(_,_,_,make_coord(_0{},read_stage)), tCrSFA); copy(tCsSFB(_,_,_,make_coord(_0{},read_stage)), tCrSFB); + warpgroup_fence_operand(accumulation()); + warpgroup_arrive(); + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + // (V,M) x (V,N) => (V,M,N) + cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), 
tCrB(_,_,k_block,read_stage), accumulation()); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + warpgroup_commit_batch(); + warpgroup_fence_operand(accumulation()); + + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { tCrSFA(_0{}) = tCrSFA(_0{}) * tCrSFB(_0{}); } @@ -854,16 +864,9 @@ struct CollectiveMma< } } - warpgroup_arrive(); - // Unroll the K mode manually to set scale D to 1 - CUTLASS_PRAGMA_UNROLL - for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { - // (V,M) x (V,N) => (V,M,N) - cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation()); - tiled_mma.accumulate_ = GMMA::ScaleOut::One; - } - - warpgroup_commit_batch(); + warpgroup_wait<0>(); + ++smem_pipe_read; + barrier_token = pipeline.consumer_try_wait(smem_pipe_read); // Block scale the accumulators with reg tensor `tCrSFA` and `tCrSFB` if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { @@ -879,90 +882,16 @@ struct CollectiveMma< if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile > 1) { scale_if_needed(accumulation, tCrSFA, tCrSFB); } - - ++smem_pipe_read; } warpgroup_fence_operand(accumulation()); - CUTLASS_PRAGMA_UNROLL - for (int k_tile_prologue = prologue_mma_count - 1; k_tile_prologue > 0; --k_tile_prologue) - { - // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) - auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); - pipeline.consumer_wait(smem_pipe_read, barrier_token); - - if constexpr (ScalePromotionInterval != 4) { - if (accumulation.prepare_if_needed()) { - tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; - } - } - else { - // Always zero out the accumulator for finest granularity - tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; - } - - int read_stage = smem_pipe_read.index(); - // Load per block scale values from shared memory to registers - copy(tCsSFA(_,_,_,make_coord(_0{},read_stage)), tCrSFA); - copy(tCsSFB(_,_,_,make_coord(_0{},read_stage)), tCrSFB); - - if 
constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { - tCrSFA(_0{}) = tCrSFA(_0{}) * tCrSFB(_0{}); - } - if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile == 1) { - ElementBlockScale scale_b = tCrSFB(_0{}); - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(filter_zeros(tCrSFA)); i++) { - filter_zeros(tCrSFA)(i) = filter_zeros(tCrSFA)(i) * scale_b; - } - } - if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile > 1) { - ElementBlockScale scale_a = tCrSFA(_0{}); - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(filter_zeros(tCrSFB)); i++) { - filter_zeros(tCrSFB)(i) = filter_zeros(tCrSFB)(i) * scale_a; - } - } - - warpgroup_arrive(); - // Unroll the K mode manually to set scale D to 1 - CUTLASS_PRAGMA_UNROLL - for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { - // (V,M) x (V,N) => (V,M,N) - cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation()); - tiled_mma.accumulate_ = GMMA::ScaleOut::One; - } - - warpgroup_commit_batch(); - - // Block scale the accumulators with reg tensor `tCrSFA` and `tCrSFB` - if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { - ElementBlockScale scale_ab = tCrSFA(_0{}); - scale_if_needed(accumulation, scale_ab); - } - if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile == 1) { - scale_if_needed(accumulation, tCrSFA); - } - if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile > 1) { - scale_if_needed(accumulation, tCrSFB); - } - if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile > 1) { - scale_if_needed(accumulation, tCrSFA, tCrSFB); - } - - ++smem_pipe_read; - } - - warpgroup_fence_operand(accumulation()); // Mainloop GMMAs - k_tile_count -= prologue_mma_count; + k_tile_count--; CUTLASS_PRAGMA_NO_UNROLL - for ( ; k_tile_count > 0; --k_tile_count) + for ( ; k_tile_count > 1; --k_tile_count) { - // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) - auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); 
pipeline.consumer_wait(smem_pipe_read, barrier_token); // @@ -970,29 +899,10 @@ struct CollectiveMma< // int read_stage = smem_pipe_read.index(); - // fence_operand(); + // Load per block scale values from shared memory to registers (at most twice per block along M and/or N) - copy(tCsSFA(_,_,_,make_coord(_0{},read_stage)), tCrSFA); - copy(tCsSFB(_,_,_,make_coord(_0{},read_stage)), tCrSFB); - - if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { - tCrSFA(_0{}) = tCrSFA(_0{}) * tCrSFB(_0{}); - } - if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile == 1) { - ElementBlockScale scale_b = tCrSFB(_0{}); - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(filter_zeros(tCrSFA)); i++) { - filter_zeros(tCrSFA)(i) = filter_zeros(tCrSFA)(i) * scale_b; - } - } - if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile > 1) { - ElementBlockScale scale_a = tCrSFA(_0{}); - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(filter_zeros(tCrSFB)); i++) { - filter_zeros(tCrSFB)(i) = filter_zeros(tCrSFB)(i) * scale_a; - } - } - + copy(tCsSFA(_,_,_,make_coord(_0{}, read_stage)), tCrSFA); + copy(tCsSFB(_,_,_,make_coord(_0{}, read_stage)), tCrSFB); if constexpr (ScalePromotionInterval != 4) { if (accumulation.prepare_if_needed()) { @@ -1005,7 +915,6 @@ struct CollectiveMma< } warpgroup_fence_operand(accumulation()); - warpgroup_arrive(); // Unroll the K mode manually to set scale D to 1 CUTLASS_PRAGMA_UNROLL @@ -1017,9 +926,31 @@ struct CollectiveMma< warpgroup_commit_batch(); /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed - warpgroup_wait(); warpgroup_fence_operand(accumulation()); + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { + tCrSFA(_0{}) = tCrSFA(_0{}) * tCrSFB(_0{}); + } + if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile == 1) { + ElementBlockScale scale_b = tCrSFB(_0{}); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(filter_zeros(tCrSFA)); i++) { + filter_zeros(tCrSFA)(i) = filter_zeros(tCrSFA)(i) * 
scale_b; + } + } + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile > 1) { + ElementBlockScale scale_a = tCrSFA(_0{}); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(filter_zeros(tCrSFB)); i++) { + filter_zeros(tCrSFB)(i) = filter_zeros(tCrSFB)(i) * scale_a; + } + } + + warpgroup_wait<0>(); + pipeline.consumer_release(smem_pipe_release); // Unlock previous tile + ++smem_pipe_read; + barrier_token = pipeline.consumer_try_wait(smem_pipe_read); + // Block scale the accumulators with reg tensor `tCrSFA` and `tCrSFB` if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { ElementBlockScale scale_ab = tCrSFA(_0{}); @@ -1035,13 +966,81 @@ struct CollectiveMma< scale_if_needed(accumulation, tCrSFA, tCrSFB); } - // UNLOCK smem_pipe_release, done _computing_ on it - pipeline.consumer_release(smem_pipe_release); - // Advance smem_pipe_read and smem_pipe_release - ++smem_pipe_read; ++smem_pipe_release; } + + if (k_tile_count) { + pipeline.consumer_wait(smem_pipe_read, barrier_token); + + // + // Compute on k_tile + // + + int read_stage = smem_pipe_read.index(); + // Load per block scale values from shared memory to registers (at most twice per block along M and/or N) + copy(tCsSFA(_,_,_,make_coord(_0{}, read_stage)), tCrSFA); + copy(tCsSFB(_,_,_,make_coord(_0{}, read_stage)), tCrSFB); + + if constexpr (ScalePromotionInterval != 4) { + if (accumulation.prepare_if_needed()) { + tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + } + } + else { + // Always zero out the accumulator for finest granularity + tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + } + + warpgroup_fence_operand(accumulation()); + warpgroup_arrive(); + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + // (V,M) x (V,N) => (V,M,N) + cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation()); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + warpgroup_commit_batch(); + + /// 
Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed + warpgroup_fence_operand(accumulation()); + + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { + tCrSFA(_0{}) = tCrSFA(_0{}) * tCrSFB(_0{}); + } + if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile == 1) { + ElementBlockScale scale_b = tCrSFB(_0{}); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(filter_zeros(tCrSFA)); i++) { + filter_zeros(tCrSFA)(i) = filter_zeros(tCrSFA)(i) * scale_b; + } + } + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile > 1) { + ElementBlockScale scale_a = tCrSFA(_0{}); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(filter_zeros(tCrSFB)); i++) { + filter_zeros(tCrSFB)(i) = filter_zeros(tCrSFB)(i) * scale_a; + } + } + warpgroup_wait<0>(); + pipeline.consumer_release(smem_pipe_release); // Unlock previous tile + + // Block scale the accumulators with reg tensor `tCrSFA` and `tCrSFB` + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { + ElementBlockScale scale_ab = tCrSFA(_0{}); + scale_if_needed(accumulation, scale_ab); + } + if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile == 1) { + scale_if_needed(accumulation, tCrSFA); + } + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile > 1) { + scale_if_needed(accumulation, tCrSFB); + } + if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile > 1) { + scale_if_needed(accumulation, tCrSFA, tCrSFB); + } + } if constexpr (ScalePromotionInterval != 4) { // residues only exists when granularity is not the finnest if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { @@ -1066,19 +1065,9 @@ struct CollectiveMma< /// Perform a Consumer Epilogue to release all buffers CUTLASS_DEVICE void mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) { - // Prologue GMMAs - int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); - k_tile_count -= prologue_mma_count; - - smem_pipe_release.advance(k_tile_count); - - // Wait on all GMMAs to complete - 
warpgroup_wait<0>(); - - for (int count = 0; count < prologue_mma_count; ++count) { - pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it - ++smem_pipe_release; - } + // The pipeline is not released in the first iteration + smem_pipe_release.advance(k_tile_count - 1); + pipeline.consumer_release(smem_pipe_release); } // @@ -1198,6 +1187,10 @@ struct CollectiveMma< tensormaps_cp_fence_release ( TensorMapStorage& shared_tensormaps, cute::tuple const& input_tensormaps) { + if (cute::elect_one_sync()) { + cute::tma_desc_commit_group(); + cute::tma_desc_wait_group(); + } // Entire warp must do this (i.e. it's aligned) tma_descriptor_cp_fence_release(get<0>(input_tensormaps), shared_tensormaps.smem_tensormap_A); tma_descriptor_cp_fence_release(get<1>(input_tensormaps), shared_tensormaps.smem_tensormap_B); diff --git a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp index 5b8f1059..19009d5d 100644 --- a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp +++ b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp @@ -128,6 +128,10 @@ struct CollectiveMma< static constexpr int ScalePromotionInterval = ScaleGranularityK / size<2>(typename TiledMma::AtomShape_MNK{}); static_assert(ScalePromotionInterval % 4 == 0, "ScalePromotionInterval must be a multiple of 4."); + static_assert(ScalePromotionInterval >= size<2>(TileShape{}) / tile_size<2>(TiledMma{}), + "ScalePromotionInterval must be greater than or equal to the number of stages of the MMA atom."); + static_assert(ScalePromotionInterval % (size<2>(TileShape{}) / tile_size<2>(TiledMma{})) == 0, + "ScalePromotionInterval must be a multiple of the number of stages of the MMA atom."); static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / 
ScaleGranularityM; static constexpr int ScaleNsPerTile = size<1>(TileShape{}) / ScaleGranularityN; @@ -213,7 +217,6 @@ struct CollectiveMma< StrideA dA; ElementB const* ptr_B; StrideB dB; - uint32_t mma_promotion_interval = 4; ElementBlockScale const* ptr_SFA; LayoutSFA layout_SFA; ElementBlockScale const* ptr_SFB; @@ -382,16 +385,6 @@ struct CollectiveMma< CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem size doesn't meet the minimum alignment requirements for using TMA to load scale B.\n"); } - /* MMA promotion interval should be a multiple of 4, since each mainloop iteration would issue 4 MMA instructions. */ - constexpr int pipe_k = size<2>(TileShape{}) / tile_size<2>(TiledMma{}); - if (args.mma_promotion_interval % 4 != 0 || - args.mma_promotion_interval != ScalePromotionInterval || - args.mma_promotion_interval % pipe_k != 0 || - pipe_k > args.mma_promotion_interval) { - implementable = false; - CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Argument mma_promotion_interval is invalid.\n"); - } - // We expect full tiles in K if (K % size<2>(TileShape{}) != 0) { implementable = false; @@ -1001,7 +994,7 @@ struct CollectiveMma< // Advance smem_pipe_read and smem_pipe_release ++smem_pipe_release; } - if (k_tile_count == 1) { + if (k_tile_count) { pipeline.consumer_wait(smem_pipe_read, barrier_token); // diff --git a/include/cutlass/gemm/dispatch_policy.hpp b/include/cutlass/gemm/dispatch_policy.hpp index 7f8e6f30..712fc1ba 100644 --- a/include/cutlass/gemm/dispatch_policy.hpp +++ b/include/cutlass/gemm/dispatch_policy.hpp @@ -1016,6 +1016,8 @@ struct MainloopSm100ArrayTmaUmmaWarpSpecializedFastF32 { +// Mainloop schedule for array-based TMA + template< int Stages_, int SchedulerPipelineStageCount_, diff --git a/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized.hpp b/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized.hpp index 055b56e3..78401097 100644 --- a/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized.hpp +++ 
b/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized.hpp @@ -824,9 +824,6 @@ public: } else if (is_participant.sched) { - if constexpr (IsSchedDynamicPersistent) { - cutlass::arch::wait_on_dependent_grids(); - } // Signal the epilogue warps to proceed once the prologue is complete epilogue_throttle_barrier.arrive(); @@ -837,6 +834,8 @@ public: // why this variable is needed. bool requires_clc_query = true; + cutlass::arch::wait_on_dependent_grids(); + do { if (requires_clc_query) { // Throttle CLC query to mitigate workload imbalance caused by skews among persistent workers. @@ -872,6 +871,9 @@ public: clc_pipeline.producer_tail(clc_pipe_producer_state); } else { + + cutlass::arch::wait_on_dependent_grids(); + do { auto [next_work_tile_info, increment_pipe] = scheduler.advance_to_next_work(clc_pipeline, clc_pipe_producer_state); work_tile_info = next_work_tile_info; diff --git a/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized_input_transform.hpp b/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized_input_transform.hpp index 57c00407..76432e1e 100644 --- a/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized_input_transform.hpp +++ b/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized_input_transform.hpp @@ -832,10 +832,6 @@ public: // Register reconfiguration arch::warpgroup_reg_dealloc(); - if constexpr (IsSchedDynamicPersistent) { - cutlass::arch::wait_on_dependent_grids(); - } - // Signal the epilogue warps to proceed once the prologue is complete epilogue_throttle_barrier.arrive(); @@ -845,6 +841,8 @@ public: // See comment below where this variable is updated for a description of // why this variable is needed. bool requires_clc_query = true; + + cutlass::arch::wait_on_dependent_grids(); do { if (requires_clc_query) { // Throttle CLC query to mitigate workload imbalance caused by skews among persistent workers. 
@@ -883,6 +881,7 @@ public: clc_pipeline.producer_tail(clc_pipeline_producer_state); } else { + cutlass::arch::wait_on_dependent_grids(); do { auto [next_work_tile_info, increment_pipe] = scheduler.advance_to_next_work(clc_pipeline, clc_pipeline_producer_state); work_tile_info = next_work_tile_info; diff --git a/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized_mma_transform.hpp b/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized_mma_transform.hpp index c036c0af..83eebaf5 100644 --- a/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized_mma_transform.hpp +++ b/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized_mma_transform.hpp @@ -941,6 +941,8 @@ public: // why this variable is needed. bool requires_clc_query = true; + cutlass::arch::wait_on_dependent_grids(); + do { if (requires_clc_query) { // Throttle CLC query to mitigate workload imbalance caused by skews among persistent workers. @@ -976,6 +978,7 @@ public: clc_pipeline.producer_tail(clc_pipe_producer_state); } else { + cutlass::arch::wait_on_dependent_grids(); do { auto [next_work_tile_info, increment_pipe] = scheduler.advance_to_next_work(clc_pipeline, clc_pipe_producer_state); work_tile_info = next_work_tile_info; diff --git a/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized.hpp b/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized.hpp index 222a7ad1..3989ffe3 100644 --- a/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized.hpp +++ b/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized.hpp @@ -177,7 +177,6 @@ public: // Kernel level shared memory storage struct SharedStorage { - // Barriers should be allocated in lower 8KB of SMEM for SM100 struct PipelineStorage : cute::aligned_struct<16, _1> { using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage; using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage; @@ -649,15 +648,14 @@ public: } else if (is_participant.sched) { - if 
constexpr (IsSchedDynamicPersistent) { - cutlass::arch::wait_on_dependent_grids(); - } if constexpr (IsSchedDynamicPersistent) { // Whether a new CLC query must be performed. // See comment below where this variable is updated for a description of // why this variable is needed. bool requires_clc_query = true; + cutlass::arch::wait_on_dependent_grids(); + do { if (requires_clc_query) { // Query next clcID and update producer state diff --git a/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized_input_transform.hpp b/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized_input_transform.hpp index ae712512..fcaae852 100644 --- a/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized_input_transform.hpp +++ b/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized_input_transform.hpp @@ -717,10 +717,6 @@ public: // Register reconfiguration arch::warpgroup_reg_dealloc(); - if constexpr (IsSchedDynamicPersistent) { - cutlass::arch::wait_on_dependent_grids(); - } - // Signal the epilogue warps to proceed once the prologue is complete epilogue_throttle_barrier.arrive(); @@ -729,6 +725,9 @@ public: // See comment below where this variable is updated for a description of // why this variable is needed. bool requires_clc_query = true; + + cutlass::arch::wait_on_dependent_grids(); + do { if (requires_clc_query) { // Throttle CLC query to mitigate workload imbalance caused by skews among persistent workers. 
diff --git a/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized_mma_transform.hpp b/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized_mma_transform.hpp index 1826cce9..180bda31 100644 --- a/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized_mma_transform.hpp +++ b/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized_mma_transform.hpp @@ -769,10 +769,6 @@ public: // Register reconfiguration arch::warpgroup_reg_dealloc(); - if constexpr (IsSchedDynamicPersistent) { - cutlass::arch::wait_on_dependent_grids(); - } - // Signal the epilogue warps to proceed once the prologue is complete epilogue_throttle_barrier.arrive(); @@ -783,6 +779,8 @@ public: // why this variable is needed. bool requires_clc_query = true; + cutlass::arch::wait_on_dependent_grids(); + do { if (requires_clc_query) { // Throttle CLC query to mitigate workload imbalance caused by skews among persistent workers. diff --git a/include/cutlass/gemm/kernel/sm100_sparse_gemm_tma_warpspecialized.hpp b/include/cutlass/gemm/kernel/sm100_sparse_gemm_tma_warpspecialized.hpp index a3494d33..0932f5c6 100644 --- a/include/cutlass/gemm/kernel/sm100_sparse_gemm_tma_warpspecialized.hpp +++ b/include/cutlass/gemm/kernel/sm100_sparse_gemm_tma_warpspecialized.hpp @@ -687,15 +687,14 @@ public: } else if (is_participant.sched) { - if constexpr (IsSchedDynamicPersistent) { - cutlass::arch::wait_on_dependent_grids(); - } if constexpr (IsSchedDynamicPersistent) { // Whether a new CLC query must be performed. // See comment below where this variable is updated for a description of // why this variable is needed. 
bool requires_clc_query = true; + cutlass::arch::wait_on_dependent_grids(); + do { if (requires_clc_query) { // Query next clcID and update producer state diff --git a/include/cutlass/gemm/kernel/sm120_gemm_tma_warpspecialized_cooperative_asymmetric_dma.hpp b/include/cutlass/gemm/kernel/sm120_gemm_tma_warpspecialized_cooperative_asymmetric_dma.hpp index ac57aa6d..610dfc6e 100644 --- a/include/cutlass/gemm/kernel/sm120_gemm_tma_warpspecialized_cooperative_asymmetric_dma.hpp +++ b/include/cutlass/gemm/kernel/sm120_gemm_tma_warpspecialized_cooperative_asymmetric_dma.hpp @@ -551,13 +551,12 @@ public: if (producer_warp_role == ProducerWarpRole::Warp1) { work_tile_info = scheduler.initial_work_tile_info(ClusterShape{}); - if constexpr (IsSchedDynamicPersistent) { - cutlass::arch::wait_on_dependent_grids(); - } - if constexpr (IsSchedDynamicPersistent) { bool requires_clc_query = true; TileSchedulerPipelineState scheduler_pipe_producer_state = cutlass::make_producer_start_state(); + + cutlass::arch::wait_on_dependent_grids(); + while (work_tile_info.is_valid()) { if (requires_clc_query) { // Throttle CLC query to mitigate workload imbalance caused by skews among persistent workers. diff --git a/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp b/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp index 93e9b797..4f5723da 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp @@ -153,12 +153,12 @@ public: cute::conditional_t, void, ProblemShape> // Use void for default scheduler. 
>::Scheduler; - static constexpr auto TileSchedulerStages = 8; - using TileSchedulerArguments = typename TileScheduler::Arguments; using TileSchedulerParams = typename TileScheduler::Params; using TileSchedulerResponse = typename TileSchedulerResponseGetter::Type; + static constexpr auto TileSchedulerStages = 8; + static constexpr uint32_t NumLoadWarpGroups = 1; static constexpr uint32_t NumMmaThreads = size(TiledMma{}); static constexpr uint32_t NumMmaWarpGroups = NumMmaThreads / NumThreadsPerWarpGroup; @@ -378,7 +378,6 @@ public: if (status != Status::kSuccess) { return status; } - return status; } @@ -461,6 +460,7 @@ public: return TileScheduler{params.scheduler}; } } (); + // In a warp specialized kernel, collectives expose data movement and compute operations separately CollectiveMainloop collective_mainloop; CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue); @@ -484,7 +484,7 @@ public: typename TileSchedulerPipeline::Params tile_scheduler_pipeline_params; if constexpr (cute::is_same_v) { if (warp_group_role == WarpGroupRole::Producer - && producer_warp_role == ProducerWarpRole::Scheduler) { + && producer_warp_role == ProducerWarpRole::Scheduler) { tile_scheduler_pipeline_params.role = TileSchedulerPipeline::ThreadCategory::Producer; } else { @@ -499,7 +499,6 @@ public: tile_scheduler_pipeline_params.producer_arv_count = 1; } TileSchedulerPipeline tile_scheduler_pipeline(shared_storage.pipelines.scheduler, tile_scheduler_pipeline_params); - // Mainloop Load pipeline using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline; typename MainloopPipeline::Params mainloop_pipeline_params; @@ -683,9 +682,8 @@ public: block_rank_in_cluster, shared_storage.tensors.mainloop ); - // Update starting pipeline state for the next tile - // Wait for the last TMA stage to complete loading, before issuing tensormap updates - mainloop_pipe_producer_state.advance(work_k_tile_count - 1); + // Pipeline state is only advanced if 
there are K tiles to compute + mainloop_pipe_producer_state.advance(work_k_tile_count); // Signal for the epilogue load warp to begin if (do_load_order_arrive) { @@ -706,11 +704,6 @@ public: if constexpr (IsGroupedGemmKernel) { problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(curr_batch), 1); } - // Purpose of this pipeline state is to make sure TMA loads have finished before doing descriptor updates - // Since this state is waiting for loads to finish, it must start in the inverted phase. - typename CollectiveMainloop::PipelineState mainloop_pipe_tma_consumer_state = - {mainloop_pipe_producer_state.index(), !mainloop_pipe_producer_state.phase(), mainloop_pipe_producer_state.count()}; - mainloop_pipeline.consumer_wait(mainloop_pipe_tma_consumer_state); collective_mainloop.tensormaps_perform_update( shared_storage.tensormaps.mainloop, params.mainloop, @@ -723,8 +716,6 @@ public: // Entire warp must do this (i.e. it's aligned) collective_mainloop.tensormaps_cp_fence_release(shared_storage.tensormaps.mainloop, input_tensormaps); } - // Advance the producer state for the last remaining stage that was being waited for above - mainloop_pipe_producer_state.advance(1); } while (work_tile_info.is_valid()); // Scheduler work fetch loop // Make sure all Consumer Warp Groups have been waited upon @@ -771,8 +762,8 @@ public: block_rank_in_cluster, shared_storage.tensors.mainloop ); + // Update starting pipeline state for the next tile - // Wait for the last TMA stage to complete loading, before issuing tensormap updates mainloop_pipe_producer_state.advance(work_k_tile_count); // Get next work tile @@ -790,8 +781,8 @@ public: } } } while (work_tile_info.is_valid()); // Scheduler work fetch loop - } - } + } // End of auxiliary load needed check + } // Mainloop Auxiliary Load Producer Warp End // Epilogue Producer Warp else if (producer_warp_role == ProducerWarpRole::Epilogue && collective_epilogue.is_producer_load_needed()) { int32_t const sm_idx = blockIdx.x 
+ (blockIdx.y * gridDim.x); @@ -854,6 +845,7 @@ public: wait ); } + work_tile_info = next_work_tile_info; if (increment_pipe) { ++tile_scheduler_pipe_consumer_state; @@ -917,8 +909,8 @@ public: // Converge before issuing tensormap fence release since fence is aligned __syncwarp(); - collective_epilogue.template tensormaps_cp_fence_release(shared_storage.tensormaps.epilogue, - epi_store_tensormap, + collective_epilogue.template tensormaps_cp_fence_release(shared_storage.tensormaps.epilogue, + epi_store_tensormap, consumer_warp_group_idx); } @@ -1021,7 +1013,7 @@ public: // Converge before issuing tensormap fence release since fence is aligned __syncwarp(); - collective_epilogue.template tensormaps_cp_fence_release(shared_storage.tensormaps.epilogue, + collective_epilogue.template tensormaps_cp_fence_release(shared_storage.tensormaps.epilogue, epi_store_tensormap, consumer_warp_group_idx); } diff --git a/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp b/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp index c720c215..f33f4685 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp @@ -69,7 +69,6 @@ class GemmUniversal< cute::enable_if_t> > { - // Get the type of the scheduler response. template struct TileSchedulerResponseGetter { @@ -145,7 +144,6 @@ public: TileScheduler_ >; - using TileScheduler = typename detail::TileSchedulerSelector< SchedulerTag, ArchTag, @@ -646,6 +644,8 @@ public: cutlass::arch::warpgroup_reg_dealloc(); if (producer_warp_role == ProducerWarpRole::Scheduler) { + // GroupScheduler requires a producer warp to iterate over the group infos and push + // the work tile infos to the downstream pipelines. 
if constexpr (cute::is_same_v) { do { auto [next_work_tile_info, increment_pipe] = scheduler.advance_to_next_work(tile_scheduler_pipeline, tile_scheduler_pipe_producer_state); @@ -684,7 +684,8 @@ public: bool did_batch_change = true; do { if (!TileScheduler::valid_warpgroup_in_work_tile(work_tile_info)) { - auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info, tile_scheduler_pipeline, tile_scheduler_pipe_consumer_state); + auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work( + work_tile_info, tile_scheduler_pipeline, tile_scheduler_pipe_consumer_state); work_tile_info = next_work_tile_info; if (increment_pipe) { ++tile_scheduler_pipe_consumer_state; @@ -719,9 +720,8 @@ public: block_rank_in_cluster, shared_storage.tensors.mainloop ); - // Update starting pipeline state for the next tile - // Wait for the last TMA stage to complete loading, before issuing tensormap updates - mainloop_pipe_producer_state.advance(work_k_tile_count - 1); + // Pipeline state is only advanced if there are K tiles to compute + mainloop_pipe_producer_state.advance(work_k_tile_count); // Signal for the epilogue load warp to begin if (do_load_order_arrive) { @@ -742,11 +742,6 @@ public: if constexpr (IsGroupedGemmKernel) { problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(curr_batch), 1); } - // Purpose of this pipeline state is to make sure TMA loads have finished before doing descriptor updates - // Since this state is waiting for loads to finish, it must start in the inverted phase. - typename CollectiveMainloop::PipelineState mainloop_pipe_tma_consumer_state = - {mainloop_pipe_producer_state.index(), !mainloop_pipe_producer_state.phase(), mainloop_pipe_producer_state.count()}; - mainloop_pipeline.consumer_wait(mainloop_pipe_tma_consumer_state); collective_mainloop.tensormaps_perform_update( shared_storage.tensormaps.mainloop, params.mainloop, @@ -759,8 +754,6 @@ public: // Entire warp must do this (i.e. 
it's aligned) collective_mainloop.tensormaps_cp_fence_release(shared_storage.tensormaps.mainloop, input_tensormaps); } - // Advance the producer state for the last remaining stage that was being waited for above - mainloop_pipe_producer_state.advance(1); } while (work_tile_info.is_valid()); // Scheduler work fetch loop // Make sure all Consumer Warp Groups have been waited upon diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp index 1264b230..dc5610fc 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp @@ -546,6 +546,8 @@ public: if constexpr (IsSchedDynamicPersistent) { bool requires_clc_query = true; TileSchedulerPipelineState scheduler_pipe_producer_state = cutlass::make_producer_start_state(); + + cutlass::arch::wait_on_dependent_grids(); while (work_tile_info.is_valid()) { if (requires_clc_query) { diff --git a/include/cutlass/pipeline/sm100_pipeline.hpp b/include/cutlass/pipeline/sm100_pipeline.hpp index 3dba8dda..53bc9199 100644 --- a/include/cutlass/pipeline/sm100_pipeline.hpp +++ b/include/cutlass/pipeline/sm100_pipeline.hpp @@ -334,9 +334,9 @@ public: static constexpr bool IsDynamicCluster = not cute::is_static_v; static_assert(IsDynamicCluster or ((cute::size<0>(cluster_shape) % cute::size<0>(atom_thr_shape) == 0) && (cute::size<1>(cluster_shape) % cute::size<1>(atom_thr_shape) == 0))); - uint32_t const multicast_consumer_arrival_count = (cute::size<0>(cluster_shape) / cute::size<0>(atom_thr_shape)) + - (cute::size<1>(cluster_shape) / cute::size<1>(atom_thr_shape)) - 1; - + uint32_t const num_consumer_per_cluster = params.num_consumers / NumThreadsPerWarpGroup; + uint32_t const multicast_consumer_arrival_count = ((cute::size<0>(cluster_shape) / cute::size<0>(atom_thr_shape)) + + (cute::size<1>(cluster_shape) / 
cute::size<1>(atom_thr_shape)) - 1) * num_consumer_per_cluster; cutlass::arch::detail::initialize_barrier_array_pair_aligned( storage.full_barrier_, storage.empty_barrier_, producer_arv_cnt, multicast_consumer_arrival_count); } diff --git a/include/cutlass/version.h b/include/cutlass/version.h index a2880049..41d78322 100644 --- a/include/cutlass/version.h +++ b/include/cutlass/version.h @@ -34,9 +34,9 @@ #include #include -#define CUTLASS_MAJOR 3 -#define CUTLASS_MINOR 9 -#define CUTLASS_PATCH 2 +#define CUTLASS_MAJOR 4 +#define CUTLASS_MINOR 0 +#define CUTLASS_PATCH 0 #ifdef CUTLASS_VERSIONS_GENERATED #include "cutlass/version_extended.h" diff --git a/media/docs/cpp/blackwell.rst b/media/docs/cpp/blackwell.rst new file mode 100644 index 00000000..ccb45239 --- /dev/null +++ b/media/docs/cpp/blackwell.rst @@ -0,0 +1,10 @@ +.. _blackwell: + +Blackwell Specific +================== + +.. toctree:: + :maxdepth: 2 + + Blackwell SM100/SM120 GEMMs + Blackwell Cluster Launch Control diff --git a/media/docs/cpp/blackwell_cluster_launch_control.md b/media/docs/cpp/blackwell_cluster_launch_control.md index a4006f20..1504c144 100644 --- a/media/docs/cpp/blackwell_cluster_launch_control.md +++ b/media/docs/cpp/blackwell_cluster_launch_control.md @@ -6,7 +6,7 @@ A GEMM workload usually consists of three phases: prologue, mainloop and epilogu Consider a GEMM that has `20x20x1` output tiles, running on a GPU with `100` SMs. There is another kernel occupying all the resources of `20` SMs so only `80` SMs can be used. Assume cluster shape is `1x1x1`. The following diagram shows how the schedule would look like for such a kernel. -

GEMM tiles are evenly divided among available SMs

+

GEMM tiles are evenly divided among available SMs

### Static Scheduler @@ -14,7 +14,7 @@ CUTLASS has adopted a software technique named **persistent kernels**. Persisten However, static scheduler is susceptible to workload imbalance if the resources of some SMs are unavailable. The following diagram illustrates this issue. -

GEMM tiles are unevenly divided among available SMs, leading to workload imbalance

+

GEMM tiles are unevenly divided among available SMs, leading to workload imbalance

### Dynamic Scheduler with Cluster Launch Control A fundamental limitation of persistent scheduling is that the number of SMs this kernel can utilize is unknown in real time. Some SMs might be occupied by another kernel and thus their resources are unavailable. This makes it challenging to load-balance work across SMs. @@ -32,7 +32,7 @@ Cluster launch control follows the below rules: The following diagram shows how the schedule would look like with cluster launch control. -

GEMM tiles are dynamically allocated among available SMs, leading to a balanced workload

+

GEMM tiles are dynamically allocated among available SMs, leading to a balanced workload

## Programming Model ### Pseudo Code @@ -120,7 +120,7 @@ The CLC pipeline has a depth of 3 to overlap the CLC operations of multiple wave -# Copyright +### Copyright Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/blackwell_functionality.md b/media/docs/cpp/blackwell_functionality.md index 582899d3..df3d7f13 100644 --- a/media/docs/cpp/blackwell_functionality.md +++ b/media/docs/cpp/blackwell_functionality.md @@ -723,7 +723,7 @@ Specialized policies must be used to generate mixed-input-datatype `mx_float4_t` |----------------|----|----|----|----|------------------------------------| 128x128x128 | Y | N | N | N | `KernelTmaWarpSpecializedPingpong` or `KernelTmaWarpSpecializedCooperative` | -# Copyright +### Copyright Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/build/building_in_windows_with_visual_studio.md b/media/docs/cpp/build/building_in_windows_with_visual_studio.md index ebadf321..08935db1 100644 --- a/media/docs/cpp/build/building_in_windows_with_visual_studio.md +++ b/media/docs/cpp/build/building_in_windows_with_visual_studio.md @@ -5,7 +5,7 @@ Users and developers may build either in Visual Studio's graphical integrated development environment, or on the command line with `cmake --build`. -# Software prerequisites +## Software prerequisites 1. Windows 10 or 11 @@ -22,7 +22,7 @@ or on the command line with `cmake --build`. Visual Studio must be installed *before* the CUDA Toolkit. Otherwise, Visual Studio's build system won't know about CUDA. -# Operating system settings +## Operating system settings By default, Windows restricts the maximum file path length (`MAX_PATH`) to 260 characters. CUTLASS has many files and directory paths that challenge this requirement. @@ -48,7 +48,7 @@ before attempting to clone or build CUTLASS. 
[This Microsoft help article](https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=registry) explains different ways to change the registry setting. -# Set up build environment +## Set up build environment 1. Run "git bash" to get a familiar command-line interface @@ -62,7 +62,7 @@ explains different ways to change the registry setting. Alternate approaches may rely on the CMake GUI and/or Windows' native command line. -# Building +## Building A successful CMake run will create a `CUTLASS.sln` Visual Studio "solution" file in the build directory. One can open this in Visual Studio and build the entire solution or any subset of projects as desired. @@ -77,7 +77,7 @@ Unlike with CMake's Makefile or Ninja generators, `CMAKE_BUILD_TYPE` has no effect on the Visual Studio generator, because the Visual Studio generator creates all build configurations. -# Tips +## Tips With Windows builds, one may find that CMake reruns unnecessarily. For example, cancelling a build and starting it again may rerun CMake. @@ -86,7 +86,7 @@ One work-around is to set the CMake option `CMAKE_SUPPRESS_REGENERATION=ON`. However, this turns off CMake's ability to detect on its own when it needs to rerun. As a result, one will need to know when to rerun CMake by hand. -## Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/build/building_with_clang_as_host_compiler.md b/media/docs/cpp/build/building_with_clang_as_host_compiler.md index 47b3971d..332d2006 100644 --- a/media/docs/cpp/build/building_with_clang_as_host_compiler.md +++ b/media/docs/cpp/build/building_with_clang_as_host_compiler.md @@ -5,7 +5,7 @@ Clang as host compiler, and NVCC as device compiler. This is NOT the same as building with Clang as both host and device compiler ("CUDA Clang"). -# Software prerequisites +## Software prerequisites 1. 
Clang (regularly tested with Clang 17; occasionally tested with Clang 10 and greater) @@ -29,9 +29,9 @@ A symptom of not installing all needed dependencies is the following error when attempting to use clang: `"/usr/bin/ld: cannot find -lstdc++: No such file or directory"`. -# Running CMake +## Running CMake -## Required CMake options +### Required CMake options The Clang build requires specifying the following CMake options. Replace `` with the path to your `clang++` executable. @@ -55,7 +55,7 @@ then one can set `CMAKE_CUDA_COMPILER` as follows. * `CMAKE_CUDA_COMPILER=${PATH_TO_CUDA_TOOLKIT}/bin/nvcc` -## Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/build/index.rst b/media/docs/cpp/build/index.rst new file mode 100644 index 00000000..ddd34d16 --- /dev/null +++ b/media/docs/cpp/build/index.rst @@ -0,0 +1,10 @@ +.. _cpp_build: + +Build +===== + +.. toctree:: + :maxdepth: 1 + + Building on Windows with Visual Studio + Building with Clang as host compiler diff --git a/media/docs/cpp/code_organization.md b/media/docs/cpp/code_organization.md index 84d9ab0f..fd0292fe 100644 --- a/media/docs/cpp/code_organization.md +++ b/media/docs/cpp/code_organization.md @@ -1,6 +1,6 @@ ![ALT](../../images/gemm-hierarchy-with-epilogue-no-labels.png "CUTLASS Code Organization") -# CUTLASS Code Organization +# Code Organization This document describes the layout of the CUTLASS repository. 
The main components are: diff --git a/media/docs/cpp/cute/02_layout_algebra.md b/media/docs/cpp/cute/02_layout_algebra.md index c2accec9..e48ede48 100644 --- a/media/docs/cpp/cute/02_layout_algebra.md +++ b/media/docs/cpp/cute/02_layout_algebra.md @@ -249,9 +249,7 @@ auto same_r = make_layout(composition(layout<0>(a), get<0>(tiler)), We often use the `` notation to distinguish `Tiler`s from the concatenation-of-sublayouts notation `(LayoutA, LayoutB, ...)` that we used previously. The `result` in the above code can be depicted as the 3x8 sublayout of the original layout highlighted in the figure below. -

- composition1.png -

+![composition1.png](../../../images/cute/composition1.png) For convenience, CuTe also interprets `Shape`s as a tiler as well. A `Shape` is interpreted as tuple-of-layouts-with-stride-1: ```cpp @@ -268,9 +266,7 @@ auto tiler = make_shape(Int<3>{}, Int<8>{}); auto result = composition(a, tiler); ``` where `result` can be depicted as the 3x8 sublayout of the original layout highlighted in the figure below. -

- composition2.png -

+![composition2.png](../../../images/cute/composition2.png) ## Composition Tilers @@ -323,9 +319,7 @@ The `cotarget` parameter above is most commonly an integer -- you can see we onl * `complement((2,2):(1,6), 24)` is `(3,2):(2,12)`. Note that `((2,2),(3,2)):((1,6),(2,12))` has cosize `24` and produces unique indices. -

- complement1.png -

+![complement1.png](../../../images/cute/complement1.png) As a visualization, the above figure depicts the codomain of the last example. The image of the original layout `(2,2):(1,6)` is colored in gray. The complement effectively "repeats" the original layout (displayed in the other colors) such that the codomain size of the result is `24`. The complement `(3,2):(2,12)` can be viewed as the "layout of the repetition." ## Division (Tiling) @@ -371,9 +365,7 @@ This is computed in the three steps described in the implementation above. * Concantenation of `(B,B*) = (4,(2,3)):(2,(1,8))`. * Composition of `A = (4,2,3):(2,1,8)` with `(B,B*)` is then `((2,2),(2,3)):((4,1),(2,8))`. -

- divide1.png -

+![divide1.png](../../../images/cute/divide1.png) The above figure depicts `A` as a 1-D layout with the elements pointed to by `B` highlighted in gray. The layout `B` describes our "tile" of data, and there are six of those tiles in `A` shown by each of the colors. After the divide, the first mode of the result is the tile of data and the second mode of the result iterates over each tile. @@ -383,9 +375,7 @@ Using the `Tiler` concept defined above, this immediately generalizes to multidi Similar to the 2-D composition example above, consider a 2-D layout `A = (9,(4,8)):(59,(13,1))` and want to apply `3:3` down the columns (mode-0) and `(2,4):(1,8)` across the rows (mode-1). This means the tiler can be written as `B = <3:3, (2,4):(1,8)>`. -

- divide2.png -

+![divide2.png](../../../images/cute/divide2.png) The above figure depicts `A` as a 2-D layout with the elements pointed to by `B` highlighted in gray. The layout `B` describes our "tile" of data, and there are twelve of those tiles in `A` shown by each of the colors. After the divide, the first mode of each mode of the result is the tile of data and the second mode of each mode iterates over each tile. In that sense, this operation can be viewed as a kind of `gather` operation or as simply a permutation on the rows and cols. @@ -429,9 +419,7 @@ We note that `logical_divide` preserves the *semantics* of the modes while permu This is not the case with `zipped_divide`. The mode-0 in the `zipped_divide` result is the `Tile` itself (of whatever rank the `Tiler` was) and mode-1 is the layout of those tiles. It doesn't always make sense to plot these as 2-D layouts, because the `M`-mode is now more aptly the "tile-mode" and the `N`-mode is more aptly the "rest-mode". Regardless, we still can plot the resulting layout as 2-D as shown below. -

- divide3.png -

+![divide3.png](../../../images/cute/divide3.png) We've kept each tile as its color in the previous images for clarity. Clearly, iterating across tiles is now equivalent to iterating across a row of this layout and iterating over elements within a tile is equivalent to iterating down a column of this layout. As we'll see in the `Tensor` section, this can be used to great effect in partitioning within or across tiles of data. @@ -476,9 +464,7 @@ This is computed in the three steps described in the implementation above. * Composition of `A* = (2,3):(2,8)` with `B = 6:1` is then `(2,3):(2,8)`. * Concatenation of `(A,A* o B) = ((2,2),(2,3)):((4,1),(2,8))`. -

- product1.png -

+![product1.png](../../../images/cute/product1.png) The above figure depicts `A` and `B` as a 1-D layouts. The layout `B` describes the number and order of repetitions of `A` and they are colored for clarity. After the product, the first mode of the result is the tile of data and the second mode of the result iterates over each tile. @@ -486,9 +472,7 @@ Note that the result is identical to the result of the 1-D Logical Divide exampl Of course, we can change the number and order of the tiles in the product by changing `B`. -

- product2.png -

+![product2.png](../../../images/cute/product2.png) For example, in the above image with `B = (4,2):(2,1)`, there are 8 repeated tiles instead of 6 and the tiles are in a different order. @@ -496,9 +480,7 @@ For example, in the above image with `B = (4,2):(2,1)`, there are 8 repeated til We can use the by-mode `tiler` strategies previously developed to write multidimensional products as well. -

- product2d.png -

+![product2d.png](../../../images/cute/product2d.png) The above image demonstates the use of a `tiler` to apply `logical_product` by-mode. Despite this **not being the recommended approach**, the result is a rank-2 layout consisting of 2x5 row-major block that is tiled across a 3x4 column-major arrangement. @@ -519,17 +501,13 @@ Because `A` is always compatible with mode-0 of the result and `B` is always com This is exactly what `blocked_product` and `raked_product` do and it is why they are called rank-sensitive. Unlike other CuTe functions that take `Layout` arguments, these care about the top-level rank of the arguments so that each mode can be reassociated after the `logical_product`. -

- productblocked2d.png -

+![productblocked2d.png](../../../images/cute/productblocked2d.png) The above image shows the same result as the `tiler` approach, but with much more intuitive arguments. A 2x5 row-major layout is arranged as a tile in a 3x4 column-major arrangement. Also note that `blocked_product` went ahead and `coalesced` mode-0 for us. Similarly, `raked_product` combines the modes slightly differently. Instead of the resulting "column" mode being constructed from the `A` "column" mode then the `B` "column" mode, the resulting "column" mode is constructed from the `B` "column" mode then the `A` "column" mode. -

- productraked2d.png -

+![productraked2d.png](../../../images/cute/productraked2d.png) This results in the "tile" `A` now being interleaved or "raked" with the "layout-of-tiles" `B` instead of appearing as blocks. Other references call this a "cyclic distribution." diff --git a/media/docs/cpp/cute/03_tensor.md b/media/docs/cpp/cute/03_tensor.md index 45abb88e..aead2907 100644 --- a/media/docs/cpp/cute/03_tensor.md +++ b/media/docs/cpp/cute/03_tensor.md @@ -269,9 +269,7 @@ Tensor E = A(make_coord(_,1),make_coord(0,_,1)); Tensor F = A(make_coord(2,_),make_coord(_,3,_)); ``` -

- slice.png -

+![slice.png](../../../images/cute/slice.png) In the image above, a `Tensor` is sliced in various ways and the subtensors generated by those slices are highlighted within the original tensor. Note that tensor `C` and `D` contain the same elements, but have different ranks and shapes due to the use of `_` versus the use of `make_coord(_,_)`. In each case, the rank of the result is equal to the number of `Underscore`s in the slicing coordinate. @@ -327,9 +325,7 @@ Tensor tv = composition(A, tv_layout); // (8,4) Tensor v = tv(threadIdx.x, _); // (4) ``` -

- tv_layout.png -

+![tv_layout.png](../../../images/cute/tv_layout.png) The above image is a visual representation of the above code. An arbitrary 4x8 layout of data is composed with a specific 8x4 TV-layout that represents a partitioning pattern. The result of the composition is on the right where each thread's values are arranged across each row. The bottom layout depicts the inverse TV layout which shows the mapping of 4x8 logical coordinates to the thread id and value id they will be mapped to. diff --git a/media/docs/cpp/cute/0t_mma_atom.md b/media/docs/cpp/cute/0t_mma_atom.md index aa6da8c2..ab57c92e 100644 --- a/media/docs/cpp/cute/0t_mma_atom.md +++ b/media/docs/cpp/cute/0t_mma_atom.md @@ -208,9 +208,7 @@ Volta architecture implements an HMMA instruction where a group of 8 threads cal We first take a look at how we would take the ISA semantics of thread and data partitioning for the HMMA instruction, and encode it in a Traits struct. The HMMA NT instruction has the thread-data layout: -

- HMMA.8x8x4.NT.png -

+![HMMA.8x8x4.NT.png](../../../images/cute/HMMA.8x8x4.NT.png) ### Types @@ -250,9 +248,7 @@ Again, this layout function maps the logical thread id [0,8) of the MMA operatio Let us look at exactly how the 8 threads within a QP are mapped to the A, B and C matrices. For the C and D matrices, the above image is broken down a bit more below. On the left is shown the whole QP level view, and on the right is shown the values owned by just thread 0. -

- HMMA.8x8x4.quadpair.C.png -

+![HMMA.8x8x4.quadpair.C.png](../../../images/cute/HMMA.8x8x4.quadpair.C.png) The metainformation of this single instruction level view is what we want to encode in CuTe. Specifically, the QP level view in this diagram corresponds to the four MMA traits for [SM70_F32F16F16F32](https://github.com/NVIDIA/cutlass/tree/main/include/cute/arch/mma_sm70.hpp). These structs contain the `Element` types, the `Shape_MNK`, and the `ThrID` mapping we constructed above. Now, let us take a look at the definition of `CLayout`, the thread-data layout of accumulators. The job of `CLayout` is to construct a mapping between the `(logical_thr_id, logical_val_id)` and `(m, n)` coordinate in the C matrix which can then be used to build up more complicated layouts and operations like the 16x16x4 WMMA. @@ -320,9 +316,7 @@ In the case of F16 accumulators, the layout is way less complex. Each row of acc A and B matrix layouts depend on whether the sources are transposed or not. The diagram below shows the thread ID to data ownership map for A and B matrices in the case of NT and TN transposes. -

- HMMA.8x8x4.quadpair.AB.png -

+![HMMA.8x8x4.quadpair.AB.png](../../../images/cute/HMMA.8x8x4.quadpair.AB.png) Let's look at the TN layout for A matrix first (right side in the diagram). Again, there are the same 8 logical threads, but each thread owns only 4 elements this time. The shape of `ALayout` will then be `Shape<_8, _4>`. As for the strides, we again need a similar mapping between `(m, k) == m + k * M`. Looking down the `M` mode, we go from `(T0, V0)` to `(T1, V0)` which is a stride of 1 for all 8 threads. For the `K` mode, as we go across, we go from `(T0, V0)` to `(T0, V1)`, which makes a stride of 8 for all 4 values. Therefore, the A layout is: @@ -375,17 +369,13 @@ using ThrID = Layout<_128, _1>; Accumulators are mapped hierarchically in GMMA, starting from the concept of a core matrix and building up to a layout for the whole C matrix tile. Let's look at this core matrix first. We only consider fp16 accumulators here, but extensions of fp32 accumulators are trivial as we will see later. Each core matrix has the layout as shown in the diagram below. -

- gmma_coremat_cd_fp16.png -

+![gmma_coremat_cd_fp16.png](../../../images/cute/gmma_coremat_cd_fp16.png) As in the Volta examples, the thread IDs are logical only, and which of the four warps they belong to in the warpgroup is not important. Then GMMA tiles this core matrix first vertically along the M mode, and then repeats that column of core matrices along the N mode to construct the full MxN tile. This tiling is shown in the image below. -

- gmma_wg_n_slice.png -

+![gmma_wg_n_slice.png](../../../images/cute/gmma_wg_n_slice.png) With this image, we are again ready to start building the `CLayout` for `SM90_64x128x16_F16F16F16F16_TN` atom. Same as before, we are constructing a mapping between the `(logical_thr_id, logical_val_id) -> (m, n)` coordinate spaces. @@ -452,9 +442,7 @@ Let's start with `SM70_8x8x4_F32F16F16F32_NT`. MMA_Atom mma = MMA_Atom{}; print_latex(mma); ``` -

- HMMA.8x8x4.NT_Atom.png -

+![HMMA.8x8x4.NT_Atom.png](../../../images/cute/HMMA.8x8x4.NT_Atom.png) The above is equivalent to ```cpp @@ -472,9 +460,7 @@ We can create an object akin to a WMMA by using four of these quadpair MMAs: Stride<_2,_1>>{}); // 2x2 n-major layout of Atoms print_latex(mma); ``` -

- HMMA.8x8x4.NT_2x2.png -

+![HMMA.8x8x4.NT_2x2.png](../../../images/cute/HMMA.8x8x4.NT_2x2.png) This `TiledMMA` replicates the `MMA_Atom` across threads as we can see the `T4` and `T8` and `T12` threads in the `C`-matrix that were not used before. Each quadrant of the `C`-matrix is a replica of the atom's partitioning pattern for a new quadpair and this replication follows a `(2,2):(2,1)` layout. The above represents a 16x16x4 MMA now, but we can immediately expand this "tile size" up to 32x32x4 instead: @@ -485,9 +471,7 @@ The above represents a 16x16x4 MMA now, but we can immediately expand this "tile Tile<_32,_32,_4>{}); // 32x32x4 tiler print_latex(mma); ``` -

- HMMA.8x8x4.NT_2x2_32x32x4.png -

+![HMMA.8x8x4.NT_2x2_32x32x4.png](../../../images/cute/HMMA.8x8x4.NT_2x2_32x32x4.png) This `TiledMMA` replicates the previous `TiledMMA` across values instead of threads. We can see the `T0V8` and `T16V8` and `T8V8` values in the `C`-matrix that were not used before. Each quadrant of the `C`-matrix is a replica of the previous `TiledMMA`'s partitioning pattern for a new set of values. Continuing, we see that there are eight values that `T0` receives from the `A`-matrix. Those reads occur at coordinates @@ -513,9 +497,7 @@ which are separate, but we might prefer them to be next to each other. That is w _4>{}); // Permutation on K, size 4 identity print_latex(mma); ``` -

- HMMA.8x8x4.NT_2x2_32Mx32x4.png -

+![HMMA.8x8x4.NT_2x2_32Mx32x4.png](../../../images/cute/HMMA.8x8x4.NT_2x2_32Mx32x4.png) That layout `(4,4,2):(1,8,4)` is read like a scatter permutation, telling the m-coords of the original image where to go in the new image. ``` diff --git a/media/docs/cpp/cute/0x_gemm_tutorial.md b/media/docs/cpp/cute/0x_gemm_tutorial.md index 44ea84dc..38e57e4e 100644 --- a/media/docs/cpp/cute/0x_gemm_tutorial.md +++ b/media/docs/cpp/cute/0x_gemm_tutorial.md @@ -334,9 +334,7 @@ These thread layouts are then used to partition the tiles of data in global memo ``` where we've used the same projection-style interface to avoid applying the `N`-mode of `tC` to the `(BLK_M,BLK_K)` shape of `sA` and avoid applying the `M`-mode of `tC` to the `(BLK_N,BLK_K)` shape of `sB`. -

- tC_partitioning.png -

+![tC_partitioning.png](../../../images/cute/tC_partitioning.png) This diagram shows a `tC` layout, highlights two threads in green and blue, shows the projections of the `tC` layout, and finally highlights the subtensors within `sA`, `sB`, and `gC` that `tCsA`, `tCsB`, and `tCgC` represent. With the data partitioned across the threads, *every thread* can now participate in the compute step by writing @@ -390,9 +388,7 @@ As a first example, lets look at the `TiledCopy` that `gemm_nt` generates. print_latex(copyA); ``` The easiest way to see what this `TiledCopy` does is to look at the partition pattern in LaTeX. -

- TiledCopyA.png -

+![TiledCopyA.png](../../../images/cute/TiledCopyA.png) On the left is the source-tensor partitioning and on the right is the destination-tensor partitioning. The partition patterns are the same for this case, but there exist PTX instructions which require different patterns in the source and destination. The diagram shows that each thread reads 4x1 `TA` elements and there are 32x8 threads. The `UniversalCopy` forces the instruction to use a 128-bit copy instruction. If the partition (of `sA` or `gA` in this case) does not result in 4 `TA` elements that can be vectorized to a 128-bit load/store, then CuTe will statically fail with an error message to that effect. To use the `TiledCopy`, the kernel writes @@ -421,9 +417,7 @@ As a first example, lets look at the `TiledMMA` that `gemm_nt` generates. print_latex(mmaC); ``` The easiest way to see what this `TiledMMA` does is to look at the partition pattern in LaTeX. -

- TiledMmaC.png -

+![TiledMmaC.png](../../../images/cute/TiledMmaC.png) On the left is the A-tensor partitioning, on the top is the B-tensor partitioning, and in the middle is the C-tensor partitioning. Because the `UniversalFMA` is a 1x1x1 MMA instruction, a 16x16x1 tiling of them results in a 16x16x1 `TiledMMA`. Other MMA instructions will have different threads involved and have different instruction sizes. In this case, all threads will read a single element from `A`, `B`, and `C` each. To use the `TiledMMA`, the kernel writes diff --git a/media/docs/cpp/cute/0z_tma_tensors.md b/media/docs/cpp/cute/0z_tma_tensors.md index a7e2a012..4b9c0070 100644 --- a/media/docs/cpp/cute/0z_tma_tensors.md +++ b/media/docs/cpp/cute/0z_tma_tensors.md @@ -8,7 +8,7 @@ What is an `ArithTuple`? Are those tensor strides? What do those mean? What is t This documentation intends to answer those questions and introduce some of the more advanced features of CuTe. -# Introduction to TMA instructions +## Introduction to TMA instructions The Tensor Memory Accelerator (TMA) is a set of instructions for copying possibly multidimensional arrays between global and shared memory. TMA was introduced in the Hopper architecture. A single TMA instruction can copy an entire tile of data all at once. As a result, the hardware no longer needs to compute individual memory addresses and issue a separate copy instruction for each element of the tile. @@ -53,9 +53,9 @@ That means that an ordinary CuTe Tensor that stores a GMEM pointer and computes What do we do? -# Building a TMA Tensor +## Building a TMA Tensor -## Implicit CuTe Tensors +### Implicit CuTe Tensors All CuTe Tensors are compositions of Layouts and Iterators. An ordinary global memory tensor's iterator is its global memory pointer. However, a CuTe Tensor's iterator doesn't have to be a pointer; it can be any random-access iterator. @@ -83,7 +83,7 @@ This tensor maps logical coordinates to on-the-fly computed integers. 
Because it But the TMA doesn't consume pointers or integers, it consumes coordinates. Can we make a tensor of implicit TMA coordinates for the TMA instruction to consume? If so, then we could presumably also tile and partition and slice that tensor of coordinates so that we would always have the right TMA coordinate to give to the instruction. -## ArithTupleIterators and ArithTuples +### ArithTupleIterators and ArithTuples First, we build a `counting_iterator` equivalent for TMA coordinates. It should support @@ -110,7 +110,7 @@ In summary, one creates a TMA descriptor for the *whole global memory tensor*. T We can now track and offset TMA coordinates with this iterator, but how do we get CuTe Layouts to generate non-integer offsets? -## Strides aren't just integers +### Strides aren't just integers Ordinary tensors have a layout that maps a logical coordinate `(i,j)` into a 1-D linear index `k`. @@ -122,7 +122,7 @@ to a TMA coordinate, rather than to a 1-D linear index. To do this, we can abstract what a stride is. Strides need not be integers, but rather any algebraic object that supports inner-product with the integers (the logical coordinate). The obvious choice is the `ArithmeticTuple` we used earlier since they can be added to each other, but this time additionally equipped with an `operator*` so it can also be scaled by an integer. -### Aside: Integer-module strides +#### Aside: Integer-module strides A group of objects that support addition between elements and product between elements and integers is called an integer-module. @@ -133,7 +133,7 @@ Rank-R tuples of integers are an integer-module. In principle, layout strides may be any integer-module. -### Basis elements +#### Basis elements CuTe's basis elements live in the header file `cute/numeric/arithmetic_tuple.hpp`. To make it easy to create `ArithmeticTuple`s that can be used as strides, CuTe defines normalized basis elements using the `E` type alias. 
"Normalized" means that the scaling factor of the basis element is the compile-time integer 1. @@ -172,7 +172,7 @@ Intuitively, "compatible" means that the nested structure of the two basis elements matches well enough to add the two elements together. -### Linear combinations of strides +#### Linear combinations of strides Layouts work by taking the inner product of the natural coordinate with their strides. @@ -200,7 +200,7 @@ and can be interpreted as the coordinate `((7,4),23)`. Thus, linear combinations of these strides can be used to generate TMA coordinates. These coordinates, in turn, can be used to offset TMA coordinate iterators. -## Application to TMA Tensors +### Application to TMA Tensors Now we can build CuTe Tensors like the one seen in the introduction. @@ -230,7 +230,7 @@ ArithTuple(0,0) o (4,5):(_1@1,_1@0): (0,3) (1,3) (2,3) (3,3) (4,3) ``` -## Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/cute/index.rst b/media/docs/cpp/cute/index.rst index a6611dd7..4673d571 100644 --- a/media/docs/cpp/cute/index.rst +++ b/media/docs/cpp/cute/index.rst @@ -4,7 +4,7 @@ CuTe ==================== .. toctree:: - :maxdepth: 2 + :maxdepth: 1 00_quickstart<00_quickstart.md> 01_layout<01_layout.md> diff --git a/media/docs/cpp/cutlass_2x.rst b/media/docs/cpp/cutlass_2x.rst new file mode 100644 index 00000000..e3107c09 --- /dev/null +++ b/media/docs/cpp/cutlass_2x.rst @@ -0,0 +1,12 @@ +.. _cutlass_2_x: + +CUTLASS 2.x +================== + +.. toctree:: + :maxdepth: 2 + + Layouts and Tensors + GEMM API + Tile Iterator Concepts + Utilities diff --git a/media/docs/cpp/cutlass_3x.rst b/media/docs/cpp/cutlass_3x.rst new file mode 100644 index 00000000..2f4e50c7 --- /dev/null +++ b/media/docs/cpp/cutlass_3x.rst @@ -0,0 +1,11 @@ +.. _cutlass_3_x: + +CUTLASS 3.x +================== + +.. 
toctree:: + :maxdepth: 2 + + Design + GEMM Backwards Compatibility + GEMM API diff --git a/media/docs/cpp/cutlass_3x_backwards_compatibility.md b/media/docs/cpp/cutlass_3x_backwards_compatibility.md index 1dc42ef7..be9c50a1 100644 --- a/media/docs/cpp/cutlass_3x_backwards_compatibility.md +++ b/media/docs/cpp/cutlass_3x_backwards_compatibility.md @@ -438,7 +438,7 @@ obtain the kernel's configuration parameters. Users can use these to approximate for 3.0 API kernels. However, the reflective interfaces cannot always match the types exactly, as the mappings are not always bijective. -# Copyright +### Copyright Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/cutlass_3x_design.md b/media/docs/cpp/cutlass_3x_design.md index b1eed530..05e2c18d 100644 --- a/media/docs/cpp/cutlass_3x_design.md +++ b/media/docs/cpp/cutlass_3x_design.md @@ -114,7 +114,7 @@ In this way, CuTe reifies the thread-to-data-layout mapping, makes it easier to write code that is "correct by construction". If the code compiles, it's probably correct. -## Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/functionality.md b/media/docs/cpp/functionality.md index 396db1fe..73454967 100644 --- a/media/docs/cpp/functionality.md +++ b/media/docs/cpp/functionality.md @@ -277,7 +277,7 @@ CUDA exposes warp-level matrix operations in the CUDA C++ WMMA API. The CUDA C++ | **B** | `RowMajor`, `ColumnMajor` | `RowMajor`, `ColumnMajor` | | **C** | `RowMajor`, `ColumnMajor` | `RowMajor`, `ColumnMajor` | -# Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/fundamental_types.md b/media/docs/cpp/fundamental_types.md index b29fb5bf..ece3de16 100644 --- a/media/docs/cpp/fundamental_types.md +++ b/media/docs/cpp/fundamental_types.md @@ -355,7 +355,7 @@ support on current and future NVIDIA GPUs. ``` -# Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/gemm_api.md b/media/docs/cpp/gemm_api.md index fd8ecf5e..fdfc49b3 100644 --- a/media/docs/cpp/gemm_api.md +++ b/media/docs/cpp/gemm_api.md @@ -5,7 +5,7 @@ CUTLASS presents a uniform programming model for matrix multiply-accumulate operations at each level of the hierarchy. This document focuses on device-level, threadblock-level GEMMs, warp-level GEMMs, thread-level GEMMs, and instruction-level GEMMs. -# CUTLASS GEMM Model +## CUTLASS GEMM Model CUTLASS implements the basic GEMM triple loop nest with a tiled structure mirroring the execution model hierarchy. @@ -62,7 +62,7 @@ warp-synchronous matrix multiply instructions targeting Tensor Cores. Alternatively, GEMMs targeting single-thread instructions may have an additional series of nested loops corresponding to thread-level concurrency. -# CUTLASS GEMM Components +## CUTLASS GEMM Components This loop nest is expressed in CUTLASS via the following components which are specialized for data type, layout, and math instruction. @@ -71,7 +71,7 @@ math instruction. These components are described in the following sections. -## Device-wide GEMM API +### Device-wide GEMM API The device-level GEMM API is intended to streamline instantiation and execution of the standard GEMM computation across the GPU. 
This operator is intended to be used in host-side .cu code and @@ -119,7 +119,7 @@ The device-wide GEMM API is embodied by the following operators: ``` -## Threadblock-level GEMM API +### Threadblock-level GEMM API GEMMs at this scope are expected to efficiently load tiles of data from global memory into internal storage and then compute matrix products with warp-level GEMM operators. @@ -196,7 +196,7 @@ struct Mma { }; ``` -## Warp-level Matrix Multiply API +### Warp-level Matrix Multiply API Warp-level GEMM operators load tiles from shared memory into registers and then compute matrix multiplies using either Tensor Cores or CUDA Cores. The result is accumulated in a register tile. Iterators are defined for each @@ -416,7 +416,7 @@ class MmaSimt; ``` -## Thread-level GEMM API +### Thread-level GEMM API Thread-level GEMM operations perform matrix multiply-accumulate on data held in registers. These target CUDA Cores exclusively. @@ -502,7 +502,7 @@ struct Mma; } // namespace cutlass ``` -## Efficient Epilogue +### Efficient Epilogue CUTLASS GEMM operators perform mma followed by epilogue operation similar to cuBLAS. CUTLASS implements an efficient row-major epilogue. Thus, to achieve @@ -529,7 +529,7 @@ of input layouts. Thus, CUTLASS supports the following layout combinations for i - `{N,T} x {N,T} => {N,T}` - NN, TN, TN, TT GEMM for both row-major and column-major output -## Instruction-level operations +### Instruction-level operations CUTLASS defines a template-based interface to Tensor Core operations to avoid resorting to inline PTX. @@ -538,7 +538,7 @@ to inline PTX. - [mma_sm75.h](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/arch/mma_sm75.h) - Turing TensorCore operations -# Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/gemm_api_3x.md b/media/docs/cpp/gemm_api_3x.md index c643fafd..8f890a0d 100644 --- a/media/docs/cpp/gemm_api_3x.md +++ b/media/docs/cpp/gemm_api_3x.md @@ -19,7 +19,7 @@ Device, Kernel, and Collective. It also briefly discusses the Tiled MMA/Copy and Atom level, and then refers readers to CuTe's tutorial for more information. -# CUTLASS GEMM Model +## CUTLASS GEMM Model CUTLASS implements algorithms that express the classical "triply nested loop" GEMM algorithm @@ -80,7 +80,7 @@ and computes MMAs. These tiled copy and tiled mma iterations are generally fully static and get fully unrolled. -# CUTLASS GEMM Components +## CUTLASS GEMM Components CUTLASS expresses the above loop nest with the following components which are specialized for @@ -146,7 +146,7 @@ using GemmHandle = cutlass::gemm::device::GemmUniversalAdapter; Towards the end, we also briefly cover CuTe's tiled mma and copy as well as the atom layer APIs, before redirecting users to CuTe-specific documentation for further details. -## Collective API +### Collective API A Collective is "the largest collection of threads onto which mma atoms and copy atoms are tiled." @@ -670,7 +670,7 @@ please refer to CuTe's tutorial, e.g., the sections on * [a GEMM example](./cute/0x_gemm_tutorial.md). -# Copyright +### Copyright Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/getting_started.rst b/media/docs/cpp/getting_started.rst new file mode 100644 index 00000000..df34f3f6 --- /dev/null +++ b/media/docs/cpp/getting_started.rst @@ -0,0 +1,16 @@ +.. _getting_started: + +Getting Started +================== + +.. 
toctree:: + :maxdepth: 2 + + Quickstart + IDE Setup + Build + Functionality + Terminology + Fundamental Types + Programming Guidelines + diff --git a/media/docs/cpp/grouped_scheduler.md b/media/docs/cpp/grouped_scheduler.md index 333496f7..fab12062 100644 --- a/media/docs/cpp/grouped_scheduler.md +++ b/media/docs/cpp/grouped_scheduler.md @@ -1,6 +1,6 @@ ![ALT](../../images/gemm-hierarchy-with-epilogue-no-labels.png "CUTLASS Grouped Kernel Schedulers") -# CUTLASS Grouped Kernel Schedulers +# Grouped Kernel Schedulers CUTLASS's grouped kernel is a persistent kernel which launches multiple problems (e.g., GEMMs, SYR2Ks) within a single CUDA kernel launch. diff --git a/media/docs/cpp/ide_setup.md b/media/docs/cpp/ide_setup.md index 6a332b31..bad80bba 100644 --- a/media/docs/cpp/ide_setup.md +++ b/media/docs/cpp/ide_setup.md @@ -118,7 +118,7 @@ This is usually a convenient way to configure projects, but it's not as simple f clang doesn't understand many of the compiler flags used by nvcc. Hence, for now, we don't recommend using `compile_commands.json` to configure your CUDA project. -## Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/layout.md b/media/docs/cpp/layout.md index 5e1d4d29..1bc4d2c2 100644 --- a/media/docs/cpp/layout.md +++ b/media/docs/cpp/layout.md @@ -217,7 +217,7 @@ and `TensorRef` objects for each of the operands whose extents are implied as a redundant storage of extent quantities, CUTLASS minimizes capacity utilization of precious resources such as constant memory. This is consistent with BLAS conventions. 
-# Summary: +## Summary: The design patterns described in this document form a hierarchy: * `T *ptr;` is a pointer to a contiguous sequence of elements of type `T` @@ -225,7 +225,7 @@ The design patterns described in this document form a hierarchy: * `TensorRef ref(ptr, layout);` is an object pointing to an _unbounded_ tensor containing elements of type `T` and a layout of type `Layout` * `TensorView view(ref, extent);` is an object pointing to a _bounded_ tensor containing elements of type `T` and a layout of type `Layout` -# Appendix: Existing Layouts +### Appendix: Existing Layouts This section enumerates several existing Layout types defined in CUTLASS. @@ -268,7 +268,7 @@ Permuted Shared Memory Layouts: - `TensorOpCrosswise` -# Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/overview.md b/media/docs/cpp/overview.md deleted file mode 100644 index 35d2aac1..00000000 --- a/media/docs/cpp/overview.md +++ /dev/null @@ -1,619 +0,0 @@ -![ALT](../../images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition") - -# Overview - -# CUTLASS 3.9.0 - -_CUTLASS 3.9.0 - March 2025_ - -CUTLASS is a collection of CUDA C++ template abstractions for implementing -high-performance matrix-matrix multiplication (GEMM) and related computations at all levels -and scales within CUDA. It incorporates strategies for hierarchical decomposition and -data movement similar to those used to implement cuBLAS and cuDNN. CUTLASS decomposes -these "moving parts" into reusable, modular software components abstracted by C++ template -classes. Primitives for different levels of a conceptual parallelization hierarchy -can be specialized and tuned via custom tiling sizes, data types, -and other algorithmic policy. The resulting flexibility simplifies their use -as building blocks within custom kernels and applications. 
- -To support a wide variety of applications, CUTLASS provides extensive support for -mixed-precision computations, providing specialized data-movement and -multiply-accumulate abstractions for FP64, FP32, TF32, FP16, BF16, -[FP32 emulation via tensor core instruction](https://github.com/NVIDIA/cutlass/tree/main/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm), - 8b floating point types (e5m2 and e4m3), - block scaled data types (NVIDIA NVFP4 and OCP standard MXFP4, MXFP6, MXFP8), - narrow integer types (4 and 8b signed and unsigned integers), - and binary 1b data types (where architectures allow for the -native support of such data types). -CUTLASS demonstrates optimal matrix multiply operations -targeting the programmable, high-throughput _Tensor Cores_ implemented by -NVIDIA's Volta, Turing, Ampere, Ada, Hopper, and Blackwell architectures. - -In addition to GEMMs, CUTLASS implements high-performance convolution via -the implicit GEMM algorithm. Implicit GEMM is the formulation of a convolution -operation as a GEMM thereby taking advantage of CUTLASS's modular GEMM pipeline. -This allows CUTLASS to build convolutions by reusing highly-optimized GEMM components. - -See the [Quick Start Guide](quickstart.md) to get started quickly. - -See the [functionality docs](functionality.md) for a more comprehensive -list of kernel level features, data types, instructions, and minimum supported by CUTLASS on each GPU -architecture. 
- -# What's New in CUTLASS 3.9 - -* Support for Blackwell SM120 kernels for GeForce GPUs in CUTLASS 3.x API: - - Collective mainloops that target for: - * [Blockscaled datatypes with support for dense GEMM](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm120_blockscaled_mma_tma.hpp) - * [Blockscaled datatypes with support for sparse GEMM](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm120_blockscaled_sparse_mma_tma.hpp) - - New [GEMM](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/dispatch_policy.hpp) and [epilogue](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/dispatch_policy.hpp) dispatch policies for collectives, kernel layers, and builders. - - [Blackwell SM120 epilogue](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/fusion/sm120_visitor_store_tma_warpspecialized.hpp) and [full set of EVT fusions](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp). -* Set of examples that demonstrate the usage of the 3.x API for targeting Blackwell SM120 architecture: - - [Blockscaled GEMM with NVFP4 input datatype and BF16 output tensor](https://github.com/NVIDIA/cutlass/tree/main/examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu). - - [Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor with scale factor generation](https://github.com/NVIDIA/cutlass/tree/main/examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu). - - [Blockscaled GEMM with mixed input datatype (MXFP8 and MXFP6) and BF16 output tensor](https://github.com/NVIDIA/cutlass/tree/main/examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu). 
-* Set of unit tests that demonstrate the usage of both [sparse](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm120_blockscaled_sparse_tensorop_gemm/) and [dense](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm120_blockscaled_tensorop_gemm/) Blackwell SM120 blockscaled GEMM. -* Enhancement and new support of block-wise and group-wise GEMM for Hopper and Blackwell architectures: - - Enhancement of [blockwise GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) for Hopper architecture. - - Enhancement of [groupwise GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) for Hopper architecture. - - Support for [grouped GEMM with blockwise and groupwise scaling](https://github.com/NVIDIA/cutlass/tree/main/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/) for Hopper architecture. - - Support for [blockwise GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu) for Blackwell architecture. - - Support for [groupwise GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu) for Blackwell architecture. - - Support for [grouped GEMM with blockwise](https://github.com/NVIDIA/cutlass/tree/main/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu) and [groupwise scaling](https://github.com/NVIDIA/cutlass/tree/main/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu) for Blackwell architecture. 
-* Added support for enhanced kernel performance search (auto-tuning) in CUTLASS profiler: - - Sorting performance results by GFLOPs/second: Users can now sort the final performance report based on GFLOPs/second, making it easier to identify the most efficient kernels. - - Exhaustive search for best kernel performance in GFLOPs/second: The profiler now searches for the best-performing kernel across a range of problem sizes, swizzle sizes, rasterization orders, and dynamic cluster configurations to maximize performance. - - Performance search under a fixed GEMM shape: Enables exhaustive tuning within a fixed GEMM shape, exploring various kernel parameters to find the best configuration. - - More detailed introductions and examples to leverage this feature can be found in [profiler.md](./profiler.md#exhaustive-search-mode-and-top-k-output-ranking-according-to-performance-in-gflopss). - -Note: CUTLASS 3.x builds are known to be down on Windows platforms for all CUDA toolkits. -CUTLASS team is working on a fix. - -**See the [CHANGELOG](../release_notes.md) for details of all past releases and updates.** - -# Performance - -CUTLASS primitives are very efficient. When used to construct device-wide GEMM kernels, -they exhibit nearly optimal utilization of peak theoretical throughput. The figure below -shows CUTLASS 3.8's performance as a % of theoretical peak utilization -on various input and output data types when run on NVIDIA Blackwell SM100 architecture GPU. - -![ALT](../../images/cutlass-3.8-blackwell-gemm-peak-performance.svg "") - -The two figures below show the continual CUTLASS performance improvements -on an [NVIDIA H100](https://www.nvidia.com/en-us/data-center/h100/) (NVIDIA Hopper architecture) since -CUTLASS 3.1. -CUTLASS 3.5.1 was compiled with the [CUDA 12.5u1 Toolkit](https://developer.nvidia.com/cuda-downloads). 
-Tensor Core operations are implemented using CUDA's -[mma](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma) and -[wgmma](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions) instructions. - -![ALT](../../images/cutlass-3.5.1-gemm-peak-performance.png "") -![ALT](../../images/cutlass-3.5.1-gemm-peak-performance-fp8.png "") - -# CuTe - -CUTLASS 3.0 introduced a new core library, CuTe, to describe and manipulate tensors of threads and data. -CuTe is a collection of C++ CUDA template abstractions for -defining and operating on hierarchically multidimensional layouts of threads and data. -CuTe provides `Layout` and `Tensor` objects that compactly package the type, -shape, memory space, and layout of data, while performing the complicated indexing for the user. -This lets programmers focus on the logical descriptions of their algorithms while -CuTe does the mechanical bookkeeping for them. With these tools, we can quickly design, -implement, and modify all dense linear algebra operations. - -The core abstractions of CuTe are hierarchically multidimensional layouts -which can be composed with data arrays to represent tensors. -The representation of layouts is powerful enough to represent nearly -everything we need to implement efficient dense linear algebra. -Layouts can also be combined and manipulated via functional composition, on which we build a large set of common operations such as tiling and partitioning. - -CUTLASS 3.0 and beyond adopts CuTe throughout the GEMM hierarchy in its templates. -This greatly simplifies the design and improves code composability and readability. -More documentation specific to CuTe can be found in its -[dedicated documentation directory](cute/00_quickstart.md). 
- -# Compatibility - -Minimum requirements: - -- Architecture: Volta (compute capability 7.0) -- Compiler: Must support at least C++17 -- CUDA Toolkit version: 11.4 - -CUTLASS requires a C++17 host compiler and -performs best when built with the [**CUDA 12.8 Toolkit**](https://developer.nvidia.com/cuda-downloads). -It is also compatible with CUDA 11.4, CUDA 11.5, CUDA 11.6, CUDA 11.7, CUDA 11.8, and all other CUDA 12.x versions. - -## Operating Systems - -We have tested the following environments. - -|**Operating System** | **Compiler** | -|-----------------|----------| -| Ubuntu 18.04 | GCC 7.5.0 | -| Ubuntu 20.04 | GCC 10.3.0 | -| Ubuntu 22.04 | GCC 11.2.0 | - -Note: GCC 8.5.0 has known regressions regarding fold expressions and overloaded operators. Using GCC 7.5.0 or (preferred) GCC >= 9 is recommended. - -Note: CUTLASS 3.x builds are known to be down on Windows platforms for all CUDA toolkits. -CUTLASS team is working on a fix. - -## Hardware - -CUTLASS runs successfully on the following NVIDIA GPUs, and it is expected to be efficient on Volta, Turing, Ampere, Ada, and Hopper architecture based NVIDIA GPUs. - -|**GPU**|**CUDA Compute Capability**|**Minimum CUDA Toolkit Required by CUTLASS-3**| -|---|---|---| -|NVIDIA V100 Tensor Core GPU |7.0|11.4| -|NVIDIA TitanV |7.0|11.4| -|NVIDIA GeForce RTX 20x0 series |7.5|11.4| -|NVIDIA T4 |7.5|11.4| -|NVIDIA A100 Tensor Core GPU |8.0|11.4| -|NVIDIA A10 |8.6|11.4| -|NVIDIA GeForce RTX 30x0 series |8.6|11.4| -|NVIDIA GeForce RTX 40x0 series |8.9|11.8| -|NVIDIA L40 |8.9|11.8| -|NVIDIA H100 Tensor Core GPU |9.0|11.8| -|NVIDIA H200 Tensor Core GPU |9.0|11.8| -|NVIDIA B200 Tensor Core GPU |10.0|12.8| -|NVIDIA GeForce RTX 50x0 series |10.0|12.8| - -## Target Architecture - -In general, PTX code generated for one target architecture can be run on future architectures -(i.e., it is forward compatible). 
-However, CUDA 12.0 introduced the concept of "architecture-accelerated features" whose
-PTX does not have forward compatibility guarantees.
-Several Hopper and Blackwell PTX instructions fall under this category of
-architecture-accelerated features, and thus require a `sm_90a` or `sm_100a` target architecture
-(note the "a" appended). For more details on this and other architecture-accelerated instructions,
-please refer to the [CUDA Documentation](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#feature-availability).
-
-The target architecture information is passed on to CUTLASS via the cmake flag
-`CUTLASS_NVCC_ARCHS`. In order to maximize performance on Hopper GH100,
-users are required to build CUTLASS with `90a` as the target architecture.
-If a user accidentally builds a kernel which uses SM90a features
-(e.g. Hopper Tensor Core Instructions), using the SM90 target
-(note the lack of "a"), with either CUDA Toolkit 12 or 11.8,
-the kernel is expected to fail with a runtime error.
-
-```
-cmake .. -DCUTLASS_NVCC_ARCHS="90a"
-```
-Or
-
-```
-cmake .. -DCUTLASS_NVCC_ARCHS="100a"
-```
-
-Note: The NVIDIA Blackwell SM100 architecture used in the datacenter
-products has a different compute capability than the one underpinning
-NVIDIA Blackwell GeForce RTX 50 series GPUs. As a result, kernels
-compiled for Blackwell SM100 architecture with arch conditional features
-(using `sm_100a`) are not compatible with RTX 50 series GPUs.
-
-Please refer to the [functionality documentation](functionality.md)
-for details on which kernels require which target architectures.
-
-# Documentation
-
-CUTLASS is described in the following documents and the accompanying
-[Doxygen documentation](https://nvidia.github.io/cutlass).
- -- [Quick Start Guide](quickstart.md) - basics of building and running CUTLASS -- [Functionality](functionality.md) - summarizes functionality available in CUTLASS -- [Efficient GEMM in CUDA](efficient_gemm.md) - describes how GEMM kernels may be implemented efficiently in CUDA -- [CUTLASS 3.x Design](cutlass_3x_design.md) - describes the CUTLASS 3.x design, its benefits, and how CuTe enables us to write much more composable components -- [GEMM API 3.x](gemm_api_3x.md) - describes the CUTLASS 3.x GEMM model and C++ template concepts -- [GEMM API 2.x](gemm_api.md) - describes the CUTLASS 2.x GEMM model and C++ template concepts -- [Implicit GEMM Convolution](implicit_gemm_convolution.md) - describes 2-D and 3-D convolution in CUTLASS -- [Code Organization](code_organization.md) - describes the organization and contents of the CUTLASS project -- [Terminology](terminology.md) - describes terms used in the code -- [Programming Guidelines](programming_guidelines.md) - guidelines for writing efficient modern CUDA C++ -- [Fundamental types](fundamental_types.md) - describes basic C++ classes used in CUTLASS to represent numeric quantities and arrays -- [Layouts](layout.md) - describes layouts of matrices and tensors in memory -- [Tile Iterators](tile_iterator_concept.md) - describes C++ concepts for iterating over tiles of matrices in memory -- [CUTLASS Profiler](profiler.md) - command-line driven profiling application -- [CUTLASS Utilities](utilities.md) - additional templates used to facilitate rapid development -- [Dependent kernel launch](dependent_kernel_launch.md) - describes a new feature in Hopper which allows overlapping dependent -kernels in the same stream, and how it is used in CUTLASS. 
-
-# Resources
-We have also described the structure of an efficient GEMM in our talk at the
-[GPU Technology Conference 2018](http://on-demand.gputechconf.com/gtc/2018/presentation/s8854-cutlass-software-primitives-for-dense-linear-algebra-at-all-levels-and-scales-within-cuda.pdf).
-
-- [CUTLASS: Software Primitives for Dense Linear Algebra at All Levels and Scales within CUDA](https://www.nvidia.com/en-us/on-demand/session/gtcsiliconvalley2018-s8854/)
-- [Developing CUDA Kernels to Push Tensor Cores to the Absolute Limit on NVIDIA A100](https://www.nvidia.com/en-us/on-demand/session/gtcsj20-s21745/)
-- [Accelerating Convolution with Tensor Cores in CUTLASS](https://www.nvidia.com/en-us/on-demand/session/gtcspring21-s31883/)
-- [Accelerating Backward Data Gradient by Increasing Tensor Core Utilization in CUTLASS](https://www.nvidia.com/en-us/on-demand/session/gtcspring22-s41996/)
-- [CUTLASS: Python API, Enhancements, and NVIDIA Hopper](https://www.nvidia.com/en-us/on-demand/session/gtcfall22-a41131/)
-
-# Building CUTLASS
-
-CUTLASS is a header-only template library and does not need to be built to be used by other
-projects. Client applications should target CUTLASS's `include/` directory in their include
-paths.
-
-CUTLASS unit tests, examples, and utilities can be built with CMake.
-The minimum version of CMake is given in the [Quickstart guide](quickstart.md).
-Make sure the `CUDACXX` environment variable points to NVCC in the CUDA Toolkit installed
-on your system.
-
-```bash
-$ export CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
-```
-
-Create a build directory within the CUTLASS project, then run CMake. By default CUTLASS will build kernels
-for CUDA architecture versions 5.0, 6.0, 6.1, 7.0, 7.5, 8.0, 8.6, 8.9, and 9.0.
-To reduce compile time you can specify
-the architectures to build CUTLASS for by changing the CMake configuration setting
-`CUTLASS_NVCC_ARCHS`.
-
-```bash
-$ mkdir build && cd build
-
-$ cmake ..
-DCUTLASS_NVCC_ARCHS=80 # compiles for NVIDIA's Ampere Architecture -``` - -From the `build/` directory, compile and run the CUTLASS unit tests by building the target `test_unit` with make. - -The unit tests are organized as several binaries mirroring the top-level namespaces of CUTLASS, -and they may be executed in parallel via make's `-j` command line argument. - -```bash -$ make test_unit -j -... -... -... -[----------] Global test environment tear-down -[==========] 946 tests from 57 test cases ran. (10812 ms total) -[ PASSED ] 946 tests. -``` - -All tests should pass on supported platforms, though the exact number of tests may vary over time. - - -# Project Structure - -CUTLASS is arranged as a header-only library along with Utilities, Tools, Examples, and unit tests. -[Doxygen documentation](https://nvidia.github.io/cutlass) provides a complete list of files, classes, -and template concepts defined in the CUTLASS project. - -A detailed explanation of the source code organization may be found in the -[CUTLASS documentation](code_organization.md), but several main components are summarized below. 
- -## CUTLASS Template Library - -``` -include/ # client applications should target this directory in their build's include paths - - cutlass/ # CUDA Templates for Linear Algebra Subroutines and Solvers - headers only - - arch/ # direct exposure of architecture features (including instruction-level GEMMs) - - conv/ # code specialized for convolution - - epilogue/ # code specialized for the epilogue of gemm/convolution - - gemm/ # code specialized for general matrix product computations - - layout/ # layout definitions for matrices, tensors, and other mathematical objects in memory - - platform/ # CUDA-capable Standard Library components - - reduction/ # bandwidth-limited reduction kernels that do not fit the "gemm" model - - thread/ # simt code that can be performed within a CUDA thread - - transform/ # code specialized for layout, type, and domain transformations - - * # core vocabulary types, containers, and basic numeric operations - - cute/ # CuTe Layout, layout algebra, MMA/Copy atoms, tiled MMA/Copy - - algorithm/ # Definitions of core operations such as copy, gemm, and operations on cute::tuples - - arch/ # Bare bones PTX wrapper structs for copy and math instructions - - atom/ # Meta-information either link to or built from arch/ operators - - mma_atom.hpp # cute::Mma_Atom and cute::TiledMma - - copy_atom.hpp # cute::Copy_Atom and cute::TiledCopy - - *sm*.hpp # Arch specific meta-information for copy and math operations - - * # Core library types such as Shape, Stride, Layout, Tensor, and associated operations - -``` - -### CUTLASS SDK Examples - -[CUTLASS SDK examples](https://github.com/NVIDIA/cutlass/tree/main/examples) apply CUTLASS templates to implement basic computations. 
- -### Tools - -``` -tools/ - library/ # CUTLASS Instance Library - contains instantiations of all supported CUTLASS templates - include/ - cutlass/ - library/ - - profiler/ # CUTLASS Profiler - command-line utility for executing operations in the - # CUTLASS Library - - util/ # CUTLASS Utilities - contains numerous helper classes for - include/ # manging tensors in device memory, reference - cutlass/ # implementations for GEMM, random initialization - util/ # of tensors, and I/O. -``` - -### Test - -The `test/unit/` directory consist of unit tests implemented with Google Test that demonstrate -basic usage of Core API components and complete tests of the CUTLASS GEMM computations. - -Instructions for building and running the Unit tests are described in the [Quickstart guide](quickstart.md). - -# Performance Profiling - -The `tools/profiler/` directory contains a command-line utility for launching each of the GEMM kernels. -It can be built as follows: - -```bash -$ make cutlass_profiler -j16 -``` -## Building all GEMM and Convolution kernels (_long_ build times) - -By default, only one tile size is instantiated for each data type, math instruction, and layout. -To instantiate all, set the following environment variable when running CMake from an empty `build/` directory. -Beware, this results in *tens of thousands* of kernels and long build times. -This would also result in a large binary size and on some platforms linker to fail on building the library. -Therefore, it's highly recommended to generate only a subset of kernels as demonstrated in the sub-section below. -```bash -$ cmake .. -DCUTLASS_NVCC_ARCHS=90a -DCUTLASS_LIBRARY_KERNELS=all -... -$ make cutlass_profiler -j16 -``` - -## Building a subset of GEMM and Convolution kernels (_reduced_ build times) - -To compile strictly one kernel or a small set of kernels, a comma-delimited list of kernel names with -wildcard characters may be used to reduce the set of kernels. 
The following examples show building exactly one -or a subset of kernels for NVIDIA Ampere and Turing architecture: - -### Building a subset Tensor Core GEMM kernels - -To compile a subset of Tensor Core GEMM kernels with FP32 accumulation and FP16 input targeting NVIDIA Ampere and Turing architecture, -use the below cmake command line: -```bash -$ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*gemm_f16_*_nt_align8 -... -$ make cutlass_profiler -j16 -``` - -Example command line for profiling a subset of Tensor Core GEMM kernels is as follows: -```bash -./tools/profiler/cutlass_profiler --kernels=cutlass_tensorop_s*gemm_f16_*_nt_align8 --m=3456 --n=4096 --k=4096 - -... -============================= - Problem ID: 1 - - Provider: CUTLASS - OperationKind: gemm - Operation: cutlass_tensorop_s1688gemm_f16_256x128_32x2_nt_align8 - - Status: Success - Verification: ON - Disposition: Passed - -reference_device: Passed - cuBLAS: Passed - - Arguments: --gemm_kind=universal --m=3456 --n=4096 --k=4096 --A=f16:column --B=f16:row --C=f32:column --alpha=1 \ - --beta=0 --split_k_slices=1 --batch_count=1 --op_class=tensorop --accum=f32 --cta_m=256 --cta_n=128 \ - --cta_k=32 --stages=2 --warps_m=4 --warps_n=2 --warps_k=1 --inst_m=16 --inst_n=8 --inst_k=8 --min_cc=75 \ - --max_cc=1024 - - Bytes: 118489088 bytes - FLOPs: 115992428544 flops - - Runtime: 1.55948 ms - Memory: 70.7616 GiB/s - - Math: 74378.8 GFLOP/s - - - -============================= -... -``` - -### Building one CUDA Core GEMM kernel - -To compile one SGEMM kernel targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: -```bash -$ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sgemm_128x128_8x2_nn_align1 -... 
-$ make cutlass_profiler -j16 -``` - -Example command line for profiling single SGEMM CUDA kernel is as follows: -```bash -$ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096 - -============================= - Problem ID: 1 - - Provider: CUTLASS - OperationKind: gemm - Operation: cutlass_simt_sgemm_128x128_8x2_nn_align1 - - Status: Success - Verification: ON - Disposition: Passed - - cuBLAS: Passed - - Arguments: --m=3456 --n=4096 --k=4096 --A=f32:column --B=f32:column --C=f32:column --alpha=1 --beta=0 --split_k_slices=1 \ - --batch_count=1 --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 --stages=2 --warps_m=4 \ - --warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 --max_cc=1024 - - Bytes: 180355072 bytes - FLOPs: 115992428544 flops - - Runtime: 6.73655 ms - Memory: 24.934 GiB/s - - Math: 17218.4 GFLOP/s - -============================= -``` - -### Building a subset of Tensor Core Convolution kernels - -To compile a subset of Tensor core convolution kernels implementing forward propagation (fprop) with FP32 accumulation -and FP16 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: -```bash -$ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*fprop_optimized_f16 -... -$ make cutlass_profiler -j16 -``` - -Example command line for profiling a subset of Tensor Core convolution kernels is as follows: - -```bash -$ ./tools/profiler/cutlass_profiler --kernels=cutlass_tensorop_s*fprop_optimized_f16 --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 - -... 
-============================= - Problem ID: 1 - - Provider: CUTLASS - OperationKind: conv2d - Operation: cutlass_tensorop_s16816fprop_optimized_f16_128x128_32x5_nhwc - - Status: Success - Verification: ON - Disposition: Passed - -reference_device: Passed - - Arguments: --conv_kind=fprop --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 --p=224 --q=224 --pad_h=1 --pad_w=1 \ - --stride_h=1 --stride_w=1 --dilation_h=1 --dilation_w=1 --Activation=f16:nhwc --Filter=f16:nhwc --Output=f32:nhwc \ - --conv_mode=cross --iterator_algorithm=optimized --alpha=1 --beta=0 --split_k_mode=serial --split_k_slices=1 \ - --eq_gemm_provider=none --op_class=tensorop --accum=f32 --cta_m=128 --cta_n=128 --cta_k=32 --stages=5 \ - --warps_m=2 --warps_n=2 --warps_k=1 --inst_m=16 --inst_n=8 --inst_k=16 --min_cc=80 --max_cc=1024 - - Bytes: 1130659840 bytes - FLOPs: 118482796544 flops - - Runtime: 0.711496 ms - Memory: 1479.99 GiB/s - - Math: 166526 GFLOP/s - -============================= -... -``` - - -### Building one Convolution CUDA kernel - -To compile and run one CUDA Core convolution kernel implementing forward propagation (fprop) with F32 accumulation -and FP32 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: -```bash -$ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc -... 
-$ make cutlass_profiler -j16 -``` - -Example command line for profiling one CUDA Core convolution kernel: - -```bash -$ ./tools/profiler/cutlass_profiler --kernels=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 - - -============================= - Problem ID: 1 - - Provider: CUTLASS - OperationKind: conv2d - Operation: cutlass_simt_sfprop_optimized_128x128_8x2_nhwc - - Status: Success - Verification: ON - Disposition: Passed - -reference_device: Passed - - Arguments: --conv_kind=fprop --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 --p=224 --q=224 --pad_h=1 --pad_w=1 \ - --stride_h=1 --stride_w=1 --dilation_h=1 --dilation_w=1 --Activation=f32:nhwc --Filter=f32:nhwc --Output=f32:nhwc \ - --conv_mode=cross --iterator_algorithm=optimized --alpha=1 --beta=0 --split_k_mode=serial --split_k_slices=1 \ - --eq_gemm_provider=none --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 --stages=2 --warps_m=4 \ - --warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 --max_cc=1024 - - Bytes: 2055798784 bytes - FLOPs: 118482796544 flops - - Runtime: 7.34266 ms - Memory: 260.752 GiB/s - - Math: 16136.2 GFLOP/s - - -============================= - -``` - -## More Details on Compiling CUTLASS Kernels and CUTLASS Profiler -- Please follow the links for more CMake examples on selectively compiling CUTLASS kernels: - - [GEMM CMake Examples](quickstart.md#gemm-cmake-examples) - - [Implicit GEMM convolution CMake Examples](quickstart.md#convolution-cmake-examples) -- [Further details about the CUTLASS Profiler are described here.](profiler.md) - - -# About - -CUTLASS is released by NVIDIA Corporation as Open Source software under the -[3-clause "New" BSD license](LICENSE.txt). - -# Contributors - -The official list of CUTLASS developers and contributors is available here: [CONTRIBUTORS](CONTRIBUTORS.md). - -# Copyright - -Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-SPDX-License-Identifier: BSD-3-Clause - -``` - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - 3. Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -``` diff --git a/media/docs/cpp/profiler.md b/media/docs/cpp/profiler.md index 58088dff..22f88485 100644 --- a/media/docs/cpp/profiler.md +++ b/media/docs/cpp/profiler.md @@ -45,7 +45,7 @@ compile or fail to launch at runtime. ```bash $ cmake .. 
\ -DCUTLASS_NVCC_ARCHS="90a" \ - -DCUTLASS_LIBRARY_KERNELS="cutlass3x_sm90_tensorop_s64x64x16gemm_f16_f16_f32_void_f32_*" \ + -DCUTLASS_LIBRARY_KERNELS="cutlass3x_sm90_tensorop_gemm_f16_f16_f32_void_f32_*" \ -DCUTLASS_LIBRARY_INSTANTIATION_LEVEL="max" \ -DCUTLASS_UNITY_BUILD_ENABLED=ON ``` @@ -525,7 +525,7 @@ To best illustrate this naming convention, we will walk through the meaning of e in a GEMM kernel used by the profiler: ``` -cutlass3x_sm90_tensorop_s64x128x16gemm_f16_f16_f32_f16_f32_{optional-mixed-dtype-config}_128x128x64_2x1x1_0_ntn_align8 +cutlass3x_sm90_tensorop_gemm_f16_f16_f32_f16_f32_{optional-mixed-dtype-config}_128x128x64_2x1x1_0_ntn_align8 ``` The components within this name are as follows: @@ -553,7 +553,7 @@ Note that in some special cases where the input A/B types do not match that of t instruction's, the MMA facing input type is added to the instruction string as well. ``` -cutlass3x_sm90_tensorop_s64x128x8tf32gemm_f32_f32_f32_f32_f32_128x128x32_2x1x1_0_tnn_align4 +cutlass3x_sm90_tensorop_tf32gemm_f32_f32_f32_f32_f32_128x128x32_2x1x1_0_tnn_align4 ``` * `s64x128x8tf32gemm`: indicates that the MMA consumes inputs in `tf32` format, and therefore @@ -563,7 +563,7 @@ For custom mainloop or epilogue schedules, details of the opted-in schedule are kernel name. For example, ``` -cutlass3x_sm90_tensorop_h64x128x16gemm_f16_f16_f16_void_f16_128x128x64_1x1x1_0_nnn_align8_warpspecialized_cooperative_epi_tma +cutlass3x_sm90_tensorop_gemm_f16_f16_f16_void_f16_128x128x64_1x1x1_0_nnn_align8_warpspecialized_cooperative_epi_tma ``` * `warpspecialized_cooperative`: Mainloop employs a persistent warp-specialized mainloop and kernel schedule. diff --git a/media/docs/cpp/programming_guidelines.md b/media/docs/cpp/programming_guidelines.md index b85108d9..5aa59744 100644 --- a/media/docs/cpp/programming_guidelines.md +++ b/media/docs/cpp/programming_guidelines.md @@ -1157,7 +1157,7 @@ has shape `((X, Y), K)` and stride `((1, X), X*Y)`. 
`get<0>(stride)` is the tuple `(1, X)`, not a single integer. However, A is certainly M major if interpreted as a matrix. -# Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/quickstart.md b/media/docs/cpp/quickstart.md index b728f7ed..388d2751 100644 --- a/media/docs/cpp/quickstart.md +++ b/media/docs/cpp/quickstart.md @@ -462,7 +462,7 @@ int main(int argc, char const **args) { } ``` -# CUTLASS Library +## CUTLASS Library The [CUTLASS Library](https://github.com/NVIDIA/cutlass/tree/main/tools/library) defines an API for managing and executing collections of compiled kernel instances and launching them from host code without template instantiations in client code. @@ -585,7 +585,7 @@ int main() { } ``` -# Example CMake Commands +## Example CMake Commands To instantiate all operations supporting all tile sizes, data types, and alignment constraints, specify `-DCUTLASS_LIBRARY_KERNELS=all` when running `cmake`. @@ -750,7 +750,7 @@ are needed in the mainloop builder: We encourage a user to refer to Sm100 unit tests and the generated profiler-based kernels as more comprehensive samples. -# Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/terminology.md b/media/docs/cpp/terminology.md index 1c5d31ea..6ec6158c 100644 --- a/media/docs/cpp/terminology.md +++ b/media/docs/cpp/terminology.md @@ -78,7 +78,10 @@ replaced by [MMA and Copy atoms from CuTe](cute/0t_mma_atom.md). **Thread Map**: abstraction for defining how threads are mapped to a given tile. Deprecated starting CUTLASS 3.0. Replaced by `cute::Layout` in equivalent usage scenarios to represent thread tensors. -# Copyright +[comment]: <> (Don't remove this. This "##" is to prevent Sphinx from throwing build WARNING.) 
+## + +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/tile_iterator_concept.md b/media/docs/cpp/tile_iterator_concept.md index 63a3eb0b..0da69d7c 100644 --- a/media/docs/cpp/tile_iterator_concept.md +++ b/media/docs/cpp/tile_iterator_concept.md @@ -469,7 +469,7 @@ struct WriteableReadableRandomAccessContiguousTileIteratorConcept { }; ``` -# Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/utilities.md b/media/docs/cpp/utilities.md index b6dffe05..ab45b617 100644 --- a/media/docs/cpp/utilities.md +++ b/media/docs/cpp/utilities.md @@ -431,7 +431,7 @@ Additional information may appear at the end of each line, such as shared memory Please note that `synclog` is an experimental feature, and its functionality is not always guaranteed. We encourage its use in custom kernels and CUTLASS examples, though it is known to be incompatible with profiler kernels. -# Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/pythonDSL/cute_dsl.rst b/media/docs/pythonDSL/cute_dsl.rst new file mode 100644 index 00000000..71fa4f7f --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl.rst @@ -0,0 +1,18 @@ +.. _cute_dsl: + +CuTe DSL +======== + +.. toctree:: + :maxdepth: 1 + + DSL Introduction + DSL Code Generation + DSL Control Flow + DSL JIT Argument Generation + DSL JIT Argument: Layouts + DSL JIT Caching + Integration with Frameworks + Debugging with the DSL + Autotuning with the DSL + Educational Notebooks diff --git a/media/docs/pythonDSL/cute_dsl_api.rst b/media/docs/pythonDSL/cute_dsl_api.rst new file mode 100644 index 00000000..c4726eb3 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_api.rst @@ -0,0 +1,12 @@ +.. 
_cute_dsl_api: + +CuTe DSL API +============ + +.. toctree:: + :maxdepth: 1 + + cute + cute_arch + cute_nvgpu + utils diff --git a/media/docs/pythonDSL/cute_dsl_api/cute.rst b/media/docs/pythonDSL/cute_dsl_api/cute.rst new file mode 100644 index 00000000..bd5d5c56 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_api/cute.rst @@ -0,0 +1,11 @@ +.. _cute: + +cutlass.cute +============ + +.. automodule:: cutlass.cute + :members: + :undoc-members: + :show-inheritance: + :special-members: __init__ + :private-members: diff --git a/media/docs/pythonDSL/cute_dsl_api/cute_arch.rst b/media/docs/pythonDSL/cute_dsl_api/cute_arch.rst new file mode 100644 index 00000000..4e2d4d0d --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_api/cute_arch.rst @@ -0,0 +1,24 @@ +.. _cute_arch: + +cutlass.cute.arch +================= + +The ``cute.arch`` module contains wrappers around NVVM-level MLIR Op builders that seamlessly +inter-operate with the Python types used in CUTLASS Python. Another benefit of wrapping these Op +builders is that the source location can be tracked with the ``@dsl_user_op`` decorator. Available +functions include + +- basic API like ``thr_idx``; +- functions related to the direct management of mbarriers; +- low-level SMEM management (prefer using the ``SmemAllocator`` class); +- TMEM management. + +API documentation +----------------- + +.. automodule:: cutlass.cute.arch + :members: + :undoc-members: + :show-inheritance: + :special-members: __init__ + :private-members: diff --git a/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu.rst b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu.rst new file mode 100644 index 00000000..4f5d18ae --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu.rst @@ -0,0 +1,18 @@ +.. _cute_nvgpu: + +cutlass.cute.nvgpu +================== + +The ``cute.nvgpu`` module contains MMA and Copy Operations as well as Operation-specific helper +functions. 
The arch-agnostic Operations are exposed at the top-level while arch-specific Operations +are grouped into submodules like ``tcgen05``. + +.. toctree:: + :maxdepth: 2 + :hidden: + + cute_nvgpu_common + cute_nvgpu_warp + cute_nvgpu_warpgroup + cute_nvgpu_cpasync + cute_nvgpu_tcgen05 diff --git a/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_common.rst b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_common.rst new file mode 100644 index 00000000..fd1013ed --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_common.rst @@ -0,0 +1,9 @@ +.. _cute_nvgpu_common: + +Common +====== + +.. automodule:: cutlass.cute.nvgpu + :members: + :undoc-members: + :show-inheritance: diff --git a/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_cpasync.rst b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_cpasync.rst new file mode 100644 index 00000000..84c22871 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_cpasync.rst @@ -0,0 +1,10 @@ +.. _cute_nvgpu_cpasync: + +cpasync submodule +================= + +.. automodule:: cutlass.cute.nvgpu.cpasync + :members: + :undoc-members: + :show-inheritance: + :special-members: __init__ diff --git a/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_tcgen05.rst b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_tcgen05.rst new file mode 100644 index 00000000..ee2c6f35 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_tcgen05.rst @@ -0,0 +1,10 @@ +.. _cute_nvgpu_tcgen05: + +tcgen05 submodule +================= + +.. automodule:: cutlass.cute.nvgpu.tcgen05 + :members: + :undoc-members: + :show-inheritance: + :special-members: __init__ diff --git a/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_warp.rst b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_warp.rst new file mode 100644 index 00000000..bda907f4 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_warp.rst @@ -0,0 +1,10 @@ +.. _cute_nvgpu_warp: + +warp submodule +============== + +.. 
automodule:: cutlass.cute.nvgpu.warp + :members: + :undoc-members: + :show-inheritance: + :special-members: __init__ diff --git a/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_warpgroup.rst b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_warpgroup.rst new file mode 100644 index 00000000..441f2305 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_warpgroup.rst @@ -0,0 +1,10 @@ +.. _cute_nvgpu_warpgroup: + +warpgroup submodule +=================== + +.. automodule:: cutlass.cute.nvgpu.warpgroup + :members: + :undoc-members: + :show-inheritance: + :special-members: __init__ diff --git a/media/docs/pythonDSL/cute_dsl_api/utils.rst b/media/docs/pythonDSL/cute_dsl_api/utils.rst new file mode 100644 index 00000000..086bef60 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_api/utils.rst @@ -0,0 +1,9 @@ +cutlass.utils +============= + +.. automodule:: cutlass.utils + :members: + :undoc-members: + :show-inheritance: + :special-members: __init__ + :private-members: diff --git a/media/docs/pythonDSL/cute_dsl_general/autotuning_gemm.rst b/media/docs/pythonDSL/cute_dsl_general/autotuning_gemm.rst new file mode 100644 index 00000000..db76c8a7 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_general/autotuning_gemm.rst @@ -0,0 +1,154 @@ +.. _autotuning_gemm: + +Guidance for Auto-Tuning +============================= + +.. contents:: Table of Contents + :depth: 2 + :local: + +Numerous GEMM kernel code examples are offered within our codebase. +When integrating these kernels into frameworks, auto-tuning becomes essential +for achieving optimal performance. This involves selecting the appropriate +kernel parameters based on the inputs of real applications. +Next, we'll briefly introduce some tips on how to perform auto-tuning. + +The auto-tuning process typically involves the following steps: + +1. Define search space +2. Benchmark each configuration and select the kernel with the best performance +3. 
Enable caching to reduce the tuning cost + +The search space defines the valid combinations of kernel parameters that can be used to run the kernels. +Different inputs (shapes, data types, etc.) typically require different kernel parameters to achieve optimal performance. +The search space is related to the kernel. We take the Blackwell GEMM persistent kernel as an example. +The search space is as follows: + +- ``mma_tiler_mn``: Defines the dimensions of the matrix tile that each Matrix Multiply-Accumulate (MMA) instruction processes in a single operation. +- ``cluster_shape_mn``: Specifies the number of CTAs along each dimension within a cluster. Refer `Parallel Thread Execution ISA documentation `_ for the possible mma tiler size and cluster shape for different tensor data types. +- ``use_2cta_instrs``: Whether to utilize Blackwell's 2 CTA instructions for MMA/Copy. +- ``use_tma_store``: Whether to use Tensor Memory Access (TMA) instructions to store the result back to global memory. + +After defining the search space, we could traverse all parameter combinations to find the optimal kernel. +The ``autotune_gemm`` function below demonstrates a simple exhaustive search approach - it iterates +through configurations, compiles and benchmarks each kernel, and returns the best performing one. +Since kernel compilation incurs overhead, it's important to cache and reuse compiled kernels +to minimize host launch latency. CuTe DSL facilitates this through its separate compilation +and execution workflow. More details can be found in :ref:`JIT_Caching`. +As demonstrated in the ``autotune_gemm`` function +(between the ``begin of cache the compiled GEMM kernel`` and ``end of cache the compiled GEMM kernel`` comments), +we can use ``cute.compile()`` to compile a kernel once, cache the compiled result, and reuse the cached JIT executor for multiple kernel +executions. 
We could maintain a global configuration-to-kernel dictionary (``config_kernel_dict``) to cache the compiled GEMM kernels, +where each key (``kernel_cache_key``) uniquely identifies a kernel based on its characteristics. +Usually we could use the {dtype + kernel configs} as the cached key for GEMM compilation. For example, + +.. code-block:: python + + kernel_cache_key = f"{ab_dtype}x{c_dtype}x{acc_dtype}x{use_2cta_instrs}x{mma_tiler}x{cluster_shape_mn}x{use_tma_store}" + +If the input tensor's layout is static, we should add the shape in the cached key too. +Users can customize the ``benchmark`` function to measure kernel execution time. +For stable and reliable performance measurements: + +1. Run a few warmup iterations (e.g., 5-10) to stabilize GPU temperature +2. Execute multiple timed iterations (e.g., 100-1000) for statistical significance +3. Use CUDA events and synchronization for precise timing +4. Lock GPU frequencies (SM and memory frequencies) with nvidia-smi +5. Process results by removing outliers and using min/avg statistics as measurements. + +This ensures reliable kernel selection through proper benchmarking. + +.. 
code-block:: python + + # get the best GEMM kernel for given input tensors + def autotune_gemm( + a: cute.Tensor, + b: cute.Tensor, + c: cute.Tensor, + stream: cuda.CUstream, + use_2cta_instrs_list: List[bool] = [True], + use_tma_store_list: List[bool] = [True], + mma_tiler_m_list: List[int] = [256], + mma_tiler_n_list: List[int] = [256], + cluster_shape_m_list: List[int] = [2], + cluster_shape_n_list: List[int] = [1], + ): + best_kernel = None + min_time = float("inf") + # traverse the search space + for use_2cta_instrs in use_2cta_instrs_list: + for use_tma_store in use_tma_store_list: + for mma_tiler_mn in product(mma_tiler_m_list, mma_tiler_n_list): + for cluster_shape_mn in product(cluster_shape_m_list, cluster_shape_n_list): + acc_dtype = cutlass.Float32 + hardware_info = cutlass.utils.HardwareInfo() + max_active_clusters = hardware_info.get_max_active_clusters( + cluster_shape_mn[0] * cluster_shape_mn[1] + ) + # instance a GEMM kernel + gemm = PersistentDenseGemmKernel( + acc_dtype, + use_2cta_instrs, + mma_tiler_mn, + cluster_shape_mn, + use_tma_store, + ) + # begin of cache the compiled GEMM kernel + if kernel_cache_key not in config_kernel_dict: + # compile gemm kernel + compiled_gemm = cute.compile( + gemm, + a, + b, + c, + max_active_clusters, + stream, + ) + config_kernel_dict[kernel_cache_key] = compiled_gemm + else: + compiled_gemm = config_kernel_dict[kernel_cache_key] + # end of cache the compiled GEMM kernel + try: + # define a benchmark function to measure the execution time of the compiled GEMM kernel + cur_time = benchmark( + partial(compiled_gemm, a, b, c, stream), + ) + except Exception as e: + print(f"Execution error: {e}") + cur_time = float("inf") + if cur_time < min_time: + min_time = cur_time + best_kernel = compiled_gemm + if best_kernel is None: + raise ValueError("No best kernel found") + return best_kernel + +This brute-force approach ensures we could find the optimal parameters, though at the cost of trying every possibilities. 
+ +For more advanced use cases, users can explore sophisticated optimization +techniques like search space pruning and genetic algorithms to reduce tuning overhead and discover better +configurations more efficiently. + +To further optimize tuning performance, we can utilize caching mechanisms to avoid redundant computations. +We could cache the tuning results in an input-to-kernel dictionary (e.g., ``input_kernel_dict``). +When processing inputs with matching ``config_key`` values, the cached kernel can be reused directly without re-tuning. +The ``config_key`` is related to the input tensor's characteristics, such as the shape, data type, etc. +The setup of ``config_key`` is very flexible; users can customize it based on their own application. +For instance, if the data type is fixed in the user's application, we could use the input tensor's shape as the key, i.e., ``(m, n, k)``. +To further reduce tuning overhead, we could consider using a simplified key like ``config_key = (power_of_2(m), power_of_2(n), power_of_2(k))``, +where ``m``, ``n``, and ``k`` are rounded up to the nearest power of 2. This simplification can significantly reduce the number +of unique keys while still maintaining good performance in most cases. However, it's important to validate that this +approximation doesn't negatively impact performance for your specific use case. + +.. code-block:: python + + config_key = (m, n, k) + if config_key in input_kernel_dict: + compiled_gemm = input_kernel_dict[config_key] + else: + compiled_gemm = autotune_gemm(...) + input_kernel_dict[config_key] = compiled_gemm + # launch gemm kernel + compiled_gemm(a_tensor, b_tensor, c_tensor, stream) + +By following the methods above, you can customize your own auto-tuner to find the optimal GEMM kernel configuration +for specific matrix dimensions and data types, significantly improving computational performance for models. 
diff --git a/media/docs/pythonDSL/cute_dsl_general/debugging.rst b/media/docs/pythonDSL/cute_dsl_general/debugging.rst new file mode 100644 index 00000000..649aa608 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_general/debugging.rst @@ -0,0 +1,133 @@ +.. _debugging: + +Debugging +========= + +.. contents:: Table of Contents + :depth: 2 + :local: + +This page provides an overview of debugging techniques and tools for CuTe DSL programs. + + +Getting Familiar with the Limitations +------------------------------------- + +Before diving into comprehensive debugging capabilities, it's important to understand the limitations of CuTe DSL. +Understanding these limitations will help you avoid potential pitfalls from the start. + +Please refer to :doc:`../limitations` for more details. + + +DSL Debugging +------------- + +CuTe DSL provides built-in logging mechanisms to help you understand the code execution flow and +some of the internal state. + +Enabling Logging +~~~~~~~~~~~~~~~~ + +CuTe DSL provides environment variables to control logging level: + +.. 
code:: bash + + # Enable console logging (default: False) + export CUTE_DSL_LOG_TO_CONSOLE=1 + + # Log to file instead of console (default: False) + export CUTE_DSL_LOG_TO_FILE=my_log.txt + + # Control log verbosity (0, 10, 20, 30, 40, 50, default: 10) + export CUTE_DSL_LOG_LEVEL=20 + + +Log Categories and Levels +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Similar to standard Python logging, different log levels provide varying degrees of detail: + ++--------+-------------+ +| Level | Description | ++========+=============+ +| 0 | Disabled | ++--------+-------------+ +| 10 | Debug | ++--------+-------------+ +| 20 | Info | ++--------+-------------+ +| 30 | Warning | ++--------+-------------+ +| 40 | Error | ++--------+-------------+ +| 50 | Critical | ++--------+-------------+ + + +Dump the generated IR +~~~~~~~~~~~~~~~~~~~~~ + +For users familiar with MLIR and compilers, CuTe DSL supports dumping the Intermediate Representation (IR). +This helps you verify whether the IR is generated as expected. + +.. code:: bash + + # Dump Generated CuTe IR (default: False) + export CUTE_DSL_PRINT_IR=1 + + # Keep Generated CuTe IR in a file (default: False) + export CUTE_DSL_KEEP_IR=1 + + + +Kernel Functional Debugging +---------------------------- + +Using Python's ``print`` and CuTe's ``cute.printf`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +CuTe DSL programs can use both Python's native ``print()`` as well as our own ``cute.printf()`` to +print debug information during kernel generation and execution. They differ in a few key ways: + +- Python's ``print()`` executes during compile-time only (no effect on the generated kernel) and is + typically used for printing static values (e.g. a fully static layouts). +- ``cute.printf()`` executes at runtime on the GPU itself and changes the PTX being generated. This + can be used for printing values of tensors at runtime for diagnostics, but comes at a performance + overhead similar to that of `printf()` in CUDA C. 
+ +For detailed examples of using these functions for debugging, please refer to the associated +notebook referenced in :doc:`notebooks`. + +Handling Unresponsive/Hung Kernels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When a kernel becomes unresponsive and ``SIGINT`` (``CTRL+C``) fails to terminate it, +you can follow these steps to forcefully terminate the process: + +1. Use ``CTRL+Z`` to suspend the unresponsive kernel +2. Execute the following command to terminate the suspended process: + +.. code:: bash + + # Terminate the most recently suspended process + kill -9 $(jobs -p | tail -1) + + +CuTe DSL can also be debugged using standard NVIDIA CUDA tools. + +Using Compute-Sanitizer +~~~~~~~~~~~~~~~~~~~~~~~ + +For detecting memory errors and race conditions: + +.. code:: bash + + compute-sanitizer --some_options python your_dsl_code.py + +Please refer to the `compute-sanitizer documentation `_ for more details. + +Conclusion +---------- + +This page covered several key methods for debugging CuTe DSL programs. Effective debugging typically requires a combination of these approaches. +If you encounter issues with DSL, you can enable logging and share the logs with the CUTLASS team as a GitHub issue to report a bug. diff --git a/media/docs/pythonDSL/cute_dsl_general/dsl_code_generation.rst b/media/docs/pythonDSL/cute_dsl_general/dsl_code_generation.rst new file mode 100644 index 00000000..b4b463d4 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_general/dsl_code_generation.rst @@ -0,0 +1,90 @@ +.. _dsl_code_generation: +.. |DC| replace:: dynamic compilation +.. |DSL| replace:: CuTe DSL +.. |IR| replace:: intermediate representation (IR) + +End-to-End Code Generation +========================== + +.. contents:: + :depth: 2 + :local: + + +1. Techniques for Turning Python into |IR| +------------------------------------------ + +1.1 AST rewrite +^^^^^^^^^^^^^^^^ +The function’s abstract-syntax tree is analysed **before** execution. 
+Python control-flow (``for``/``while``, ``if``/``else``) and built-ins are converted to structured |IR| +constructs. Computation inside each region is left untouched at this stage. + +*Advantages* + +* Sees the entire program, so every branch and loop is preserved. +* Keeps loop structure intact for optimization such as tiling, vectorisation + or GPU thread mapping. + +*Disadvantages* + +* Requires a well-defined Python subset that the rewriter understands. + + +1.2 Tracing +^^^^^^^^^^^ +The decorated function is executed once with *proxy* arguments; overloaded +operators record every tensor operation that actually runs and produce a flat +trace that is lowered to |IR|. + +*Advantages* + +* Near-zero compile latency, ideal for straight-line arithmetic. +* No need to parse Python source, so it supports many dynamic Python + features, and Python has many features. + +*Disadvantages* + +* Untaken branches vanish, so the generated kernel may be wrong for other + inputs. +* Loops are flattened to the iteration count observed during tracing. +* Data-dependent control-flow freezes to a single execution path. + + +2. |DSL| Code-Generation Modes +------------------------------ + +CuTe’s Python front-end combines the techniques above into **two mutually +exclusive modes**, selectable with the ``preprocessor`` flag of the +``@jit`` decorator: + +1. Tracing mode ``@jit(preprocess=False)`` – tracing only. +This results in the fastest compilation path and is recommended only for kernels that are guaranteed to be +straight-line arithmetic. It suffers from all tracing limitations listed in the previous section. + +2. Preprocessor mode (**default**) ``@jit(preprocess=True)`` – **AST rewrite + tracing**. +The AST pass captures every loop and branch, eliminating the correctness and +optimisation problems of pure tracing; tracing then fills in the arithmetic. 
+This hybrid “preprocessor” pipeline is unique to |DSL| and was designed +specifically to overcome the disadvantages identified above. + +.. figure:: dsl_modes.png + :width: 400 + :align: center + + *Left*: tracing mode records only the path that executed. + *Right*: preprocessor mode emits structured |IR| for every branch and loop + before tracing the arithmetic. + + +Why Tracing-Only Is Insufficient for Control-Flow +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* **Branch loss** – The untaken side of an ``if``/``else`` is never lowered. +* **Loop unrolling** – Loops are flattened to the iteration count observed, + destroying structure needed for parallel mapping and tiling. +* **Data-dependent paths** – Control-flow that depends on tensor values freezes + to a single execution path at trace time. + +The preprocessor mode fixes all of these by lowering control-flow first and delegating +only the arithmetic to the tracer. diff --git a/media/docs/pythonDSL/cute_dsl_general/dsl_control_flow.rst b/media/docs/pythonDSL/cute_dsl_general/dsl_control_flow.rst new file mode 100644 index 00000000..a16c79c3 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_general/dsl_control_flow.rst @@ -0,0 +1,140 @@ +.. _dsl_control_flow: +.. |DC| replace:: dynamic compilation +.. |IR| replace:: intermediate representation (IR) +.. |DSL| replace:: CuTe DSL +.. |Constexpr| replace:: **Constexpr** (compile-time Python value) + +|DSL| Control Flow +================== +.. contents:: + :depth: 2 + :local: + + +Overview +-------- +|DSL| walks Python’s AST and converts each control-flow construct it finds into +structured |IR|. You can therefore write ordinary Python loops and branches +while the compiler decides—statement by statement—whether to + +* **evaluate at compile time** if the controlling value is a |Constexpr|, or +* **emit intermediate representation (IR)** when the value is dynamic. 
+ + +For a high-level discussion of the overall pipeline, see +:doc:`the code-generation overview `. + +For Loops +--------- +|DSL| recognises three kinds of ranges for ``for`` loops: + +* ``range`` – the Python built-in +* ``cutlass.range_dynamic`` – always lowers to |IR| +* ``cutlass.range_constexpr`` – always unrolls at compile time + + +range(...) +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The AST rewriter inserts a small helper stub. At runtime the loop bounds are +inspected: + +* **Constant bounds** → the loop is unrolled at compile time. +* **Dynamic bounds** → the loop is emitted as structured |IR|. + + +cutlass.range_dynamic(...) +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Use when you *always* want a loop in the generated |IR|, even if the bounds +look constant. + + +cutlass.range_constexpr(...) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Runs in the Python interpreter and is fully unrolled before code generation. +All loop indices must be |Constexpr|. + + +Limitations of Dynamic For Loops +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* Early-exit ``break``, ``continue``, or raising exception are not yet supported. +* Operations in the loop body are traced only when tracing is active in that + region. + + +**Example:** + +.. code-block:: python + + @cute.jit + def loop_example(): + n = 10 + + # ❌ This loop is dynamic, early-exit isn't allowed. + for i in cutlass.range_dynamic(n): + if i == 5: + break # Early-exit + cute.printf("%d\\n", i) + + # ✅ This loop is constexpr, early-exit is allowed. + for i in cutlass.range_constexpr(n): + if i == 5: + break # Early-exit + cute.printf("%d\\n", i) + +If-Else Statements +------------------ + +Standard Python ``if``/``else`` is supported. + +* **Predicate is Constexpr (compile-time Python value)** → evaluated at compile time. +* **Predicate is dynamic** → lowered to |IR|. + +**Example:** + +.. 
code-block:: python + + @cute.jit + def main(const_var: cutlass.Constexpr, dynamic_var: cutlass.Int32): + if const_var: # compile-time branch + cute.printf("Const branch\\n") + else: + cute.printf("Const else\\n") + + if dynamic_var == 10: # dynamic branch + cute.printf("Dynamic True\\n") + else: + cute.printf("Dynamic False\\n") + +Similarly to for-loops, the ``if cutlass.const_expr`` and ``if cutlass.dynamic_expr`` constructs can +be used to force the evaluation at compile-time or the generation of IR, respectively. Unstructured +control flow is only supported when using ``if cutlass.const_expr``. + +While Loops +----------- + +Python ``while`` loops are always treated as **dynamic** because the loop condition may become +dynamic after the first iteration. Similarly to for-loops and ``if``/``else``, the +``while cutlass.const_expr`` and ``while cutlass.dynamic_expr`` constructs are available. + +Compile-Time Metaprogramming +---------------------------- + +Mix compile-time constructs with normal |DSL| code to generate specialised +kernels without runtime overhead. A compile-time flag can, for example, toggle +an optional **ReLU** epilogue: + +.. code-block:: python + + @cute.kernel + def gemm(..., do_relu: cutlass.Constexpr): + # main GEMM work + ... + if const_expr(do_relu): # compile-time guard + # ReLU code is emitted only when do_relu is True + ... + +.. code-block:: text + + gemm(..., False) # ReLU is omitted from the generated |IR| + gemm(..., True) # ReLU is included diff --git a/media/docs/pythonDSL/cute_dsl_general/dsl_dynamic_layout.rst b/media/docs/pythonDSL/cute_dsl_general/dsl_dynamic_layout.rst new file mode 100644 index 00000000..9c5cca7d --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_general/dsl_dynamic_layout.rst @@ -0,0 +1,198 @@ +.. _dsl_dynamic_layout: +.. |DSL| replace:: CuTe DSL +.. |SLAY| replace:: static layout +.. |DLAY| replace:: dynamic layout + +.. 
contents:: Table of Contents + :depth: 2 + :local: + +Static vs Dynamic layouts +========================= + +Static Layout +------------- + +When integrating with popular deep learning frameworks, one question is how to deal with the layout of the converted ``cute.Tensor``. +For example, when converting a ``torch.Tensor`` to a ``cute.Tensor``, the shape of the ``torch.Tensor`` is honored for the layout of +``cute.Tensor``. + +.. code-block:: python + + import torch + import cutlass + from cutlass.cute.runtime import from_dlpack + + @cute.jit + def foo(tensor): + print(f"tensor.layout: {tensor.layout}") # Prints tensor layout at compile time + cute.printf("tensor: {}", tensor) # Prints tensor values at runtime + +In this example, we define a JIT function ``foo`` that takes a ``cute.Tensor`` as input and prints its layout. Note +that Python print is used to print the layout at compile time. This works fine for |SLAY| whose value is known at +compile time. + +Now let's try to run the JIT function ``foo`` with different shapes of the input ``torch.Tensor``. + +.. code-block:: python + + a = torch.tensor([1, 2, 3], dtype=torch.uint16) + a_pack = from_dlpack(a) + compiled_func = cute.compile(foo, a_pack) + compiled_func(a_pack) + +Here we first convert a 1D ``torch.Tensor`` with 3 elements to a ``cute.Tensor`` using ``from_dlpack``. Then we compile +the JIT function ``foo`` with the converted ``cute.Tensor`` and call the compiled function. + +:: + + tensor.layout: (3):(1) + tensor: raw_ptr(0x00000000079e5100: i16, generic, align<2>) o (3):(1) = + ( 1, 2, 3 ) + +It prints ``(3):(1)`` for the layout because the converted ``cute.Tensor`` has a |SLAY| with shape ``(3)`` which +is the shape of the ``a``. 
+ +Now if we call the compiled function with a different shape of the input ``torch.Tensor``, it would result in an unexpected +result at runtime due to the mismatch of the type since ``compiled_func`` expects a ``cute.Tensor`` with layout ``(3):(1)`` +while ``b`` has shape ``(5)``. + +.. code-block:: python + + b = torch.tensor([11, 12, 13, 14, 15], dtype=torch.uint16) + b_pack = from_dlpack(b) + compiled_func(b_pack) # ❌ This results in an unexpected result at runtime due to type mismatch + +Following is the output which is unexpected due to the type mismatch. + +:: + + tensor: raw_ptr(0x00000000344804c0: i16, generic, align<2>) o (3):(1) = + ( 11, 12, 13 ) + +To fix that, we would have to trigger another code generation and compilation for the new shape for ``b``. + +.. code-block:: python + + compiled_func_2 = cute.compile(foo, b_pack) # This would trigger another compilation + compiled_func_2(b_pack) # ✅ Now this works fine + +As shown in the example above, with the newly compiled ``compiled_func_2``, we can pass in ``b_pack`` to the compiled +JIT function ``compiled_func_2``. + +:: + + tensor.layout: (5):(1) + tensor: raw_ptr(0x0000000034bb2840: i16, generic, align<2>) o (5):(1) = + ( 11, 12, 13, 14, 15 ) + +Now it recompiles and prints the values of ``b`` correctly. + +It's obvious that we need distinct code generated and compiled for different static layouts. In this case, one for layout +``(3):(1)`` and the other for layout ``(5):(1)``. + +Dynamic Layout +-------------- + +In order to avoid generating and compiling multiple times for different shapes of the input ``torch.Tensor``, |DSL| provides a way to +generate and compile a JIT function with a |DLAY|. + +To get a dynamic layout for the ``cute.Tensor``, a ``torch.Tensor`` object can be passed into the JIT function directly, which instructs +|DSL| to call ``cute.mark_layout_dynamic`` automatically on the converted ``cute.Tensor`` per the leading dimension of the layout. + +.. 
code-block:: python + + import torch + import cutlass + from cutlass.cute.runtime import from_dlpack + + @cute.jit + def foo(tensor): + print(tensor.layout) # Prints (?,?):(?,1) for dynamic layout + + a = torch.tensor([[1, 2], [3, 4]], dtype=torch.uint16) + compiled_func = cute.compile(foo, a) + compiled_func(a) + + b = torch.tensor([[11, 12], [13, 14], [15, 16]], dtype=torch.uint16) + compiled_func(b) # Reuse the same compiled function for different shape + +In the example above, a single compilation of the JIT function ``foo`` is reused for different shapes of the input ``torch.Tensor``. +This is possible because the converted ``cute.Tensor`` has a |DLAY| ``(?,?):(?,1)`` which is compatible with the shape of the +input ``torch.Tensor`` of both calls. + +Alternatively, for compact layout, ``cute.mark_compact_shape_dynamic`` can be called for a finer-grained control to specify the mode +of the layout for dynamic and the divisibility constraint for the dynamic dimension. + +Refer to :doc:`framework_integration` for more details on ``from_dlpack``, ``mark_layout_dynamic``, +and ``mark_compact_shape_dynamic``. + +Static Layout vs. Dynamic Layout +-------------------------------- + +Per the previous sections, we have seen that |SLAY| leads to distinct JIT code generations while |DLAY| leads to a single +compilation for different shapes. + +That said, creating JIT function with |SLAY| is useful when the use cases targeting input data with fixed shapes. +Since more information is available at compile time, the compiler would be able to kick in optimizations that otherwise would not +be possible for the code generated for |DLAY|. + +On the other hand, |DLAY| would be more flexible for the cases where the input data has varying shapes. This provides more +scalability of the generated code to deal with varying input data of different shapes. 
+ +Programming with Static and Dynamic Layout +------------------------------------------ + +|DSL| provides intuitive way to program with static and |DLAY| in the codes. + +.. code-block:: python + + import torch + import cutlass + from cutlass.cute.runtime import from_dlpack + + @cute.jit + def foo(tensor, x: cutlass.Constexpr[int]): + print(cute.size(tensor)) # Prints 3 for the 1st call + # Prints ? for the 2nd call + if cute.size(tensor) > x: + cute.printf("tensor[2]: {}", tensor[2]) + else: + cute.printf("tensor size <= {}", x) + + a = torch.tensor([1, 2, 3], dtype=torch.uint16) + foo(from_dlpack(a), 3) # First call with static layout + + b = torch.tensor([1, 2, 3, 4, 5], dtype=torch.uint16) + foo(b, 3) # Second call with dynamic layout + +In this example, the JIT function ``foo`` is compiled with a |SLAY| ``(3):(1)`` for the first call, which means the +size of the tensor is known at compile time. |DSL| makes good use of this and automatically handles the if condition at the +compile time. Hence the generated codes are efficient without the if condition at all. + +For the second call, the JIT function ``foo`` is compiled with a |DLAY| ``(?):(1)`` hence the tensor size is only +evaluated at runtime. |DSL| automatically generates the code to handle the |DLAY| and the if condition at runtime. + +The same applies to loop as well: + +.. code-block:: python + + @cute.jit + def foo(tensor, x: cutlass.Constexpr[int]): + for i in range(cute.size(tensor)): + cute.printf("tensor[{}]: {}", i, tensor[i]) + + a = torch.tensor([1, 2, 3], dtype=torch.uint16) + foo(from_dlpack(a), 3) # First call with static layout + + b = torch.tensor([1, 2, 3, 4, 5], dtype=torch.uint16) + foo(b, 3) # Second call with dynamic layout + +With the static layout in the first call, |DSL| is able to fully unroll the loop at compile time. While in the second call, +the generated codes will have the loop executed at runtime based on the |DLAY|. 
+ +With the single JIT function implementation, |DSL| is able to handle control-flow constructs and automatically generate +the optimized codes for different cases. This is all possible because |DSL| is able to walk the Python AST and convert +each control-flow construct it finds accordingly. + +Please refer to :doc:`dsl_control_flow` for more details. diff --git a/media/docs/pythonDSL/cute_dsl_general/dsl_introduction.rst b/media/docs/pythonDSL/cute_dsl_general/dsl_introduction.rst new file mode 100644 index 00000000..ca409771 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_general/dsl_introduction.rst @@ -0,0 +1,128 @@ +.. _dsl_introduction: +.. |DC| replace:: dynamic compilation +.. |IR| replace:: IR +.. |DSL| replace:: CuTe DSL + + +|DSL| +====================== + +.. contents:: Table of Contents + :depth: 2 + :local: + +Overview +-------- + +|DSL| is a Python-based domain-specific language (DSL) designed for |DC| of numeric and GPU-oriented code. Its primary goals are: + +- **Consistent with CuTe C++**, allowing users to express GPU kernels with full control of the hardware. +- **JIT compilation** for both host and GPU execution. +- `DLPack `_ **integration**, enabling seamless interop with frameworks (e.g., PyTorch, JAX). +- **JIT caching**, so that repeated calls to the same function benefit from cached |IR| modules. +- **Native types and type inference** to reduce boilerplate and improve performance. +- **Optional lower-level control**, offering direct access to GPU backends or specialized |IR| dialects. + +Decorators +---------- + +|DSL| provides two main Python decorators for generating optimized code via |DC|: + +1. ``@jit`` — Host-side JIT-compiled functions +2. ``@kernel`` — GPU kernel functions + +Both decorators can optionally use a **preprocessor** that automatically expands Python control flow (loops, conditionals) into operations consumable by the underlying |IR|. 
+ +``@jit`` +~~~~~~~~~~~~~ + +Declares JIT-compiled functions that can be invoked from Python or from other |DSL| functions. + +**Decorator Parameters**: + +* ``preprocessor``: + + * ``True`` (default) — Automatically translate Python flow control (e.g., loops, if-statements) into |IR| operations. + * ``False`` — No automatic expansion; Python flow control must be handled manually or avoided. + +**Call-site Parameters**: + +- ``no_cache``: + + - ``True`` — Disables JIT caching, forcing a fresh compilation each call. + - ``False`` (default) — Enables caching for faster subsequent calls. + +``@kernel`` +~~~~~~~~~~~~~~~~ + +Defines GPU kernel functions, compiled as specialized GPU symbols through |DC|. + +**Decorator Parameters**: + +- ``preprocessor``: + + - ``True`` (default) — Automatically expands Python loops/ifs into GPU-compatible |IR| operations. + - ``False`` — Expects manual or simplified kernel implementations. + +**Kernel Launch Parameters**: + +- ``grid`` + Specifies the grid size as a list of integers. +- ``block`` + Specifies the block size as a list of integers. +- ``cluster`` + Specifies the cluster size as a list of integers. +- ``smem`` + Specifies the size of shared memory in bytes (integer). + +Calling Conventions +------------------- + +.. 
list-table:: + :header-rows: 1 + :widths: 20 20 15 25 + + * - **Caller** + - **Callee** + - **Allowed** + - **Compilation/Runtime** + + * - Python function + - ``@jit`` + - ✅ + - DSL runtime + + * - Python function + - ``@kernel`` + - ❌ + - N/A (error raised) + + * - ``@jit`` + - ``@jit`` + - ✅ + - Compile-time call, inlined + + * - ``@jit`` + - Python function + - ✅ + - Compile-time call, inlined + + * - ``@jit`` + - ``@kernel`` + - ✅ + - Dynamic call via GPU driver or runtime + + * - ``@kernel`` + - ``@jit`` + - ✅ + - Compile-time call, inlined + + * - ``@kernel`` + - Python function + - ✅ + - Compile-time call, inlined + + * - ``@kernel`` + - ``@kernel`` + - ❌ + - N/A (error raised) diff --git a/media/docs/pythonDSL/cute_dsl_general/dsl_jit_arg_generation.rst b/media/docs/pythonDSL/cute_dsl_general/dsl_jit_arg_generation.rst new file mode 100644 index 00000000..a7c46003 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_general/dsl_jit_arg_generation.rst @@ -0,0 +1,196 @@ +.. _dsl_jit_arg_generation: +.. |DSL| replace:: CuTe DSL +.. |CUSTOM_TYPES| replace:: customized types + +|DSL| JIT Function Argument Generation +======================================= + +.. contents:: Table of Contents + :depth: 2 + :local: + +In a nutshell +-------------- +When using the ``@jit`` or ``@kernel`` decorators to define a JIT-compiled function, the arguments to the function are traced to determine the JIT function's signature. +|DSL| provides a Pythonic way to write the arguments for JIT function as one normally would in Python, and the |DSL| will take care of the rest for you. + +Specifically, |DSL| honors following when generating the JIT function's arguments: + +- JIT function arguments are assumed to be **dynamic arguments** by default. +- If an argument is explicitly type annotated with ``cutlass.Constexpr``, it is treated as a **compile-time constant**. +- If type annotation is provided, |DSL| validates the argument type at compile time for **type safety**. 
+- |DSL| provides **runtime checkable protocols** (``JitArgument`` and ``DynamicExpression``) for generating JIT function arguments for |CUSTOM_TYPES|.
+
+More details below for each of the above.
+
+Static argument vs. Dynamic argument
+------------------------------------
+
+|DSL| supports both static and dynamic arguments for JIT functions.
+
+1. **Static arguments** hold values that are known at compile time. They are not included in the generated JIT function signature.
+2. **Dynamic arguments** hold values that are only known at runtime.
+
+By default, |DSL| assumes dynamic arguments and tries to infer the argument types from the call-site argument types. An explicit type annotation ``cutlass.Constexpr`` can be used to specify a static argument.
+
+.. code-block:: python
+
+    import cutlass
+    import cutlass.cute as cute
+
+    @cute.jit
+    def foo(x: cutlass.Int32, y: cute.Constexpr):
+        print("x = ", x)  # Prints x = ?
+        print("y = ", y)  # Prints y = 2
+        cute.printf("x: {}", x)  # Prints x: 2
+        cute.printf("y: {}", y)  # Prints y: 2
+
+    foo(2, 2)
+
+In the example above, ``x`` is a dynamic argument with type ``cutlass.Int32`` and ``y`` is a static argument.
+
+With the ``cutlass.Constexpr`` annotation, a more sophisticated use case of a static argument in JIT functions can be something like:
+
+.. code-block:: python
+
+    import cutlass
+    import cutlass.cute as cute
+
+    @cute.kernel
+    def kernel(
+        self,
+        tiled_mma: cute.TiledMma,
+        tma_atom_a: cute.CopyAtom,
+        mA_mkl: cute.Tensor,
+        tma_atom_b: cute.CopyAtom,
+        mB_nkl: cute.Tensor,
+        tma_atom_c: Optional[cute.CopyAtom],
+        mC_mnl: cute.Tensor,
+        cluster_layout_vmnk: cute.Layout,
+        a_smem_layout_staged: cute.ComposedLayout,
+        b_smem_layout_staged: cute.ComposedLayout,
+        c_smem_layout_staged: Union[cute.Layout, cute.ComposedLayout, None],
+        epi_tile: cute.Tile,
+        epilogue_op: cutlass.Constexpr,
+    ):
+        ...
+
+        # Perform epilogue op on accumulator and convert to C type
+        acc_vec = tTR_rAcc.load()
+        acc_vec = epilogue_op(acc_vec.to(self.c_dtype))
+        tTR_rC.store(acc_vec)
+
+In this example, ``epilogue_op`` is a static argument in the JIT kernel where the argument is used for the epilogue fusion. Upon calling the kernel,
+an elementwise lambda function can be passed in as the ``epilogue_op`` argument. For example, a ReLU can be applied for epilogue fusion by simply setting the
+``epilogue_op`` to ``lambda x: cute.where(x > 0, x, cute.full_like(x, 0))``
+
+Refer to the `Blackwell dense GEMM example `__ for a complete example.
+
+Type safety
+-----------
+
+|DSL| makes good use of type annotations in the JIT function signature and validates the JIT function argument types at compile time for **type safety**.
+
+.. code-block:: python
+
+    import cutlass
+    import cutlass.cute as cute
+    import numpy as np
+
+    @cute.jit
+    def foo(x: cute.Tensor, y: cutlass.Float16):
+        ...
+
+    a = np.random.randn(10, 10).astype(np.float16)
+    b = 32
+
+    foo(a, b)
+    foo(b, a)  # This will fail at compile time due to type mismatch
+
+The type safety check helps catch type mismatch issues early at compile time with a clear error message, avoiding tricky runtime errors which are usually more expensive to debug.
+In the example above, the second call to ``foo`` will fail at compile time due to the type mismatch with a clear error message:
+
+::
+
+    cutlass.base_dsl.common.DSLRuntimeError: DSLRuntimeError: expects argument #1 (a) to be , but got
+
+JIT function arguments with |CUSTOM_TYPES|
+--------------------------------------------
+|DSL| supports |CUSTOM_TYPES| for JIT function arguments by providing two runtime checkable protocols:
+
+* ``JitArgument`` which is used for host JIT functions to be called from Python.
+  - ``__c_pointers__``: Generate a list of ctypes pointers for the current object.
+  - ``__get_mlir_types__``: Generate a list of MLIR types for the current object.
+ - ``__new_from_mlir_values__``: Create a new object from MLIR values. + +* ``DynamicExpression`` which is used for device JIT functions to be called from the host JIT functions. + - ``__extract_mlir_values__``: Generate a dynamic expression for the current object. + - ``__new_from_mlir_values__``: Create a new object from MLIR values. + +Refer to `typing.py `__ for more details on these protocol APIs. + +Depending on different cases of the |CUSTOM_TYPES|, |DSL| provides easy ways to adopt |CUSTOM_TYPES| for JIT function arguments. + +1. Direct protocol implementation in |CUSTOM_TYPES| +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One way is to implement the protocol methods directly in the |CUSTOM_TYPES| to enable the protocol based JIT function argument generation. + +.. code-block:: python + + import cutlass + import cutlass.cute as cute + + # Customized type that implements the DynamicExpression protocol + class MyDynamicExpression: + def __init__(self, tensor, offset): + self._tensor = tensor # Dynamic argument + self._offset = offset # Dynamic argument + + def __extract_mlir_values__(self): + return [self._tensor.__extract_mlir_values__(), self._offset.__extract_mlir_values__()] + + def __new_from_mlir_values__(self, values): + return MyDynamicExpression(values[0], values[1]) + + @cute.kernel + def my_kernel(x: MyDynamicExpression): + ... + +In the example above, the ``MyDynamicExpression`` implements the ``DynamicExpression`` protocol and |DSL| will generate the JIT function arguments for the JIT kernel ``my_kernel`` based on the protocol methods. + +2. Adaptor based protocol implementation for |CUSTOM_TYPES| +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For the case where directly changing the |CUSTOM_TYPES| to implement the protocol is not feasible, |DSL| provides adaptor based approach to adapt the |CUSTOM_TYPES| for JIT function argument generation. 
+ +The JIT function argument adaptor is a callable object that implements the desired protocol methods for the registered |CUSTOM_TYPES|. This way, |DSL| automatically queries the JIT argument adaptor registry +to generate the JIT function arguments for the given |CUSTOM_TYPES|. + +.. code-block:: python + + @cutlass.register_jit_arg_adapter(MyFrameworkObject) + class MyFrameworkObjectAdapter: + """ + Convert a 3rd party framework object to a JIT function argument with JitArgument protocol + """ + + def __init__(self, arg): + self._arg = arg + + def __c_pointers__(self): + # Convert the framework object to a C-ABI compatible object + # thru its C-ABI interface + return [self._arg.get_cabi_pointer()] + + def __get_mlir_types__(self): + # Return the list of MLIR types the framework object represents + return [self._arg.get_data().mlir_type] + + def __new_from_mlir_values__(self, values): + # Convert the MLIR values back to the framework object + return MyFrameworkObject(values[0]) + +In this example, the ``MyFrameworkObjectAdapter`` implements an adaptor class which bridges the |DSL| and the 3rd party framework type ``MyFrameworkObject``. +The registration is done by just decorating the adaptor with ``cutlass.register_jit_arg_adapter`` for the customized type. With the registered adaptor, +|DSL| will automatically use the adaptor to generate the JIT function arguments for ``MyFrameworkObject`` typed arguments. diff --git a/media/docs/pythonDSL/cute_dsl_general/dsl_jit_caching.rst b/media/docs/pythonDSL/cute_dsl_general/dsl_jit_caching.rst new file mode 100644 index 00000000..30d07377 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_general/dsl_jit_caching.rst @@ -0,0 +1,152 @@ +.. _dsl_jit_caching: +.. |DSL| replace:: CuTe DSL + +.. _JIT_Caching: + +|DSL| JIT Caching +==================== +.. 
contents:: Table of Contents + :depth: 2 + :local: + +Zero Compile and JIT Executor +----------------------------- + +Zero Compile is a feature that enables explicit kernel compilation on demand through ``cute.compile``. +When ``cute.compile`` is called, it compiles the kernel and returns a JIT Executor instance. +This JIT Executor instance can be cached and reused directly for subsequent executions without compiling the kernel again. + +The JIT Executor is a component that independently executes compiled code. +It can be created either through ``cute.compile`` or implicit compilation. +The JIT Executor instance behaves like a callable object to execute the compiled code. +Each JIT Executor instance maintains a single compiled host function. + +It encompasses all necessary execution components: + +* Host function pointer and its MLIR execution engine +* CUDA modules (optional) +* Argument specifications defining how Python arguments are converted to C ABI-compatible types. Note that arguments with the ``cutlass.Constexpr`` hint are excluded from argument specifications since they are evaluated at compile time rather than runtime. + +For example, in the following code, ``print_result`` is a ``cutlass.Constexpr`` value that is **NOT** evaluated at runtime: + +.. code-block:: python + + import cutlass.cute as cute + + @cute.jit + def add(a, b, print_result: cutlass.Constexpr): + if print_result: + cute.printf("Result: %d\n", a + b) + return a + b + + jit_executor = cute.compile(add, 1, 2, True) + + jit_executor(1, 2) # output: ``Result: 3`` + +The JIT Executor ensures all components are properly initialized and loaded after compilation. + +For example, all CUDA modules are loaded (via ``cuModuleLoad``) and kernel function pointers are extracted (via ``cuModuleGetFunction``). 
+ +When calling a JIT Executor instance, it: + +* Parses Python runtime arguments and converts them to C ABI-compatible types according to argument specifications +* Invokes the host function with the converted arguments + +Custom Caching with ``cute.compile`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``cute.compile`` bypasses caching in |DSL| and always performs compilation, returning a fixed JIT Executor instance. +This allows implementing custom caching strategies as shown below: + +.. code-block:: python + + @cute.jit + def add(b): + return a + b + + # Define a custom cache + custom_cache = {} + + a = 1 + compiled_add_1 = cute.compile(add, 2) + custom_cache[1] = compiled_add_1 + compiled_add_1(2) # result = 3 + + a = 2 + compiled_add_2 = cute.compile(add, 2) + custom_cache[2] = compiled_add_2 + compiled_add_2(2) # result = 4 + + # Use the custom cache + custom_cache[1](2) # result = 3 + custom_cache[2](2) # result = 4 + + +Cache in |DSL| +----------------- + +By default, cache in |DSL| is implicitly enabled to avoid recompilation when kernels are called repeatedly without changes. + +The cache is implemented as a map storing compiled JIT Executor instances within |DSL|. + +The cache key combines hashes of: + +* MLIR bytecode of the MLIR program generated by |DSL| +* All |DSL| Python source files +* All |DSL| shared libraries +* All |DSL| environment variables + +The cache value is a compiled JIT Executor instance. + +On a cache hit, compilation is skipped and the cached JIT Executor instance is reused. + +On a cache miss, the kernel is compiled and the new JIT Executor instance is stored in the cache. + +Here is an example demonstrating automatic caching of the ``add`` kernel: + +.. 
code-block:: python
+
+    # Global variable
+    a = 1
+
+    @cute.jit
+    def add(b):
+        return a + b
+
+    # Cache is empty at the beginning
+
+    # First call: cache miss triggers compilation
+    result = add(2)  # result = 3
+    # Cache now has one instance
+
+    # Second call: cache hit reuses cached JIT Executor
+    result = add(2)  # result = 3
+
+    a = 2
+    # Third call: cache miss due to changed IR code triggers recompilation
+    result = add(2)  # result = 4
+    # Cache now has two instances
+
+The cache can be serialized to files for subsequent runs.
+After serialization, compiled MLIR bytecode is stored in files.
+The cache directory is ``/tmp/{current_user}/cutlass_python_cache``.
+The cache loads from files into memory during |DSL| initialization and saves back to files when the process exits.
+
+The following environment variables control file caching:
+
+.. code-block:: bash
+
+    # Disable file caching while keeping in-memory cache available, defaults to False.
+    export CUTE_DSL_DISABLE_FILE_CACHING=True
+
+    # Maximum number of cache files allowed, defaults to 1000.
+    export CUTE_DSL_FILE_CACHING_CAPACITY=1000
+
+Limitations
+~~~~~~~~~~~~~~~~~~~~~
+
+The intention of caching is to reduce the host launch overhead before each execution. As the above example shows,
+the consistency between the original Python code and the MLIR program is hard to maintain because of the impact of dynamic factors such as global variables.
+Therefore, the MLIR program **MUST** always be generated to verify that the kernel content matches what was previously built.
+
+For optimal host launch latency, we recommend using the above custom caching method with ``cute.compile``.
diff --git a/media/docs/pythonDSL/cute_dsl_general/dsl_modes.png b/media/docs/pythonDSL/cute_dsl_general/dsl_modes.png new file mode 100644 index 0000000000000000000000000000000000000000..1f3f0bd3f40910eacc52e22d8651309e84cb3472 GIT binary patch literal 1134058 zcmeFZXIPV4*EOo5qM~A?DOF)x009;0B^H_>U3v+M0#b$0OCq`z1c8mxTaaF)ODBL* zrPokHq=puH=;f@S`~A*&H2XQ%e*b=d#0wU=*P3h2F~=Np-n>;&l09{T@x*}x2Tt9W zlTrg;rVku|oH}+Cd?#b_>T&SpAY4uM?t#n}=BWb*E*!Wob?2e8;VjlIEJ4HL$^tHR zZa9;}EZ?79P|B=b0rC9c;?9w9rEv~li7mL<)7VK@^-ZuiP^#rYH zX|Pg{<81lu1)MLbR;As)b$q0)yGoknFiUJ{i}l4Y+ptX6^mNxGK9nmfYmC=sG)@E$ zADRe2Y;Bcux*t1p;r5FI2M-uUg?;dZ5fqsuf#a(wrrdd!FRHcIAx+& z``=o-rCj;joqq4gp23$cq!i}1RGE{53{L^x?5B+(Wq5jZ`B3DH9O(H8(uoKQUf6w$eER>A0j~01NYNUUzeF~? zW3PD9C<#AuYuWgWjs%EaZGN5X+I^5aUGyS$=pghE)$eTtei`EnR=A2h+xp|5%xQ!^ zer!6Ukfsnu?g-ysfE6s)U1?~@K8kb?Mmk3h6GMWv;N(8%MJ4hCz zFcZc7bmY#TeF&`3{QgnJWNe$ z$hl7?7adrUNNL(fZp9CWngkodpC$ja*()H0>>RdYBx}L_&zUtPEjwu~N^Y6a4zP0u zF0YXMRggXiQvm#{57|TgccxppNeVTjsdXK>K&re6jLyT6i6OV($J4bxg3WNR7+v!vw;Szfup)hV z@KXf26+fI1O`_Lvrq{?qjS=6$I})LX$^Lxb?Jy|dgqvb;vCHH_EkqSK|9RVdvJ3W| zpz;I>X+BaK>ySHtXKu`=W zeBcJT^Jl*Uc5d`e7a+Ibhe0$`NJt}bu92Obfi#oBifM77^nbR{Z9eck^yGO3a^Zfa zR{>Ogm+p}LIqBX#iB~}A?Y0d1le3TKQ<9x!^fSmU_;KFsyQEw-`9+(a-1)ObNh|Qh zI{(E&t`JaaO1Em03)vTyhT!D9$I1SD|K2}ALGB zxPXD~QrG5^a}dvNPOw6>wK?sdEz|~BiXoqCE%~Bl^(-h*@BcIU810Z)Oa*_TF}d>_ zod=v=RdNRvDNCnd0=+r?V*zgWnSR28@LlLz_iP>>WAUj8%w z%qRm~*oiJ?Axpj*ukBOaEKW7~P<)i~z*ebR>gW9w=^h@MC_&Eee#Y6g52Wk$kV=j6mH}&WUhKI8SY098UyA<)QRw~v%U+(YQX?BM z`4_s|ev74{MxZW7e5asI8qj>U%OaJyry&y5hP?6`>ZKQHR@|C&li`%V-?iuqZ zCk7SnJ&9jw6%l9+k7VAFYNatFZ*^@vh_;#SL)jkW_8+?kR_q9M2OjVoI^58`36-u! z_GyJOFHRQgj#HtVjc$?ii`zyXBqBP$TXJM$hGUY5E}(S3P@bi zmr%z1hN@6;PtV_)+Zrp{m@j4AHu<^^+^)ry-vf??|N!qDUQwt6@SpP|xf8gl< ztcH5f117+q;s_M*KU6X?6)cd*0m1uI`F`I53s-Ihej#fTPlsy&Vf?9a7_wkC4p@dB ztCqp>GukgJ&udv-W!rra@DxAHoA+FhAEUS`_>Dy&5oPj1Jy0#ETc+UOLzZzt>&J1`_+_8hMlgb=q0inKX~ZBh1w=KS&yj-;n*4N+{(7~rmW;Bs7mY? 
z%-jn05>o!>tq&X()|r~&hOX2cWFC9KlY}P4|BxX5k8fPl ziPg{8GmsK5xc$n4DPMl3*;0~Q%BZs#C0^Dgz9Pg?Cj!YSZf)Fr2sirP<7oJNn&>f? z&O#ChD=OX%2YB;`;L{GD|5_nS(EV$!Z1mR4h(jly2!3Oyq% zK+TNE_ySe35r_n~BJe$1o>eefpXEke4vm2Md*54KFAuSnn9a`%6(6DO!UR@UuYOs| z^c3@%6_#~}XKHRtvFDqIC>y2dvMEABWvX);g&GZ`;Rs!0^Og85D@htMaqB5SV#G4k zm;!Qn*OLMi6;9u0o0)TZnYz@fGff&=V@LqZr&MLC@1VWBz_zX7Bg~F3x*Xim-Rzwu zHAOk#WVnE{AtV;&?9iQKMM+-5+ds|8oI)|>2cmd1)6@12i3Hb~44vgP?A{xS7E6+= z5Z7bcaK_8ik#)?+;3hLqk+a&1+bEKlb281~k!R^p(poFw=vdeG>*8;n1q{|DT91lF zm%n&#&w9M8C_lfEBW4k=#g|`b3pSUDL0*w9D>v#%XUcE5cLKvT6JL3Q4~wY&{)GYw zfdtGL4_NCFloDT$n(_@g8W&P@lV!9}65+o@KCk{A>?f##(mPFM6H-r8INAscpMq_b za^QFd*E7EkPd6FE$0{uNttFGUxzXkMSpyEKxWe1Tx)tK=GE|%HiK^+_`d@UPdYlz@ z{bV@w*9a8|24x;Mvky>Ail1b@*HLm^6i){paL<_mxCydBIu3Y&-f0a9-(x4WX4qOG z%Cp*$Bd`3vHNeF%1Pzr;CFCu3iNwyL{S=ub>ixAQ>AJh;<`K7Ga@RzG@U)O-7@wb;0~O;Qw8pDS3>4m(z_Mb88U zA0qGXto{YnSK6PjcFUZDZ`x6DnymRiYg&ucn$~>r(}6Bhm==V*@Az70!Z*0u%59_* z@;km#d6rb*9W&KdJ`~xqGd7&~`ku{H+_!zEP?{28m%m%(Eu2sv=6PT4^-ro((R0se7(L!Ha_OmD^W7E4N*MSE!wWQB zOockB1G-UB>h6EAAEkOXfbA`Mo)y-MQLN9qtw%1884*aNVTZrq9ynW3Wr$V_vrC=T zM!&c4O=#((dKy#c+fQ0Hqn}TIL-e z?z#M7|A}U4b3Fl(rR53pc($v$S(z1w>ik;8KZh@e`Z_g--ZGEp<2B66W$?5k`p8P% zs?h!{>(kr)t|Jwmp8?|$HEAtTG_^u5R905fi99w=UKAO>(xm8*h*OWACwp5cQ24>6 zL9l*5Ky^~fl(#;xwXf%5My!(Z$?qga1yXz*inu62`h^%2M@3tB1wkt z>dmA9BULGcY`mbWW8jeaCf{}aXoSB1%D0i|&PafIJ#<-GeZIE~zx1nF$Jk-Jo}ma& zTSjbjnzAgqWN?Fx`s9QJPa;W8WmU$;0B&>RP+gh*7UUHrwYw1eQ3k!PFH8)l(I$@v z=IHXwsD7t*#{yZzP?r5jBS_qaP$*uwuo0srXdav)T^m$4)9S~(mnTOs5a{&N9skTb z9lWY)Wad86qux@3GYeZ%VEhn2KR0^zvpL7}!SyDJ2$4O7iwQw3&mQeN~m*pB?Et+L{c3YqcmU$j$wPK3g2Ye+$`RScr>->*I$pzEUM8v z^Sd1G3##E_;nROsKF>$!_&eV%gtHYdro5^Uv|S=Z*9NB(3#F>l#kvWLJBr{4lRu77 zB)ZeUJY^-VC#de)(ef>@(Fe01>KRBVyZ>a{R6ihYcjxnVQmSl6`KhfYw#N&0)ZBA! 
z=(s$rnCS8%Llr(TkQ4>Gj@A1wDt%}{;(*0)@LCrFIBWqGLla%`LEqo0fJP`%0KD4| z03?TWR2N~8vjZ8K;pbk`S!Cz184sD_jGkUyM>qkF_1V`o(S!CJV^#&n_*Hl0#~?~p zM{HLrfkNLyAEgNypKfB49b+gcl$(FnC1cp7oMo4fFqUHOPD=jE3i^fkyXON{(18Qk z-rB!k2H5`Xp?jm9^Y=y2u~nyf!)2@za?keIDE0cP_k833O^`I>K<2ZN2w(lkf+>hsO=qt7=e_1s9C_2)hp@Lk zJ(ykXvIl-+`(bI$c+O@OM-9}z57wKkkc|ilO(#O%dc(N;^HkG|ob+ka103!d6O)r^ z*>n=d=W6|nIYu~IwI@e|Zh-QKPu9}1r<2&GY)fvx=wn5FxrR#plEAouhmN4EXQ|I? zj!o&%{_4)2H2~`MS+?|I_0jHs#65&fmoQA=IAHVbH?*y5Fa)X1Y`w->C&AAyWo544ZZrX zEFg|~!-a6@&JU6>V1;~yvaZ?G0*Q6MtW>F^g;_q(WYtK~Y@enV9!)znUWI(_maKS7 z(Ent$l(}QtGFcgLffQAlM*$sVzTi1Ly)tT-piCcLZofs4g{^G`pi|{CA}BbXcZ8#a z*D0C&o#8FEb3@#6kkS!M)f-bjmul+MP18-k&fR2JfGb)pEZS)3o^p~+;1A&q+-Ft} zu+adf(j|Pvvtb2=DV~%<+Wg&joLVzT21o!;2T$_$Q1Q2R2KrQuypkht>ZX$?Xth2M zNY9klT4vax5cBpat~K8v&B?)PgX!U;xy3ASt!mkroVIzFmER>zQmslUe_Lh#_P5AT z5WjraA4W@FkuHfnR{h0gxZWbEz#|JBaEI^tNbl;_V93$(vk*P{XbF6>K#~l!WKYXf z<`<&uNda^^rwd&@YfEJb z0h}E?<14d$sew>#wR95)5^c@$le3x5_T>{W5B6GvCs%cEJ{1``?naDig#<*WYb0%n zgc||kB^mXm1Rk)>17Vm)?ed(1(8iKve4tCc52{Q zlyJ_@UjvS(wzIVo)a0Q~*fF`8k58bxgDx=R1#ir&vC}&8$UEr76r$;aV%@zuZ)fd5 zYZpoX=+Yv)dx8Ci7J|`R->N#le!L<(M#C|(t|rd}_wQp~4DhnlUr)BUTtJv}I6cb1 za4ro~5y-A&ZrU|3bSPNU@wv#R& z>oA)0`S~QCV`NN%emAva`zuvesq=upM*ewUf=;Zs;ojXb=O=+OQ<$pLPOoykyLK}~ z=ANP?(_Gr_fI=nyhzWfc_ltx-|1*1KYm(x<;eMvS$4RYZwond_+$wXuwJWUH^2-&O zs!SfsF#`9xr*LwCLlqC-2A}CFoexKqu6#=wLt!OETO$mIg+ z5af#eXpN2h+zo?KU_T!PR$Ey%tym4dL#F!qk0!JAT9r7_slLf_RMW)A*1zbzM!~>! 
zGjb{P#v57obF(l!$11_w0J8X=dC}CWFk-|Plu7ZcpQa{0!ndZ;uve}$1C(CYtnTs+ z8|MQ@5rHG?-bM3<6}~p;j0^~*C&}9)0<$)r=DdallFZAIz>BIoM_Z7#NoXvd9xGb-C zr?rw{;T-4;Y}DVWI-_xoOgI`Pl4$)3O>ViszP`Z*k-b)sO1oIwr#pU10c#(b+}{Gy zPk6eYVM*6eEm6zJ`4hri$yT~O)6b8C;znSE^h`I;hsvqY(?J1ju6G)25&pfs8vz`!PY&bVbj@|G}k!PDDWKLCyAz{9K__#r;mNzS;gtK{#d z(6+ngnRx9TALP#-cqUgm-`G8+ER|4Kn0`Od@VXw83yg319JwBo=1b5(kQY8;q0MmU z9=?k$C}Oc&DN-@<)=jrzx#0sn*d4)WxAC0B*RL@jRH>iwd&R0GQ}in1LhEL-e3)%` zM6tW;_uj-==!80C2wfNwU)Zj1cO98gRk*dV{M-47bW*cjAf|s>e|bS`m-ZT95d*%He6mq(P&NB5KxwzK`K11hri@pFua`jNW zw=dPKg{<-D*PX|Y2i!gT*+PTrOj^8yaM))LS~kjxQ`@OTyEhOnM;byeyGOm4>}vFNLywZIktr0D-|x#Ag2Qkvf@n_6X9rHJk@advDC z5YmDoll6rh`c1uRlVBlguBLjPQA5};Mwk0hh^gBy%WZ4yBi@G9){#)NQ77KrjYzkt zoU9zcviIVHI`_EFNXM;dkPI>v5u0`n`Q|U z(f|rjeeg_D#jj*@?tw@_i;_m>wX)hnugA^NM3XA$Rh53LZ#+(8&Xs=Z!7_yuJ#HQ@ z?h^um%aLLtpI^r2F)FRZWMA2ZGNI8^_3_Bc-i?JDfUV5lp=E z{SFOGVzRWy;|(HO;Tw)-W2^|CteJb5F>tr z>?MEx%kcV^e;_7uHDsLsmL}3we$h)p@|Lx%kMo9Q#!_g8bVxc;Oth?hE(BD5SqN)0 zaJi+N9X4xuctdn~e~Jcc*}I({{Ta;^fpL4h-c2!&Rrh$4gUhI?)DO96Mg1O` z9)e~G2?(GUXY}|<_JOgF96Ta)3wq{Nq+^%ThSHbKt6IrWNFXaLK~femq;IuCA#L7$ zEdUfZxbu)Y`l%Rp$7T%I8mC%%lC0-j!mE2YZgf#lhu-G7wo*4$Fd@zwF#LCJ!PO|>Pe)vna#zydtD!v=m*`oCMADFX`>6~>BfTg&Nq>oIv>k&Iyt}! 
zS@jYM%$jXYt#O0Q%w|YUks(Z2T3LyivcrF#>SxVlbcvLk#yWEfKyLb;nh4Fo1&w~{ zOKqr;h5!aoAzQuU)1AE6o#H2+teufIer9sXwWvTfJt~`drfnH|q@g<;=2M?>z~bY9 z@w2l)UVCA3>_5vfqkPgF9Oi?b!3=MJa)q#+Ir_C#yRyIg9JrvbrBn^Zovm+!c0X4s zIWZioBE*0WTiBOZl;yYrtrleqQW9b4~Plbt#u@`8LKBKma)V7D26#9p?pF_sfF&xlmSQV`AnwSr*)hiuk-Q);b2*X zR5bdYN@9983HwKR~J)qCjY83}ts5_l}0NZ|-cXNHOQHAb`f{7o5Cy7UR}d ztwp^#*DPe9`bSv7ibDZgIJ^vDowk%@xJ8ZP&l3<@e(*(7Xgn%LQb@pVqkOuto0}EV zzCClAx_fIGAc5vlTG`Yf^ z#W$o#ZU|bt4;+|#%l(tJ9}vz>StB|23_=IOZDzy7{*YXH_gZPxU&Lms?fdracy%l* zZXY)dy0{7PmF>pFr8l&BHF~>yOuPB*$@D-Y^s7{H*c4JrSaaex$?C@jj!YjyKDY>8 z7|Q+Sg(1)G%K!rUl`M%FPMtafZ2FAnib|>)oIvw%bS?8dV>q=JDdcN7DJVZ5W#jfo*#&h4K=SByOcW8RdxM}<-%H7ajsa)(Ebeh?ej{U0+;C zMP$q{C+3H#E+-YR9pX5j>d}ucMAJpOCp>!3H z-RM~Rx%BK@EH^0vJF*H59R0u5J{*riTfC_?QBvTvnSa!%1@flXXQ1{Zp%^)Lq$3R# zPHP1J=m#Z!;vW~bSS=}bqC4KfvM-K5lH4M51 zCs6FV?I8{m0VF<=dMHMK;@1}jyXy1U)#e@!Y$D7)?S=wi;!`Uv1^HArq183*8JcH# z)qnyS!x7U~Q;sd2Jfy(+pTPJtpeI3o>A7FOc6gK2^IdLK0+0POXe2idtLBf3iB2m+@0%1zmpvRy6utrL zs!x5)Zoyqw!A_$O_5rqY_72*b?$@^3&pB@#1-f)AW~11RP271)F?#4w&x6Fe*{H$I zVvmeM{fzl{-o~N1p(d$~!2sdF9bZipi4gzOB>DPng8+Vo#vasCQ zb$s+_*j5aeL)dfVpOeiO0aP~Qdf$=P6mB9Fi8ll``{UrJG-d+iAt*jwq!tzr_u)%m z#@V3#u8sGeOYEf;@qdQSp3OwgsV`(WLSssLVuwg7vf0EoU-asGb^}qpfz774o=O9o zsV&3E88?)T&+KRa{Oz6nXt&4TlVEGBk=)A$;-n@6QJ!pr;xtJ%h8)_u16sO!_CD)1 zmmUzafW~bq>c<2Maba16Omc}t1>d-*mw}MObZF`M_yQ;xl-XA@Is18j7Lo0)4mBVj zZu&uNA1*>NDmuLjSzJ)@m&-{Cx>Ep6*NnW%FH^8a2Ba>vud$?2Br_{&gAlbm@2Fil zCu231UGr5Hs0DDf1genSx38AJ2KGF#vN)1D5*BI=eZ279bG~qf9}pj>E~Vnkf#RPP zh{KJ!&gDWL@$olKR;7u)H`cC2*C!4e5b=09aDVqiiHzQ_QA4T&tIbLd)Di2`cW_vT z&Wijczi9LM&E1fw4yuXEvL|rrW@rsUfx)Y+xAjN9zWUkg${c5X3P_iaw{ZT>`nC|r z4OeQV%;}>|i*%80{$^_z8cB$Y;_J7`=n1ZyrVgO(#2ywv(#9}vbmvuy`r|+ z8P{mGRy`Ug(M14yZ${y+^O*NbNtk?dN1h2^ziU7;1=Tbi?ZaR5r!2VmnaS+6EloI= zmZ|mR_L*Gw`|*OCBQ+sTJ$j83CF?9x*o<93cf=8`<$#tspdPp1*5gVH?4y#^{g|04 zdC$0$PhqMhTi>&Fk2>1wlQ=r^wsvxo(~($Rn+hgNSfmMKexQ6EqT5w~{jym1&g9lg z3{XqzN=~!H?3XkuGgfb3v#JRp2fm_9VmR1A^#ub&yR^#C5hsXjOPbf;UKU~)fJ=Qj 
zGi?DTBJ_M$^&1_4UzjGyvCWwTIK5dB1S|-O)GP^y@sI%+?M84=iw%2%)zf1S#?)MT zn!c(re6Vv}o$abe@dzT!W_{|CySAkYci`3Wc=KRBdod-C`9oF4{CxGC70G7FVsI^;ML#Hp3I!CCzfU*>`)5L0X z^A}G{)fq95-7-}sm)gfR)U&sWuxt0aH_d#C%G3j1fARc~3Ek_$3@p*_fWCU3NZeqm zmd|#aW5ow+XkF7Xd~D4zVsvg^y`dE}QVCp>-Hc)nGxj@0k76)P2Vn(|u6t|_*x*D5 zG9BSW{xt*d4UE9jXC({1&7)OH9-3Qg09zw{&?1XarwDdQl4`MQxBcmpqw>rzGR)6% zcZc-;uU8!F;r?UHZbQLqY+&D|v>F*3So_2W z5V~+YxN z{+}&ar5U-OVoq|22NH>QF=T>7mY&+)J{&-C?(bOuo;%+& znfkNjICl!`H?rk6*wmDeG*IDL_5G0{Y2CSi*r6MED>lZW_?6FUsVjNKecM|b1Cgkb zVS@o(=$_<%-WTW(ktod()&@rH4j?$Og+)@p=F5V`!KjI0S26G7{~?`nRzCJqVDd?8 zEt4mpXYZDsc(P&u5{zCz)t~rFSSUsHBI>P5k0N@l^;;H)i&N!$sBg`h^MHwr>t!yJ zNz6Gp%|-k3H}qSi(G8d6!kTe8zQ}aHvh?(Pf~q)41O-w6Z{56gm(1yz2~SUe`19-| zIR-7Ap-aSFK{N*a^2^2-g0J|C=c=R38-vCDldnBQ4lDk`(*C{&X;LeW`EZDi(SDu~ z&Psk}w?CJ_Dm3I(JFP6o$*U(n|97+Aph4T%mF65{dIfKyvvD9V=Lkh!e#5d)SDMK4 zL*t9#i}l5wr{;P9^Dg9l8azI|(g%4q&1$S(Rsx#J18S~}ze)$$WdQyfXG#rAsr{5I zu8emrHrrF;Uvp%I_phSklmMs+6ecstI1q#@kuwFsj zGk=K+Hqku91IBkVV(&IGWHOe#uqy`ggrB@x&=?Pfn%>lug9sAi80i%p4W%_HE5#+j zp)StPhTCkUt5vacS&?osbumukJCqX)<$VNZkJQ%1NWatuwzb3_E=#+?-e72G=htzT zL-A{aIzIf2AA+H}uk{L^{L*3if%DnZz{54MfjSH3G@uo@cMd{5^A!7IPGWxXXaVS0 zyGXs`O>`0xG6?O!g#mW*L#Cv=(IBFPicI?)lUurt>f~eTeJD9$N_q6?92=xg*2g$f zWWGTJgrn8^66hDAbKS$^JMSD+>3oh=@$vmM?SzJwCR#-pPED zyvR==<66ON44EWNO)RV@!VBON>sfxO@-s2_hg*ME;Cn;XxB2m~K@ow?wbzlnIobLd z6@Rer#$p#h54Y7VD>5BbJb$;Tu*JGvIcp2(LO7Pd7VU;rz^A}~E#a;=)LQRCc2UqJ z+H)U0&1JjlTmwUAryOzfL^-0nPOb;Z+gu8m!SRk+X&AH#|DWj z9eSd(lgp6B$eYMJ6~*>VETxe+ zG=K$B5u$MeU3w(bG+ zL0-+u5_ywGTubuCJM|ME+n{G|@hVl=8#%)_=IzF;)>0G~zto~-5fYcx;3$vFcBy3+ z6BuvdCpj)^{&^y&WmuQ^bwNXHI^ zob~4k=w@T(-2M2fuL6t_VOoQ!f1!7kC%_=2-W{);qanG^wtY~1;}fP-mgDyo++X=rP{&MGB=a-25MOW5{e1=5=(C#G9ROIiZjBKX*Mtfr7sV06 zcFuHJNH{MP7*OEm7qUyUNE6ZnJ-S|X)5z&w6QZ^mK66=iH(wnL(;;M;y!7UJRTzh^Y-|Xb>98& z#KHi^@4WBws+1&kF4+bh_}y-x#88>KnV7vx?5u64ZRf9i2jX(GT};ge-3k6`l>Jr=hzOX;--(L=|@Y7u`A{?eSo_= z1Hr7xdNb|UKLDbQ0R5M7Awxd?LAtYBG9Phy$t(S}WA%Ej`63gE0|7+9C1U5E&2L6^Ru6W`4HOoz{0~EY3{RwGXj0N_DP7(o$>gQ- 
z%8G%tp-V^0X>I$qzgPAGk*oj2#mw~9XzE1^tgO7#kb-DhWAl6qBp#mMz)-?mXgPi* zdgJXY8y4|6qRQ6~45%j?NdHi)SB&IzB?*wt^iLlSQ6*IvcSZE*Bjbpgxqbchd_n0% zjN&rpgEYg=?8dEcA9KPx)fyxuj_^zG&gbQ~EAzCj!WjW$xRoeIA@j;3YBZ#N>eVaC zL?Of$LC%)DnD=&G;L>qOV!ZlTBs`%w3O-X-!J4{4+_Dy5VuD2q zZ>)L<@PcU*R*15N&@&&iCJa~AYX;5iYzG>T=ZHs_WrL_rPicqPMG1&$6c-r}GxA>j z*$1Ex7&I=EA=SVKhWEC@d^XP2b?HTps9r;=J9g~Ft>DN zydK$IjLhpui5ASuD*2c*1y`5H^YwG!@q%DLrPswD_33Ir4~$M^NY`j^j&>?fWT~LW z99}I1>O7Qnj|J$@yrZ3}^6gHDG87gQSQp^jKxx~rF zRKPJEq|CPjn`x&yz*T0gUJv|5UHxeuBWaQp;%%JmoqzTDVEBNL5G7?f)*7^!PXU)= z3SdIHfP|0HBldgj?hl-$143DGO;-HlmBJ_#26U&*lqFVgIR6~w&xrg?O>ml**lbP? zG8Ee))E-$3G`)6hR2*t8u|IEYCUL)Dq1>+8lq0c!5RHe@(+o_^TWf;h>$v4p-<(H5 zKkE%*&MHM0`Oqf}yce+9SJ-VaFC@8Ix+g!c1&Y+c;r=k2E-t1LtSlI6S@Rbf`#Eom zUDf>cC7AzVp(UYp$uMS-okxK?N!}GP85%L#3690)pz@z5aQsOeol^4=YF5%TgxT*N zvYGPr-Y$``T4|Pcb5s|4m=TcV-AMH=9AwjE9!)J))cJrYKK(M~H5(f4faWL&CxffE zUc*#hvzn!x2r?FH7!k;pnd@SIE<=(*guKk>6{5JEWm53`{+zlKJ^Xzzo98eV- zFHthajI5DHg6%Kj^Ew0OHdqEmS@#5j>6Ii@?qF*$h$fc-ucLTM#v`ka`|9%T=(FVL zX&fN_nk$RigXZ+-Sko)mlh@IXZuKHok&%qW%SY!yW6}>yt(R_2R8}Ur*HL%go{Eh@ zu}83@jS2*p%FE9yTe2~iyL=~t_pUZEdGL4(j)+!++2i@qUQU_4?v6VbsS=%$QsSTB z#5lTAjwQWaq{<8rgp*p!bhjL+qE@b=y1)#e`9 zN^i8nT(%D-{jn~BSr=EGQ)!J|{?LgnYioY%a#bdg6bA!`0li$)2L1DIy!ae`8R9ftl6B6L-R8ppJf}GdX^Zcg#Ie<%`jHg~Uthw^ySS+NN^04u zr7ekE;}e`ciqWfmaUNnA#j%dIwdqmL#6sz6Bd<+8fCUJ`A+_jx4!#wBHZZu8iIZgpLJ*+`#ed5T#Lyi(pM@WUo4E? 
z;7n#{QSvJ*gv}T#dNg#qa>w@0Ju5WWS^S1|12vkEOtq@jwU90+fj(XCU*P%ge&zWY zR>pLwMB=&}L%L2!)z^|qSjuN{LpERLe8(D~vJTGdtE?(jY!Z_T{G1~<2-A88bFT74 zbp;f@&3`E^QsX5ip}#msq&+SN28K}493I(t98t-VZ5s*xvS5^+L-_h4UgUmqFMUH| z=pR-Pj1R+MTHgy+E!VlJyi-!B4BNU5v@p_C@T4OUo6)D}1JV3#uMHM+p<0WFW?nL= zWQ+dMnah>^m3=6L?wXa7Zf0Pw^Nm>VA1^R-f}wD;FRlqKBn>u3*I-4sp31MUa=M1_kA#~X_{)joSZt= zFfSbqqdjl?k#LrNIIrzp;-9BH3g7lpcXMLj>vb%ts1JeY=11 z(6I|I4*d3?X8KswtiF*Zp-YD+42tGla_%nfo&Y&>eGW!=t7G5Z|M1c0&pBVi7QXZ$ zg3l<9In!7fo^x#WtxBepeep(RM602DV``7I@xHL zdbx2XaOS+ogSoSG$zP9qK&jl<{&a1ME&=aWG;zLGj1Od-doYGjMgM`kJ$!Tuni^Ji zA(CF!ReI%A@bqz}%W%YIbMT1hBBG=54NUi3;SG!wB5Do$A>F{Oq4AAPS+vBEuz<@w z>o#;we_1uhZ%%p3@P!PzmdR=9WNP{BQ&CqUP@LBlIh$*wk;9mRcf$ zoeopX^~7QRnr1I!?Z6iZfIx*eiq zL^9De&orXp)c8_b!+0@;4_nY~%)%wt){?VB{T$NrLNZntPST|GrixT-VI;ntH8gqc zV`zv#+Dl*mi}f6T5-w^iWccwBlx%V&ZTcYy#74I;6!4tI;C~V zc9#`ol-$quK9}V(JB%2ZSV=BLzXFRCr&^lD(?Ef zq$j%2L|d>s+@$~A_3nSq)0JclzWYlTvSnkDS7Qdxx|MZM7<1Q`c`!9hFI`p)hQB(v zM#aeo_jx;{W^MekZz^oXB6YKQfEX*`9_B1ZG2L`>8wS&MX8e+Qe0vU=a^%wZQ(nP# zxW|t8MAmCSP;a zQ|q6K;Lum?B-)kYe4{eeVtw4%@uh4`+`RHMDdACpZ1y&c7Vb3+(sQg&Oc*98OB+!` zTF)-B*h8nD;<(xOI^HPsU*OWJySBS=YWMAoA6^-#}G)sJ9%4s8?kjZ^sz^mrbp zYD*D%^p!LELSN`kY?!;~iI+Ny3)@DmuT&pLLTJWC-lm~13AOEHq!5{17Z>itbA(vD z?X2$_VmfJ$lr$%}jhr=HMQS{;%^ywk6Uqs8sc`Fd9#!Lh6RPv=P2)JvS`>SZxR}=H z**WR5$5tkLv?m>rMUMv$Q=Fb*>$h1M9%=H^iq@a{TzS0Z;nspVBFO@kn8Q)^;jF^r z^riBxF(ea+)<4krTdnV zVU{$(;p~ESH=<3smTR80%&E7T$haDz#WwzABDSF6x^@$**$3f{-lw+R-C}gK06c?3ZT#irwJuX@>jMtQ$z@zKbxBqhZBNq{yp<_}4#n!0I~cVk+3KwZOA~TS zN*z3Nso>5Hv8E`Y2}*G7A=}6K^6pAAxqa2IzWg;vXq$#+hO_WAe5`((Yfa0z<_~le z9lI;y7Uhg6n9=LAHv`tfm zm*8=5xEnuY7;ABYqekdYzS9}r+sU1O$To;y*)sTE%9j%6b>KHX`SXq_FZkT zr1pFWn|Nxni3?8SS$ht@gIl<9DX)}!{M=0WkLd4*XLStdD9#$VRBUl~w**hJNZ#*$ z6X(Cx`Kmpx^virPbUJJiIBJ2O^sa%_>5QQC2;wF5u!Z!jRj}P$aTYtO-YRmDv)JT* z?@67FX*QoLY)(qFvi!j>HJ&K6D6{<$r^)H~Y3n}U@*@aK9>hLtlfu?VR;%-2=+YZy z!5p^Wbk2oNr3{CjT#V*9{`s#Ct-f?VQ*OG=7ssDg*sy1Ds%t$mc{KBBOAEy+xFA*j-Xk1&rW5HW9;@V; 
zWcSQgcJV_XUCxD$o(`bb&Bm39r-+G#pBkCypzfrhp&pN|Z>oU1c-IVO&e5ZP`` zwO_Fo8x_a+PHr#LCgr-~&3e4p^gO<@#dj55F=>~!P=9@QO2B+cltv^Y4)x9S<(;+n z<`ZTzvt?A6rT`IxVFmgtC3}zHgWf)~Cual$!+ahH1x_`K+vY}G9)1E_#Xq(i3~Mpj zV?F6uk$2v$lKFRx;K%oF-2>Naa+SX>2qNcFg}a%P$9>J&<9lf<4+G7(Vv_MY9K^D| z_5DmM?_6={N^YTdJ?Ch1A{VOZR1sT-<0}#gRK2-nRc$+yaY11z9qYwWt94RrWn}o3 zjI5&ozot26;NyS^^ZQbhZ>x6q@owSq%N82tW zbw1&7vW!P;xl_3T^(R?{VZCRKl9LXf>_=+9eAwld=XvyC>E%1_mP~cp#KBs;Gr&cI@QsVxVTAF%?l1VS#pPHjFMpQjx@05(!y}` z7>cl-xq?wAymfe6J{xSFTlRnlWTVYdtn#W!09|ltnttPHIvoQwG{?`XEZ>f< ztEksh)X=b$J`Yb^vIu)}xLhviPMukwE7tITChy$9R6ba(`4HL{qkddjA2#*$EhXRH z)WJ?;4yp%_y+m}yWvX4m-$tJq4%h8|dZe6bs`TjFq2Yj3-92rZu-=9;96D|K+xg!R z`JW}sZQm18o8vCN7$u!!pMxr2u4RZMbafNu664~;pQq(Qtk;4Z6~8;}RShizhAF_l z6tifNwsVnGKv-m~oN@PfFv2(R9ks4~Db#$v75)4wgs&Q*lk?`-mGDbjxiFd!aIT(M z81?p3o2->H_MgPx4t=b8XiWVuW>guHv^u_hd#yCScK)pb@zh>kI67-YxSK9%wTgso zPmNnkctf=RDgr!KRU{cLbZ&Cjy{70il^oF|G@_y3u2C(JA6~u{eOgbn@y>5D&sC1w z@vLkCuE!F7loiHfn|H5^?F^X;igX*-ZjSYOjP!q?%eCTpNBbc^O3g`ouMNSW7ZuFT zC>YV4uH5aGJDHv0Az-p-BQ&xhlWVhn@9}D?YuBf^^b!*W$!#NKtAE;ZG^k{XuP(wy zqwSN-6#z#o3~h{Dxyj%?Y19pe!Ykw1)doGd24{~?obW2Nj32sdV`i)B82#kj#zP4@ zotR}TGA2b~ojI>pfS^HR_QGBBxGED-4Up|B6Kfb0!;jX!@89?QH52h^1kF<;@!wXR z`%!y}0dS$=>Ic^~4uiBG&$YBDKjU|Q`>(6ijvQs*R?e&YIo#FlXmr<>g6bqCo${RL zO?$Ykz4ig!Cnh~UK3*I05LWu^!p@HF5?{?($ltR7vgbdnV0a;%Sk;TG<&{pbn}$t~ z(Dx|)&1M&UmQ(eaPA1?~6+Re-iS?`hr-KniH-eQ<;TRyq_M*YR`M-QHQUk?{YY4`Vc;E?bGZx z$y)wy-L7{ZC+p;}{+f)8xBz1p3U(~bu~2jlcS? zT6=jb7yZ~yC%B>&>1s%&oSWx#_!ah1r&e>%-TBp5RBG$tKG%k>eB(QWlMs|2`7>1K zV_c~0TPw8(je^sJ{w=DTnZk^_+i#$)PkCAVgJ*aA%pXTOeTmq}(e8nTf3x@+ddeSZ zhyQ;#d+WF;x9)uy5fnrPq?8s3=?3X==N`yc3V$J%SJb**b%YflvgE+Q-u*FjFnBNnU2kxy#EV&*8cX}eHr$b6XF z_aD&I03zEOyJpOV(#!Ap>jpU={KRMlF3L%eAjhi;@=HZ^Mwv?Pz1z%+hoR&A@ zIVYk$YGd5Xr`zK?dD4=JMRZ)6e$QGSogBw)eI6)rVBDBSQ$&mB*}w5J zGh|FKYNM+`O!`as-S$Rh>-%>G!TApKcCMuzW;E5=7L!L3I$yY)hv)u$Dhf|ADMw(|`7fli`i7M=)yne%)$cw%e<>Mz-(jo0+_3sx? 
zoC~ncbA>1|7U?_?=eEkpA>8tnW(}?Kmt|F;O8DCCm@A%QzMipiLFr_~&AXbns-~Vb zZ~q|on--a@E=K6K?~CLI7e-E^gjm#c1JPVMGS{l0VBF9wNrUb21t%DR*-l~G*KPsd z)ke`EuyTUmbdl`Fg8s;$0GoNR>%!$w0A8X~zy^zFv#yLSf%HO~A}>ePX-USlT1Yk7 z;tGj>-?iZdTl+>LnXPCwN<6lW3@$)+oNUW-buhe(E?GnPB1Kvwj$JaJyvJ#XxJEQSx8`+N6S#R zY|iY8wl1uw$U0`WEA?%tncSt^E3VsykPTKr?83CXzv83&Ve7$CZn%T9Ov_};TA1kEQ?5y%SVb9~r^H}8*#oCwg%&-Q# zm~V|lyyqW}gOBIhI`rHhcF$5YKvzy1@Xi8iiwO3?YL-04P5@HD1cV zrQP08V$s9}cIonRos{c;$4M(1AnnIn6B<=U2?9w>M!!8GPg0~}!8i*&3PP`<`H;1GMB&u7*Qrs- z$36XbfauSEqa+thn|*)sIIyYEySy^y8z+rMpwpU8hRu)5^yt&izL33%;xhu1d$ zaR)~&l=U0vpoOyCC56y<+Xgq$2=1d`3D=1MQ8ols9Wn6Hw#XvdVq2ttYZ`~n#D14a zE1dnSW>R?0S;3?@ef=ZP?Tc}HZqZj)pNE74>L;!5G*r-M`BJb0dO~dD^g6pB(;$A$ zbzb)1+AJnTZPh%vAiIj`&&uEMxy%YmXg-{2HcQTN0eIzXdo2F>9!eYg~*g zABU`o5&pL)^jiXki=*##FQECjQT9DFxur(%)_nw<1BRkde}vfgsyL{n}pFVmZe11R~cV+hr(6aHXi z&fB{Az^1D1t{WXJhGTad&Zo_R5S+YI0+SD|9OmXHG%?=pLwD3up%(knRj3fE7KCRK zOh`@P=I8I7MMbPjt(S35vxhUmF<)1#O&bIx*`GMkv)Uo{1!Ko~*H)g@W+pXwlT&76 zgSidn2IPRKi`?sPiRFvN*#B+Cw}NR%KuI4#=Q5>0I>K||lzDVotc#FMre!;2+#P8V z^_~%BVT~LLv3n$Uzfr+qEZ|N^?kh87tm?0rr=b#NJwm0+SAj5}v6OEZ|DD|bgmb2G zK%A6GGftHSNXoA;{45vas9WR+0kQQPZt`8R=q z6@b9YJ7~Ii3EhUcrdlv?LK>pEIx#Pk@7cWulD1iX%<$<~AI?pF(^c8{l}4vl2ll;l8HHHx#$Td=fLT04lV z%mYInTk%{*x7gTxx^vddvv)w52pd8CxW}RvyW%n*tWr(>_>F*)ly%$h+6HX0a6)|| zteOuQ*R_)Ix#SN`%vH$Jy@L^G^3^N1LjT;fd+!_zY^`Ey7rw2Cp>urSn$^ya8uDTx3 z*SnoKjK7F~1AQ;VFx?|eOs?0{2e~8W<6QULKL_RE#6~XA>hw^?Rb2`U~9$ny`g?`G7RsvwhbGWSwd&Ifvs%y zxY^L*_rqiv0e;k-Lq^jzYJhb37P^yNwa* z{QifQ*MrmBRQo~6027f+cIItkp;G2Z%2-%yE4=rlRK@GdRy- z(WB?mC7pGWP+(-6kM4e4jdNTHbUnf=8N&uwK56=cdx^>pJTvs9aN1>d--7g(q|DEp z79Sft(yM9~D8Aqi2_;=M;`IAgC)uJoyu4H>4G>Z79=~~E=n{af$9OG_h5y`$e`bgO zpp-R*0RL4TM>aR5g}w$aiKN!z7aNOEPb)L@pjVOjKAOsoiHP=S^vh?6_5|n>$Z-+B z2f6tJ1$tJm>L+zzBO8T&Oc^$K#qJD=Rf zIT{0Ccd>}>EzHnMnh{(M%-~>bJbpN-n;z{U4#JIkQu=gfMr6?zNJpmCOHBs670&xc zQrAU_I@3kEq^l3~Xbr%$oFzb7f!Uu9_KX-8l@+<|#dc_%3LOjN_Q?INwi29&$G8Og zSj_+R`hQphs)`MG7LL=0I89YjhpU}WxTT*bs5>QGYe$*Q!(3nY7rZ&sHjGZ#870n9 
zyeF4JaLi&S+lS6@E;KCpN4>UdUP8yYQkaLWwt98L>hS33v4yhIv~D$AuQoXeC^I-- zbvo=P7%=k|Th>|l;4JiCm>>U;w99n$ftj1@1rMgLS169QYjHBZCj(buVXrLuq8VLAn z=KD`a?9XHe87T~nVwk`?O;J`%%GDf?-@u63PZMS>ZQ~>vgx!TiW<&R9z@ok$s^WuM zwJquPx-n_~cRJjF4vZn->WfPWZnORQCkNxRhGw?y3b=<7(`><2=3$Qx|O3V8nqMg_F{kK~Od)2HP<`!(J7B9KD zSB;YW-`vv`Qs!UtoxMmHpz_>T&aqInIV+nlC#UjXLndRPH4mcPo4?pNpg!jYYG|;? zg0HPhS$1C!ys@S#{iO?{A#znAOhh1AoTaO7^;LbiTxO&yJ^C`omr&38vbo}kj%`oa zfO+5}kJzD+gG@=o?uGn|6oRC+_7A@)0sDmkW0m(C%isGA%=r7~`;p+-?QJ)9;xoW^ zlYH-kvbyh%FMOVo$%OqSD=V6oT)pSv>a+8Qk}ea+ z@!!^LJEV;qAWeLp`7x&+Urk;Z!vf^H(Se}Px^Pw+fN6m0NBR*c29(dH?=)x~dV=v> zUj_s`I)~k(fa48n;n??oX5YUcM>Y*0cv55UoHib zbOKR-X#u8gY8xL>R6Jq+Rhiq`#ZE@=-=5hn%uITo<);!}w4@$s;pMZponLW_{)drT z-nVm$B+fZOoN8*$TIbAzW9&A3mgBll8oR5-?IPs5ltx08Ju!)JcDC3cHKDW8OZdFJ`ndS3ZEPTooU7Z+Z?IjwB4LbJ?te=yW0b2UfcpJTxlGQfay|rGtJUf7G%| z%lyB2Il+B+Kd6C=^>=mU-z^nR4F@Z0tWM{sKMRLq3D84owK{*xjvj{ICq>j2CAdik zB~Ag~n2nZT_$RBkLBpw|F>76PEJH9gQ>K8p!nPOZ;@_P-bEBHk7{3wg*e@h$wuurM z0+Em;2BMW=T&@h;x!b)_Y9ydVxx9R5hFNYFo{`js;>ku2--@p$LOm5nsK|mN1%4Fp z4Jz=Yy*^UJ;gSR;QIVZ*2G1xc@>vWCo}FV_Sj2yJ!7Fsr&6mF~=S$}{GMmkrfKzJ-QOMxHmZ%k}t0l9E%6NVcr(k_637_xH|I7aPyC?kdA5E{}%%8?y_tYq2$LvO8>fT;4 zE+$7GwxJxK=g?x7wMDrI<&AiPBRWq?q#=2KW=vJ<{3o7()IuMC<)UBPMrZHOHt9K< zlsg3DG}aEh(Mgm;p8k|t)N{JZJ~AA$=dEX90HbGJ8f|itBig=72pHWGuqrzceBV~> zq2#V2;scU$>U%tMlm|3`_)DCcZkWNE{l{VnLp=*4WVWx_*B^x*Lbv%VJDII8tbqh4 zg>#P{sez{;sx|$Lg(}YMWP)&A6Rr&!35#a_)jldI~XOd!!@cI7&#O;SMV7`fp*O!N5Js*yGBz-KO7 zSRUh!PUC1@;P1#y(|q71f(|Ko{(Tpi!*{`Cu)N$93b|WpyMN~cGTZELg-UDu1eV|Q zsK=6tI5^5TBKKJE;#W*)uup26XG%ZCHJ7(go>{Le*cBH2V5{L9;?q?07|Vnn99zU*uQF!>RWmrdGyb?k=`poE z(cbtP)|EQ7=U^>8{oFp16T_8sP6vdYCxou&Q6J<5!qm z21Ra_cS-|_LIH)-@_V%3Ag2^$c!X@0`9%I3eE#>(r$FIg10#AEr7RWhhR7n+%GR!| zE^$GVoyN0aLL-0zBCz-_e%KvRct z0_I70$FQ-(ajX!Cpb9^Jb%)I2L;8*^yStffX7I2>jf)o^p1q>e# zSU3Y#@@R*v)0h+!&UZ%mw6r|wGqbbjKcnF^_6717G!uwbBvYcf77qz$%lRUGR74i!@kOR*0uI zpunoHeW6%qk$ZK&yt6_6?tl9(lLoLnGEuzy{~vN4(6X9csA8o>45#^=0zUB#n*h_| 
ze9GMJeURCxO&xq*!1(b-jd(2rT4GPr#H!Yl;pZ%^I?RKYRq0RcSblSVAbJ!4jGiv- z=AH822h`uj96_mj8`jfjj@5I@hjY_{XLj_}2>!W3O>6+!maow{>?dA=&GJlfbm#&5{n5Wr(b(tI#mq9s~-3Qr{QURU7yXDor)gS4WZg45}2|-&6(mR z@xl46C&nK<}W`Rb4j}wpZ|8UBuu#l((y}ayD-(T8yJe zdl2Iv@mu=>1Z6v*EgIX^>PGzc!qx=Pi?!cH{HKS?E(M<4;SR;W!`S{gc;G&{qd@%M z`{d6oR|ZEzo0TTYf&cC*a3zZ03iJOK_5siR7|=8D@#FpvTm0WWnuY;A8Q6T2Gf(6< z;pGL4m7W23)c`QY;y-*;@ElHgLEJ=JfA1$u z4oHA>PS=zI_@>@7HL^1_tkb3DQuM32qO*``E(h#MlZ*|u^NpXKG9?{g6>_dn^)G*a zv77ahcLH|tnyjSo|48sDvWS@y;1fgl(7J>&fR>m{(G;6I%57(R1~j|$RR_{o^6&y<;YW=v*S3%CXxQI`a=uA z4v$)iK!(y?-k_lUC!hY4()f?{(-hU8MYdStksY&>*`N++uZ$KAm@<*r&AD5--6&J2 z!7|D6K@sKV#oEzsF!E~6E-ts?O57zJXN~V-Sd|DK<;igPv)sfjM_|d?@80_V5t; zl)=75sVDoolM5TpRw3tp!&SJH-um?27LZsQ?5lkVyz!+(y3afBHU>n*ww{24GE2Ax zz9+#t?rfHE9OWfC258>c1xQ=0{}&Q|^V2Y#ci{F4d-!+Lr7R-j9vJBvQNlPvCk|;w zL0j8#g+=Pu69@B$of5w^7{4_BZrZ4&p6GXKVSdUMD4+ETiqVy!@KA~&16$a)-SBjy5;pNmq@?F#gAcpJ_}Hl0#-=KoxflB|84>QN!|p$@9s--|Hm3c;fOm= zjL_^|dE;i!A>ZE8f4Tg#k=z96;9M=DeIvG>mwuQ2UP~qO zGz{mjOQWem`om~8C-u4r^>7nV_;bm`_)y=xZ{}M;^VS^}+KD%@8M8nwDV2pFq7?PQ zbCI4^YO>r@jTzEp+D+a2c;HA@&D?g|B?Ap;9b7; z?9JEAwa8)h8hXmE?YB;KK3la(EdUVX?bH5XL^$un5UCa;uGmt|`l|(1Of)9k^T*-} zmLlOE6uOy71T>nCx~Ea*h~1e<(b#83huceV`5xU$S3~aWlFaUVH+{`sob0|%xW2z@ z0z4^|pt$Qo*=yjiacbcB*_XAeC5$EvBUHn%CzA}|Kt-ABMJfcAc@}jM+aMz*jo;qE zSGb=LSmC(&Zn{67;O`D_<8n+NfESAwj^`7JirXt8;S3Fl(kWc?*C*v;5)oH7_Cfj8 zxDC+59N^q2O@9}2VaOD773QnyLlxiA&#tVJ99$ltLxm*Qj6*8Xx zB01zG^dYdFC&3fPoh^L$CtC@UE}4^;aF!^ERr7@mXehTt8H799q3=O$Ol4J5cMdJy zE7o&Rq`>%w zEj!<~T3qZAQLpALJ*UnZ_4Xk|#;wMp^2p<|M4Uno1z1p`ereeWjmZF5|v2>!oq5ZRMSYTF-d8tWQQ;=DoDo zeF~vYe&2pPNU#)>o(teSp|YNKdU%uvT}u~*`(!p89Xn7_JpG4FChY*Md*&HcO6h3w z3K6$7JIB&g{;|d?oampl&P#?D!e&E>W3ek{WVZRfy~uMF#FUwXQZU zG|zlm@c}8!W8&DigWX7S5+Km*G_w~nBOhHk>CTNlqshu$eL}`kWELI(;oBKn-rylQ zAE_hKr@YBZe((_io0lyA;2nPiClD1Xz?670(&%La->+o>46|(F%xgYPfhDw%lxF3u zB9qxlE`%v>#XtlHJU%y?h%GQ9RpkF_jwowj7hT4>uJsY%2ePuWsT2Rqe47mInqP`PqJKSV(C0JSml;3hPP84zPIG1fp7qfHPtHedFcQC0sfZuKFD78B-D< 
zM!NV+^?KB6>KMkMie;L*T|vygOy%b7)5F~wI@jg=RLsOPy_0iI;%SVId+@>Wdq(pg zYZ!o&R11UpNSwGRoshk@!W{Ls>d?mc82L4Mb(Cc>L+y!l)ra4N!41#}VJD16y~$&7 z@VZ7RCiZu2xj&LyCY&x+GOOfqEAYdaz_XCp3Mi8dKSuJ}!h6=r@$~sbyi2I>Ref$< zZLgm2&7O?=jFf5m8Q2b2mA-ZP%|ja)rR&OzlhYf6awi5RUgZ)Yzij&ePyE10Ad3<& zGG1JTKbGK4J6}fj4-uAIfEMTgy%kFiutw+m$EJ2ldg}XsN0BEA-}MD)!p@TT60Pkm~}jrdo8CZ)~hfy7Vz!q zW`*~FS?{e7?HW4X`r|&e_ARwqtA~d-Gy@384vdemg#J5m{NW8i61ImsjTYv_B%rvon$19tuY`L!=k8AU`C5Vr>Bz%XjJM3I;B3Grb$3R-GZe|hkqQk|8%T(Hx_p} zLK>Z^R3o_aiY()@CqW}MW7*gIMB61krUm5S9oK^!`9w9WDksSI#k!$+qlbCfdT_gH z@=m*iI!Hzd=-9)B<#Bh$IG$hZZ3ofpl|T7VAW#Qyz?Emb`cr(w zDVXDpm}4oH2$y_4#zA4{(vEt-fXqbYTiNL%&Cn-4a4Yto!D)>vVugRoPGzxx(^*x# zir^^({HPEoG(*8x*~-j1<^1S*@nK+W<6cy(?5j4e)T3F-gkZ{_&5}NL=@)8^Tsib# zi`a|5wj^Y=6mKe)Z07cwN*D>ycL-%h$rmHj)HH!`bHDLoafySD;&(3OsGj?-s0~wV zCjb-7;zau#q}g%bE24joUh;QNubf3B~ujV?Ki~7zCVqSfoa*;{qE?WiYpb^_Hg5d>-uz(`A3|PeyKV6K*&B+=PqEBy&*U;bZ7%q!X_q ztxJ5r=10!7HqmqWVuik&nc$k+(fmfy5j!Oo$_pw+2scv=R=0-1vU?$4uNf$p*>Yxg zL_kh{D1mNN9GJoD0H*jellnc(hVV*tLuKowlLvh+=o z{WAUzo8||-7x;ND7Cwf>YSwTyEoeP?<>Za0fryLIxVTv=7&r7KWn5|SJSVmkPqZb_ zSC?_CpEkiij^62J~kTMNnnW}!~E z+Sd}J-3y+|=P2|^^T(0c3rygTD2nhQh5Z!r&o8l@N;9qKDuz_!CqDaOq?*05_#nZ= z$1mTmWi+N`6p6sL@gp#~GiucUBv$$uryZyv`^si)TKG|(wKe$-$}!adqJgHP#q;}r ztwAWI01-_xrGZ+&H)%{4^=^Kw`;K}%8i!1be???}a(5S&K=^(TA3cjTLP@#%lH-Jm zAD6vhv}y;oNqM9YsUu^4SjQ325iMyBgA23cZ^jHROMM9&-NLkbFc}*=FID)ch&acp zon+%V-4^quSU)Z3SE!1qFB zNqbp+#bczeJxG1d z9_!s5bQ3^$0h7m>^hp17;|QD^N@jAxoSS^gWI7J$#XEkx$!?Tlz@#gD{A>a^G37>m z4zZ;%Q2lEuYsv{+YtpJc+v%6cZD9QC2%8r&WYTOQ+EGz=2^(wKzeF!J*NS#5eywx& zYK8>zV^ys>3yxAWwa!q|&V1%Ns0_p`oVkrJSQDK z`{pELR!K8eH&cHZ>dSMCGY>jCWQVz*w+kW9c)viICpjqcpaBtPtieBFdgN-RWPuQuTD;JQ5l zvx1X7(iQ(=SxP0~avj4aTdpX9hdf>PT+pD~vU%;(=SwF;c)-7st6PKESKE zIrM5V&rI)o8@Fb)N5_s^D5q-yz1A1m^ta*4&WG`^$vMd*;|8)^PpsnccfK1yO^Z2i zfyR32E^QdiHec2`dGGa2<`eAtrF+1<;-n$|RPpXbacgsK#t*!aYK&~s&+DysGHit? 
zD_FLjirzhf8s8&7FWJTn-vON%Ying`*$lwTel|mvCDSCq3{gIX8-dkqoqtTu0I(#r z5}>6Z&XAw8n&$RISUruu*riy(B%;hcWU-BsX~E-{Sl@>FKP8svJrE(g(bfxg!P#CG zEup734y&Z1|HEJ_pq&JUQ4 zro?8hIHwSag>X1dO26fRqIq+?Mpoh{^CI?8wTN>vb)rm*SZs=sF>8Er&qkwrx$@$G za_ObWtRj$hoIn8Zj0FIEDSMzyq583QvB9LY?!0f^B*<8;Q=#W_XE~4DR+3eqbYxiU zXR5J4P46Y)A2v{xnV<2v+rz&wu6IaMU0SXx%w9nGd3~EVW6o=F-8L8vm_Mn|+#DXN zMx+LxfMqtidbB_J>qhSnfvuMkx<{O$l0mLCwo1KDeWvad-Tib>Kp>eq-3XIftFs+y zDoN(ZOVV8C`Z+I22S`j*!oNy4tY^D(0dk#iI5>W0zM7|Wl$Q}oeN(WfSOfWOBq$8@ zGk6SBe%xV6RvADKXaVEt z8SVqyS~op%z9ezDOF(m;e|A3Y8LDhG`9QU%Vpo95u^<-?BrR1|-|-xZkH03;j7L7` zzk~?ziz%(!F~{b~xJ-=2e+wEJP9cN(K{~p(Y`Uu}UlddgQLouz?&fwpqga1fxYDc2 z?)^OqH+wjnimVY}(#E57ky=%NYnI4Gp3f*r`}e`VA5ZJ43`*RSb%|AyK8W%i0yE_7 zpE;TePrLCYmg!+0`!U@-B?tUe5Mc8R-qf7H5d^<9`^M}yDXCYTJ`di$0AziuMkJ3y zKNzZLJ*M7jT((->EXL0T{ZWg+U$_I{55X+Mv8BX_zl1Y8u-09g{TYIjtpSzbQ@ay* zTOXNPyNIkPpS@ijI3xrjwo!VwH)(&wIN_1q&R#Mx6JNvr4B6hSjV_{c-(orN9oNi$ zB&E#)yTz)E_HrITCCP_H?I!%r&z*I<793QQ2}DqZjs;5`J~5CkuETVd4Gm{hLe^}< z>Yh@i%nQ7q)fBgC${=2ByKdDCFp?S%8s^6J{vabx>LS#JBgSfUm1HZ%^xZp4%NUp6DoL=4b~)Alhb+rwkK+Yz#t76nN&vYq@~-j%TbiSQl+wHk zaLM4Pb5?QyBuuV4;rqcSXCHA8N7r(w3kCNSIg(hvc#}xAQg)fkioSXi)-|PpIE*`m z(RtH@h;aqt7$7j}rm5WfqX-I|{|RK?0T;J^6zXCqV?@>tJi@ZfbFZJg>CJ@z-XI9x zK$-7}pUcRq`vHy}pBQ0Xfln8#wfp@%An2l@muh8(M)6Gfdor80l*~x5M!(w>t&CxP z`GgA3)cd(gV7QU&(bZNOIhiM0WD|3j--;JSD5-8(yOU#UMXQ)(nk=0vpYFT_;Fjywz7!SA9=B;OWJREm%(2tIl-82=0hL~^ATFMNPIqy@hi=qn~ zXyZn&qZZ>KhZsN}*QzX3u#|dm@64FXEm_O%b`L5b+A(O>dXAZ{R#JuqmN2oN7M6On zY1xg@P3SW4FFt4-ZD}B&ST6KL`{UG!c|Qv)c@?IaX!~;L2HeU8CM{4 z4fN&J1BKDBc#N5VUV}hiqVp};Lknp3(tWLz2F^1U+^L;fOJEfhk|t_uIlHtmiNO+$ zSEo)->5B#gu;r}%X4JM?6)2CY>tG%>T}4KLh&F^_MeKF2G+2@iAlKO9Shm1ft=!`_t737{4esOM!fahyW!_F7+G%iK>On8Errj8 znTc!7Z-j4An&!Y*?9GfXyJf?ISU^YY2wh9} zY%%i!2fRlm!s0j@r_;qzO>Y@+gVDkCcsqA8q94cL<8r6X6Fm(jg*#4-A3x z3ON1OD$`oxzqJF0bof0lZYgUJEEDC!#6kTs0#puian6tk)65ifU|?sn>1Z5fI{$S$i};|ehpy#fdXGp$~-baRq8UIb~k{xFNPR# z29vQB)}4`XeFejBT1P4@8cJrGDsEChRjU;_S9QaR$df>Avl8Pv*Dl_=K|GAf(EeI5 
z*yjS*POK>ozuN_Bu7&yn#`Wc7wTPbm{Aq zRWP>zK)dI}PLTkes`632)+zu(p}?BJsahaQWdLZgB{7DmK5{*SRKrj=PxBW&IC)Wv zw#F_ZE4u@6j%$ZOeE~_@jl?IbcB$=HesBoRG=ij9{}qf=?wK>UO_SiuF~U!-l3zI( zz}H;{(ANx3bGkUCnEp-Uh$C!+ew`G}ZwVlHg}Jmpj2b_Yf&G|Ffpasu$XYN5?*Ob_ zeBR4_9Ti&k(DVHUf}}6)?5K!1_=m&~+?{&KBiKJtXWdwM>^(<(^G(vG2PSfKrNHQ| zPy5NCKw(CKM{$+*C4P7`Ck<~qaFq{Wb~ZXFRod84?CSukEnj}1iA9Qzk=LNrb)4bH*yl{6KH6GIbZm}L+nLk8_DmQx6sE} z_$pjEr1So1u{9L#+yiN3EU!~v&W&50UJf-r$wOrx zh`;pBjo@qkguzHj(vVLDdyCIQv-rn=xNU&qNPyk#)joFHA84*)^=0X;JFOm}5X#9@ ztCJvma!$A#G!6YidljDQ$_&_{&RhiT28>fLst&9#Fdj?kAH$R&%!R=nU0y=RjRP-J zULnFyRic;X#mTK1Wjx1r|`H=M2iFA9728 z`HGfQ5M2u+fO230 zR32OfoEnt}=o=TT6{-iGOq`AY$Knw3(=7=jK=I+seUYEUV&zQS&okV-sUO2zP-Rtb z%}O}Wg>jZ6Gq02D!O*a@+|esXl|FKE=njg68WmC%Sy2ivNI9&q_qo68;XO%blm>aB z=BDBIoHWnSMM!lX2q9tb+@eufxQEgi@=z)9?Y(!tq;C|Mq-(gLq|M~yv4xWYStf|# z9m-=lTWd*M9b6Vh1^FZA8|_Z!Yc{=CM@N-Lqr?{8lJSwgt_XaKldB$agzAe)!h%mz zFyv95BVi-nyY=BM@GDa%%JbCO=ZtSzNKkJXHV^9jaO?A?uvSylM1yvGPKZX>z;0e` z7`*c(PLuTyEdV$fuV_^r>y@kyNGb6YEB3_uaYTT%OQXpW55o0Wt9JW73wL;8uP4@z z!h0pOI#UUhX%-gI%=za-ZlM(w5kw!T0~VL3#D8~ z9@TsFOocoKFGIJg+-|MhHAVJ~zIA|dydM$}(2X~L;I_=_7G|Y8m)GsSC%)~GEB`9N zG(<*6!lHxj9m&aYI_73MZ#=}rT+mjoophGoPN`(Tu!5Djv9IWU;K|N=ebS8i}uZ zF;L#)G}Z;FKL}PyomJNw-OWdWF^COF zu90^4LC(P*9#2*e(re!wHRg`yP|_e(JVLtPg&99y`sIt!96`WD_wv_;;q9B%Oo_yZ z8C-pA)s#r;yJio*j~GllzSJkmiu={J#d{+ifkd^sg3E~QQmF#xkLCZ<8=y;K`b*uh zGYAYH5A7#PjMlnEap@@Vco4l8FP?gL?k?9>oc5R}pq)NMmzUtII!a&+c3ehGD?N6` zn9-J*Mj*buBAdK2Mq3eonH9>t^kFs8b!o72C4;4d#q^-2f{Rl+fn z0Dp~rHjUF4ineDse)o z;Q=v6GB*$bkpK^`q~4RpG2sd0hI1UozuTF-)m})n1LxA3fk3P?Okr9EXyKo5)1xiRFQ%M0J^DeM^Xp!!jD zsznNYdoDes`7?iY3}VEc`;Xi2b}+vY8sK2ezoYbZt)3tSU4{7=&suL+{o=y3$iKQ+ zI@VFM4-!`>KrA%Dij*+Y*nF--M|5kMel+MwM-!4VB0aeV@d8cmE- zvPJwhm`%`oG2!m_Fs53~jwuS?*P$h@0!{fI34Sm!L7um@oDq|K5SM{ky$3^i5HLN` zux90`D>e8lW(;J_gXh$0p8wVtRD`UrCP-2kL(6E@2SMd49}{^5(;12G_Gck%Q1gJv zxkf(a;~or8S7F9Dif(VkiCy0x%zMV1*2mBG@c^Szz)%H7yuVFcnQ1j;*mPlrzfx5l z*Tjjp(Vbz)ZX8?n3cEzurYgK}98=K#s_6~Go+ 
zHaOt@r*8(oT6OrXKOEM8y>L0-Om;7E8*LNiF;fp7PZ-bXbsW3xjxJQ2!&GfASD+HG zwy_?g$e3GT-p_w)Nx&lF1d{Br(q%}~g!|BViR+Z;jPvt`tM>ZjQnc0!?Zuj#;4_X0 zO<%kqx#bpk{{d`kGBzXx*Yef2SefIOmM@2QiwNkif9pf1n1By0={;L~%A_82ruRz1 zqM`R%gug*j8j323q^TsthYxRV@dmOBKIwljMaPlZi(Bo z&QF=d3pxEJeNz@HReTV~vExcZDZMrY+uOFD&0CP_RhR7sSWV?tS3C%Y3pZN2pfoBl#^Z9 zS&6OEDjM0&++8YrlQua&9z9|5xcA{5u!wAlv&Xwy(TmZ6=<6 z5&4jRdRTQp&36x~iGgv0WyMT%bTCahF z!iyySi-&IqTOtg0jz}Y#rrSXtadMte-I`|)h2sMS$~&cb3|EzV_K{H86eOb2A?Npa zurfYMrqnDTn&|EAf?~F%?d_T4abfSx*`6SJkGKJP%s~Yw4F5+cAz#Fje6PFvlg$X) zuhkvR&R`0k6t~Y5FwvruN6Qyx@Xw*GX{cqN^|Sg=#)h>Yy#mz_Y_RE|DH=M|n>VLN zE>qTSZIg~d1bDqwDZW+gX+DPmDF%A>G{x8T@T;nsD$08@CUs2xj{ecL?h{w7TF0-M z9T|{x-^=5-JGbu4-qt5WP*7PkdV44Q>uLMe!a>G}eLj{-%qS&g0lEW+F7%OL0Q($4 zG&p)Ir)}GL#_W~V44>bOZgcwbm{l$A7zrJBsTGHJV?WDIeq}QcQxr`-{e7*{swB;i z$zS=GjV@k&UrV*>jSf{|`I#wIFtVkGUv?yJSg~p@K!E)CbUu)jF1CIqpgMS23)x~Z z>%XB=zca8wi*{pZ0|qde=$0V}v68G$E;dsWxu_;a++I?o60#|HY|uee~y zI~nH7x7};EPxbU{RN7&=V+u3f!xHFn-%-~S?!A8Jc5Cy(^&=`WRB~(*S;M^L6Y#~u zW5b^bZF(QYoOCtS`d6G%xdaVRWF?fpo&xy^hMI8e?Bk`a5!(oe2h;4hnXM=jBhI!*Ag_w1i>2IE5M%?=^n zf&wVM8zF^O4LnS1TS*}Ea)yQ+hKDY99;oahnawJE_&_Ar8fcuqc0cbp4?#zHH}T`= z>=<@K3Zo}c`y>eAG!qnUp$avrt;pTM``4dmIGn9&J9uPtxbTB+&H1M#x!&(`sTu99 zz`WEtJR_Du_i`Pl<)ST&J4X^ERA|SYGu+Z?MBOS!T(I>K?g!r<^I6HW<{+A!X&5~W za)Z232m#X>vnGd3S=S%3kGa}(L5fD*5?>#nKl(v}c6P_AjE^`F(8uE@4<=Ljc%mYi3Ez4kNE1C36k4ET+V#L6 zq7of;Ex+ZWBahC-Gq+V*)AHK%a) z_C(gU%EYZLU4|d;=g9V3H7`HL3e?j!HlkkS4r3ou-2$quEzt}#hez9ZU_dRpL06n z1#_d;1_w+9<^}<%RggSs9)~A7Xu*vZ$L|I!)rjdGLA?+*Bn54egri7j4w&clKo`8X zL!1dNrq*0!GPx5=V|F%EEMAHkh$eevuh3OaJcmuiZa?_;>|X7#stjF0?A1mpIv53u zumJYeKi@y~*lOO?oISCVrWN~$h;E6+f`nmdXx^FRc4&{?*-1sk)ET=t)F}pY_kZOT=x3YMKX*x|Vkz zn{!$W>FT(mG*5+=w|tcuY4-RQNqH}rqUEHw3&h~4K<85k! 
z^ZG>lFBSUI25sb~Y(f_#O&r~Ip=^`l{B8Rw_fh7X^)asFQLdLGFQ`P*yu37D{=FE0 zE(r@_9>!VlMR3MH`&bs$O?KjmD%C(FImetoyN-Yji$H#g!a6}4n ze~0R&(O2X%Uz;bRGF6}6TsE(NX`K(nK(z-Q%!zlD?y5g-i0XZcbiw=v{dQHIetfD{ z7(NaL6Sted!_+C7?^QMW@l#^R%v`XEC#V}}{d7T}8ClWZiOWB2%U*+hM5l3%+^A{v zQa|@dF<6qC)?Uhc=}D4rxrH#Eqv?|?b3s67Pc!<;xV+--a>4MAo~=@zqyh!G)>YQN zj&1qLrezk=d-R=QPl!De@4rzp^E$475q_a%Re=0C2lPn2YYNkBfflrRu&J_ zgZ2wdL1nAmWtOJqagdZ{9ez_*t}W#kazql{&#dp6oxU26dC z^<@wkQRkw-?Xs>&PMmqCWXTR{51ue}(8*hF{aE7K_%%f-x45}!8z>so+3g5emeYNv z6vIa$39&7uS~P0Unu^>|t57YdrZJBzMF-dF%M5&ElUcVSpX?SMwTU}&i(&rADxVfZ zz4-7h5tB$J-kQ9i6UKY{;r^I5kDbkQFS>o_hZPXXi^nlqA%5=M_$)jI=z{A{J2VDKfCsgifimfg65(oklv8D$_-=! z3unWd?XCLHFovkms>`Y>D~3^?t=*~`EARPAfQ3DDf4!NAgPfzR(fM>5r+gug!?j*> z(M_c;?Ff8O3OX6@H)$2*JU9?3*C#$^C&QoHX>oWb*kIDSOrs>m99?*_5j4e_A@?&6 z-o7O=oOd@@a#__&5ZLDO#*c6`@-80-6VW1G$~DWvo`~4wc2$3Tq8GrD)muUV)5&4tMfheqI}i#X^%011)c3=}&Ctunz_Y!ap5En&M(-Bkm1hmRaB`S!dCec4 z*i1W4jjHV5y@k%(!a1x`!&pde>}}ok)B0f)Ax+Md zOuGZ`(nq8|=^xlmyOd3=W$is?;Lcam?lt$*t`*W0-b7po=%y`N<|86~@NyW{d}Fle z{D<0eakrIU9KJ(Sid_f|7-enzgs{!C!nv&Pk&?&g`ij}*H5l3t8;9G~4KivxCJett zN`QC{R+-qXHcB6L1B!0GTc*OR7W*u4P#bqcvk%>%8i*VPhPhHto{D3y$wb+Kb>L31 zAbxKwnUTpjP_7Zw>q(AyRVHOBr9&{6Rr-j)n@TDxz1)sPxEr`k2djhQWJ|25}=Q_p{Q!n{};NJ~>N(%bEUUq_| zQ1@vV=YKmV2=!H}TPM<4N5?w)Z^@ldHu0p)V9zV0B2{xU5g3Zf&_D;PigDO4CFp$%Y}BEb^^ zmZ1dpcy25^t0p^>aN!p3{>9t7NxiKUNUZDczBf(v)(JMTa?A-e^y9h{j>k?FNwPOg z)pNhg^7V8bsM#ZJxNC?J@@mj8=jn@(ql7j;v2GmUVrEa(N>_RVjeK8!xrTlb19W}% ztB7rdcKPCQs!U^}NjgVyBSk+nON7gzqTu>NQY9y(dd#a3Egd*fl2J6s$H9=P+n)pq ztIOrJ<_{=8e4IfOCuARe%P1y!FA3@S?nO$bc_R_mP=AboVz3N8L1ChqXe7@@|rpj>VPI*PLL__4q9G^A3m$s&7e~;_M+seA@Xts zF|u^$%Ay)P`)%6`>d2eL=40aW)oON!Q{;oCfuzmACr zCt&30aTXQxzVY`jiLP#xjDgmKQpdG&^u*WQ4$SN8%YH>&sP?j|3tFL9T1kXIl29gMxHSRQCi73keM?}|r|?>W`Z5KoLcm?JVnX;<${5}o*r*nEi|=|I^t4&A zKp;6os5jXf#}Hq0-pF>lCz8raL(0Y0pk7DSBItH}Y&+@qVsr|;OS~8UnsS|qRYRy8 zNx+_FZl+%*pMwp`P?sS{)NGVtXH;KYy@L8*WBgm2@2Ou^ZPsd#F*8E$boB@H*(3&3 zqfxr8(R{Jso*lNmnVj?vYI)b0HHku;5Z|w(!CFcFx2aeU^Uqmv=~_7h%oMgx+pS$~ 
z@1@l-u1r2+Ctjv&9Hb#L?1r_?c)TYC@33{2aY#6MPsy#;cC&H(;>~ZRF)W))?wgy9 zu!~IPR6;;1Tw&@A@?&=2; zi<d!aGCq>CDF8VTmaGFu{g3nCOYFJ9i1wv*!Pw$lXCRgIuy5$lc? z_zL}%t)^6}`z3F`rE=OsA9K$ZFja*md8f6&Ln?YF&h5ftmPqV`!@USE5Q zYsdgB4=GD_PlLhEhyILYZx+9hq-!$Z=PYKzZ2xgk#@R8YbAFel%165w!Mdl0pSOHuD=I0H+VhV&`}77G zk&yIg;hF&kh`BI!t4HO1nj4pi?edLax_3Wgm$)%M+hP=LsUo}P7E#J88 z%-nnA)-eE+N?~8>GggsKRu;H^t+F$w11oYKL^<=1Q2?Pk#ZK%W2tbEkE=L2LcS$-26X$I zBzu5)3Desj-tR$?6Jg;tY*QNUsQlL1NosQ9(pDOF=;-d64?JSqs4(7rwwA|dhjxqj_u&7WpnLJ00Ugud09EW0cp~ zp0c~;y0_N#LQ~zy7k`k5C1+_ip!em&)04nZXySD--m&GN{9J)GZd_2J5WSa-W*lEfWKLTkX7?bn06)h=qLo zJx5T=i6NS!CxadN*=ln9@mJNkA`V#jLkGukv-$9WVCaXn61z%!q=u9T7>5MJLdIRq zNt~+a{uKtv(JojCesAUattoXkl5Z24f?!D!w`AWGe}0bB^y38Y-MbUi$>2tTPe|vE z3%g#hU9uI{4V?Fw7IQrD?zR7A0bHNdJLZ$-`ODlKgL9yxI^c2^65bs^?m-*FhH0=B zA)LxWl|xKxlV^-Wr=N?DgF&R~1I&RJb| zJSp7hFiNcKY_JrJIF?>eC;w-8x&V6+i;Hw(PdO}Le(Dw7M1C=0L_~{=v3=`cCM{=d zAG@`po}EB)k6)bZP(ZjCia~wkWu{a!Cg&BBqEuYmlJ-k*U`SRn`9k8o_*IuAi1W|F zvZ>{P>H@+?Q)^0o&k3@e5!3DN$MCA*rm2>{rIdw^7a)=+mi8SiVJI#rt-f0EGa!VD zG~f=CJ2|dMQ4I||kTkxpdndTK+naJL2E$j^Uot|N=!V{)(>}udC)aQx>PO0rjie3+1?6BJi=*Wy@rO>0%B7DR_f2xP(~8Vj z4>^C7@$MA7=ASdZ+gu5DIAYGVwRk*o@z=R8`TEzlL--OPL?5jw*OcDC8SyF8$JU_h zD+NVLOm*z>1i<)eLo*q4AxwWDIo7r;LZds`GwZPIi!M0e@oC{05 z@b@G!NEqwPRnYrsB8SonC^Tc_N2@w%B{2~O$R$r5&+%SmKaR3Tc*qW#FR1Lk5)=o< zWR%=xWriqz%icNt&-YVLScN%t8^Xua9M{*U->=C}d04X7s?X0CU&1zxDNK9A@XJ@+ zrO=bPD|SiIH+%`g_oW+#C(ef)qZ9V4IK7@V-ATUvIWQa3Ouq@36BAfQYH0S(oiD{- zyZZd3$1wM+ifFQ&J@yx|`Lh*8WlGgktK>bVk!3so;CXyo^0K1;Yymi69hxUN%xkj}g+XR`@%9s5gQ*Zy zT+a*>cUiRfBvX3B4s-uOVhtZ~6J%9&!a}<0DyYEJ*Ge?&w2NZ?OpNb!cbQ$4`X^#g zAKypss5wd(mObv`c^Ol*hkJaPl3p8VValD>LJ&>%N&}o;M+Ppgx^-8sur>-50}F7w2(k{N|3g*C^$ogmJ8oU`w{YxY(71q zdYER&`}E-fcq0@lsNQg>TeM=PfcS38)bUxRoK$Q#*jaLM6mH{Ptk8rm&*xagM!dk? 
zjJ3+9pMAGEsqDe>YQ0rSM98;jYMjk2GKb-AN{h8?3e-Q$%Tqq-%8eWC{0QfOP9OZj z+4bX#hw{i%23{eK!%ogYk~ro0bq4Ckiio>T!%Zj=%kX_a+TSqKD7B4kIo->Y| zIwXUTC90r!4IGvJ{}MjOc+sQMA-VfI+$zO_H)gU-xkqd3|GHjbCS6L6jKS zp~w?+n6$SZG3H()WBC)dl59ncZ$-ze(h4Du@WbH$Q1qwayRk$VmB!Ts290LUpW9G8 zG9~o*<3RNB`?6iO6yZ^>ohWj}M99t&B25HlSK%dHXv5}!3Ff&Pt+(oHhSY1@4r3BM zt?!P{^|$6TKe+7t*>L!80lw-H^L&>10@iFWpLE+p*;I770{f-v&;-gLW9;ru z{q^Uv+DAJ&2?*G$cPv3KmyPY$F|=yAtG9NC!><1Bh6s6wzlZk2m4!P@lZziG=fLUW zS&pTPlm_eFX9?{$sMf+PmaX{PaHXSzs;wTqPu+pVgx~L2qw0UQbJ^zZ#7RF#NmgC* z{?SXZ|ACQQeUvb{y@EV&R>SIn=pAELVMI+B8m|I4DnHQtt9qj_1oZar2-iB!hkN@% zd=)4_Meys&^IT{pL(+)}V%+uO8-6-`RodQ(W^^^;*m;c zM*CiLN6k)9>#=IP3{2f86AuX?3I?)#P`sYO>WkcH%a+u?dIHjgDByGk%HH%IJmt&( zEaHP8!yJ=vO)rG>cylD2|8;3CsdN3AY1}9=WV;Q*K(hWBYa>UL%lm_l%&KIeBRYa_ z(wTWz!`;>U#czqipzaqI>593YM7ojY1b2O|ng6 zU?3^4@B38Q92iuc3LnWlHzJSSe%SxiX^`4U@HHMl>mW|UY2Snnt^Vx0j$T%11`nNR z5puV>X3o`;?zc#jynK^~{ATDuff22O5$&`yiDd&*Zv+Z3&2f z%{Y!<3o@E7nLLJ&yszO_x#3pXD@_zS?cG=-)0Os9@?w;{q>YLC4eZB4oX1v2ks)1i z?jxkc??}F0?RMLx^=8~G4MeFV`R-P!MLkn|{o(q=>QISuh?~Ul=de_b3Z7bz%DcSmu=KNGw!@cR3dm*wa^g;%Lc7;RWhcC8`x>3(DJu2b))AKc8;fH zgQt7!HU6WF>ZppL0H4(?ZNL_9>W2gs6;%^uSwpOT9zYH3)U9LR9UN3=p!j?j5oV5C zlLB8-Pr+%sY9z>Myu^y!g4cEO$q_CXVMb-oagEf}ku2g_a;V_#<S1Q*#JlMm~X$jq5jFF|kdp7L`wo(i)-zHq@vOT~%poP`4uD zmLpD-sAcsYns~}elxotm;~)o=NJsU z0ad(L|2kzquIK~nKW*hefU5HUpkfhAasc`%Z?-WWJ4BG!^x0V2X_Gw*#klg?xhWqs z^%3^KW6Ljld^r_i<==0gC-X&Y0x_{ zGpBI4y70gv&)De2yz zD2E%?=g-nksL6J!X*tTP#Gd!Gik-QDM%_b`WUfF*u5?iMC_ncE3z1G4cY6Z|>f>pR zWoJp~cRnRR*Jb)SGM4?w!~8U17qEKT%$BDHcvY%75;cxPp<$07HwB<4sAJudkA!@u zhBHOi!7xT$ZI!7bVaz_+#@h4ou9io_c=Syv5KksV$%-hwoIl3DOwBq4GT?#_KHIyq zZ*i#Zp{tr^H?#QI2c-|HC;Bht+)lHC0!Is94Nwo(FDK*%g zcL|R2GZj5(OL{BErp@eoB5Q4`FVn z&E#cf(WAhsrOjDW-(!D-*_0@t^WMhPZ-}Nx3nR_>$~1n>tB=$ z&-%4N)M4^pc_*9Si z+9|Q2;?VH~K-gOw(#B+3V{K$pd4@dzYX@n6oL(WgyW#)Bt{`~f|04UX?FsLZ%TN4j zwK3v}n4m}z1+I^OK^wvnPgAVR7eY@i%Y4I<8@nRyq7F1+7!E>yEc=rBb7*hrhskiF z;WAeS0yB-oxYg!DfA=Shu1>fL+-p2lY_;eS-N+QAyljMX{cFLpdBJa7do5DdB$g{D 
z({PnuA|>^nN47H~YqJ81G$+1MUKaBM?inYIjfVsee`J5Gb#6w0ilQqA@Wi_IoJim{ zh%tKUn_XJk7qPr42J-s;A9Ic%0`Lbj`GCG7E*+*PX-RX0)d03~;#-5+SjlRV&UG?0 z^cec`lj$hgc>YQy%ue8YlrTqo32OU|?R8 zT{}Wa%I{bV`ZHEDXP7=O86koFMFG@H6XNh{b2{imF^;{3{#bBag zRM47C2eiGDs6sfLlV0yda@oW5D_MAkz(KP;psj>mS^K)s%F2bgPP3ba;QoIsM%g*6|(QWX16YKFnp^Z3Sw_}xo(Ne;<0 zKiD@$N&8Cz-mD3w)zjZtg$M)1uXq;Ba^$9S??Xv4Q{H4&J2NWmm3pitz-&(B!nc5% z+#Ah=**xsQW;Ij>sDiiDxVvD_(N515ZSQ!VqvVPc**k{aKXQrk}0PuRD^^dO^YUHkg7GUG>lbvm_GIvd%Zc?Fmq+a3klD>VXr^Jg`z7b!} zv@UD%b4cCzyMU1GG6V`il;bo?Q)6I)YL=Ty(#LHc=K;zT7zcJI{6A@CwY_|HROT_X ztP(kx?-~CST(;oIT$WyCxzPMrH@`#wfg124%ojDcUALt5OdbN_ zTyYwyy~lZr>u$tbPRsk^lb*#KKb>AFs+=9W`)XQ61^!%D*Rfn*xC*R4b4KF= zTwi&Gsn!?X|IR5bK=XnKQbPYz_TC+Ue5WE9P?Mz3c(r#i+=TK4*GIAb(dhx=a&eJp^!@T`<`zNTb}0f{`qi`GSDXbB^Nt?)<#nUhNX{e2>y zr~1WBZ8KBLLj80=nxILL^W{>{GA4U8Env3E^(iaor7h^4SNJ#ih=o|a`xO?P$*mPN z6V5$4Z+0hmWLtz2&S5BFC)xE3q{P63Ki0-Z)_#uf;yKm#@$)*l*&YL=7E0?|?KAg} zbOlwN!sJ2?v12O)A#y97+&^2Nzqih-dr3BKK1t}fqL3eyLv3(xO;;BC_O_37Ixdtn zRo-pfZ+eh`@?W*76Ufu<;e}AFE4bE+aGN%Yj@l$Hw`EBWIOp0Vopj$HeuAPEk)MQ( zcjS0$)36Fc{xyEfR{=-!voem73WWg>peVv5RR3U{0@Tk!q~LpP5om(XoB_^fB4wi;mlqIreo>j{KkUJqb$Ko20rt|OB zi&3%W)5oCK+;I|^xbVY1Rkza@&QSjE1`I6PZ~!%B`6GXUo;XfnoqRnk!1`Ghjxu+o znHWS7U=n_tCCYVC1ge;fO)(UF|66Py8%i@;|6q2?1#Y zQ8^$5*pFfC@#bcwU0zI=eK(SkI?6}|lF5$Dcx#}X|D+W}F-mEd;7nB1!LE2tvV)^v zcGaNsOyUekW%Onl?si>% zlf3fswzc(u`M~|QRbxr+CJQAT77+Tw8NtJ!ECUK$Ro|9BGmHpv1_mE zCT4ID@rRqDh!AI7(9)!{Y9KyG#;bHxB%hRk=he!;C{XgFBId(zl;SSgM_wA1EgiX& zJ7#;OFEzU#Ocwy%j`H$1bpCfGJI9pIly6zo!{Dj)BVNJjj z8@$FgB+IMtQy+?%cSX&RO9SfjJtnnV3<}o>Q#T}Oe>7?;jY)TqP2(3O*|`4I;*+F< zR2`0;JULTGFk2D_5fA`1E*>V^z@UIUu*mWiYX3q8^(s#j>i6$idBNdq{ zR$=4Y^RHNVi2uSDU{8aWi&aP(qjsSzX z4BJ7NVs?18z2Aq_--oQVOy0|wHMLkeI(D{RhPgh={LRCCq~V45hA6tej_lsuD+HG* zdk*~@9ZMMA4J_*r@Tgoh&9CYRmvl5bLA8nAW8_TPRWpJ48-lxE@<*eRq2W)JL?i2v z`nS)(djBfqFL(t0rLKfjE8alnGWZI8XQ|j;lGc_p!+7kJ`Ci!!y$wiQzr1Z}>MSjv zoS!3Su?gjzw=2zdP9Fp(p{Sn~?6dDo`WM7^f9?-m^qj<*68Gsl)TSI#RxOE&A01ar 
z(mZ{Puj9)>ggP0i$+i>N4xq?6X%FsEZwXLZm_?9aLb|?-Pk0uUs$R%&x%3toe{x>i~CN6 zcuGYsl*bG!2{2kw_%a`?P$?FE?<`efRnq@9E_-Bx@crcSMDBW%sI~&*5VDPcc|S^~ z@qKU4i0kQs+D>gL*q@#12{s(D)9(0ka<=Vf$iAYsqGYU1&sJMd$Isyr^GL-!|J>OO zctcjgB#JpR$#qWnzZUR&c_pB=j;<8_Sh5>_i zBB(X5gZO|*GwqP-NpWFotrW_k*^D+Z+=Fe)9xhY4jvdJ~$-OGDp6t0_6fU{C_&=0U z?W?*o*1|Si$r>*5MNMTrAjeu=02u`Ry<1@{BJ^_)Dh4yX$ynu2!&(AjOqPYznd{~T zI<5n4Gii;eqO0}J*HuB!OEtvhhHNt>*hxSON@-Yb~IKP!~!W=?(WK+g`?!lyl z7xY41??j(m^V0XUd%S)UoN#pwtJqWE1~Ko#ytjGt^FU$V`>9?AKz+Qd*d zY+?Do#XYK%|5;)7rqr+l;6e*wh(324)S=pRq*2VFuvZK2!ti*cu1 zqr-YbC(LjcW0@=*(||eDTu8YQLjUJSMfKO?|r{LGV0C?(Qo;ccR*yVurjs@pnqn=hx=UYjJFb0 z+!b??h7$ClOA$-FCX~Y4(l}0mmW7=kDWXx*gI(B#BGIL!DHxUF@2D+t9_#zlrYSwUo-VpBO|x`jX$~~eGSOTsM|qTKxI}%H@Y`Dxq;4>)LP3H? z_a^U>pNm0Oxh9{I9Py!Z7N)`iMFe5_pH?OBrY&43F?}hV55ErF!t+h+JLKW~3$NG@ zuxfsHtnX{qOfws45$~?jR=oCi^W>Gy8JaMq`x=!LbM$*fl$0Hy{6#%y){7b96K#(`vElMo~$t^T0*l zi;!F-vciY#%*|K6p4BEJf{hDXihGv8?2@7w6iYI5r_E>NwR& zLpw@glR;z{p$-FcA95_I+CfTOo`?7tnt-R`^g%s^vLWVO`c1YWp4*zUFZ0>I2}`T7 zxTJPGD1JQv;93WnF0g}}N74>r2ziZj9NwrmU2I?)S3@_Dq1kGdCnKJJ2*huE>u8oA zK)$gaEHnDKtk8^7BRK6)_+$xqV`s5N{_Y1n%1zha;g|N!4q24+Ak!oGA&!5|?!;j( zUbT1)SGO}@fPS8V4crjP6YZ@j^MU7@T(3$=09( zf*IOVHdx2wuCnkdCm7U-vb?xNLEl&hp? zED%3EuF0py;+|>RQ(b9mM}vRK=#TprfacCLKWhJd=DgmI`B!y6a9%ghxkUU{A0Sh! z3xU(hl$g6ahLjQ}ie(+%7hN{3TE&g^qk+)N*B5;Z_I<5-pGoWTyhY*Coe^4BX46UZ zX@#8W>52XEXlQA@T8uj;;(81)H(*xI;6VaLmO8|D0vJW2-%gT*w_9H3=CTI)AksW{ z{-5?-oHI$2`(BVi;Gu<>&jUAYjpeWEAxpUhF!qb)M;i%EN@xV;u%@VA3e(w}`eGBp1=LL&jc(1Hpa*LbsT?eF{NZ#Zi~x-*nYAgR>~ z5Q*3;gf+1o{%o-5ET`c%T1@IH8Q;qw!$KybW*{*}E&pV??4Q=P z6=AxF!Fz^%y^CXiZ><^4gL`k_H$FJDCu#nyjh?BK9|Q0i6J+rjzj!aw9#NkNZ?{FB zf-G7m8pjun(I9l4Nzq-~X=dRpZGW$#=tiA6}7HF=? 
ziffdnK9~||;?z$^;r{!Z!rY%$8x=;N%Mo5PGT?ewkEy0vF3{?-H5Z?lRF*FVa79HJ z7p3#e%kAJVzga6yJ!R!f5B=aKb=m?0HXUv&wVC?7v~K;d$w_P(aD*KHtG`2Xon(1^ zWe`?W^3M+yuD|wy2+?ct5z*2Uzkvk&la_^M?|v8`pE{c23hHWAjb{rpq562tzY(!c z~simbrE*E(5kS@r`4Qey-3}k&+xK z_&H)z6n|(ueZ4zcb2mdAavzBpT!5{y;I=~uNxT*F0vT$6^@6=@I=A1nzmV`s9*@DT zghx^YICb>sEhjJ ztEzS-hfbt%=#Tgoc}2Bu42$BPyPzMi%7Oh97aU<|jDkg~^2$44fzCZOuR0p+>x|Xr zMX_l@h7|XmKrTINQqr_^z@P)ZJkN3MBq-?s_*k8C(memQk*;@sT(IlM5FUx$Bb=h7 z-_x##@?Q!wrp108oyf@RE_am9fx)7ug;*SGG|XRqG#cfuHnh!Rl8RPhQT_iOtnxZB zD>O42&lP{0eUO3sO^%N!=3Vm&uyDa_2T}=P|H?x(6i3$IoQDe2@}zW3KI%U zC*uw(vj))TfEl3@3e#;g~yM&1QMH3RK*8%2^?Q~TlDDsUKdlkj@Ke4(K>GK*->yMB{g0Z#U^Wz{}m}k)! zxR*Iu%796bVUhLZqV9$C${JMY#A63ev-}s09BpGZMk(wE3<+OAwNAxx1byCiCyPu3 zNUbLymns3@v3hpgkpn_z8oo2pi#2ed7mM=!c4V?EcD@tmNNWc$g!<7t*I9>qmQOCW zqJ?E3_jKrd>*Z=p)=s6yiYGq^e$?gGk`4nV*`Aw&UstR^UW~!l6C^)_h4NY8oBB`% zC%bR{_Yp4|X(2{^bGua!W6-AOGYFg?f#p6Ao&M z_;{{fhO`ZY$ zu~UmhpKG7@1eDOu}OV9!#`j)sb7&b&$T~okoI}Qp!leQrw?Ky zdDR1)qrULb<}E0Z)Rx>pb36vZ%fdvS7QFP8U9k=D^3wfpKq}^8%++6Kdb3EbWyd2* zcZ8IEa{4*wsVrE}qWZ0NE?83t*e!q~WEk>If@E2o;!f3t<*gq5IHB+^3}tUY!}I*_ ztLw}jYa@k-l5|wc?=W`p|9alMQ!%e6c*c+dkWsBb*ZL5@=+1{~y<#dWGd0k>jA0A( zh7~=Um^kf!(?AeSKqqE1P!U%e_9&dTTdgD(AL3+8mHwakBXani__hAcd)l%cq^p5n87#atP68TEC**6K zx_iu8c{O&0R#v~&CRr`dfi)OwW*I&E{?4`jCNWJ_LKf|&uqmWbUMDLE$`FWI+egW9)3!zxj2~=>+C?#EgRHP z5RXXbf+!Lpj_)cD$URf%VY;8-tvQ1X(vOZ7qomOol9slY3W$Tc>!v-fr=0|We060f z*H=p(EVdR8CAb86*XEyIPxXsq(L(?N-RMa`QhLk%_YMqc_ygFaNeD1{KHTKeFCezh zHw<5hyd9}RRS|E>iRJ9S5S+q0YPd6ps0~r1{L5JXp~E*FJFO5_KdGN?J%4x@^)2@E z?CsDpnG=ct!6>s_g83raB?kwZ^^7+>ooRQ5#vftDSBUFSiGE3}0vz#&`*?&y+yz$m z|8gq@k$x0!8tzKp^ZpmNV0aaGHML}Lq~yo|*JW=Q^Bdz*EK0V52!2umtbrPo%@>VE z6LWB}xKlmluL(b%L(4${>7_zeef%q1+g~&F ze3-jZ#buhok;di`w}fs~_Cm@j==eIk_FG}|fp!(G>8_kD`~(>@=nV+KAJNkT)GPAL z-&Kd^FAF`Jyz9lSIHmfXH0oa0Nh%mjn!}3oyQ3#}Z0YV}&JT}uYT0_t0><4$T9bz` zmki?`-L3zXWUyXqysE*miZ&kO?L2ac`J577?&4u=_+^rcN$;3WWKPDgE+PL1I4v%)FR>V!7Hw zEHFnOV}}Mk^-MH2y#L#`)XC#t2h6R~nry8n4tc6oTK;#m|HuRorW#BR7JAq)aN)!g 
zGb>-IBa~&d;#520nlJLx=&rp0>Ud~Gm+T0IlmH>d?~WTV>E&dwsw1iRlfA+QD;FU3 zZmNCP5_j~Cx%Kizb4T`JDBcg@ii}Vvt09J8*8S%&RX!rJb!U!XVGgLSTN#^!(OhDX z`@*BKl85f3eZ0zqZ?ah$$R`}IxZ|oFoKqsD?|H1V6Jx+5N;Sx7rNK&2VcK>A^Mp*| zDPDO!$w2~)ec8L4{F#79qU&ZOe=R6xmW2W`KWnZ&UcVjpO)#}xpUlj+_5rDQq{ToO z%kQw6Hx#jC&hXCE2`>JAq(;Xr%>WTpIyAk878M}s_r^g6F}{aq@Jf)83HGhqVI znpUL-mvYs@w|MT9BvfKzzB;98W@#3^bJuFi(R5KbMZk1vuA(8J8&>Pg4XL0j?DJf$ zUybif<_l-?9*@#_K6b_s_}!7fS*k~@AK1l`#9@XR;Hj|#hkbw&e?3z6kh{#DxJ+EG zmI0sRi8~EiMp#G2bwA;EuWy!}`UUw;Yr8QswS`ZruQLvKBx#@2x7n&Skha;xlY7n~ zOkL_taB&Dv2J zM@k#&bm*x7phspO(|()RFCOAU~R=FOSXrarObmZP+_z|C5T64*sPx z`YZ?rs)-F6vr8FPcMQ(j^B_2;^D{|R2Nc5YEv+JfgE`8(l@PAqW@eZyFvrHZR|@S0 zGKGv$bf-s{T?w$dg7GxFyEo_ECoz!gZo(G~JMSfrl(^a5;dWLilrlR;NVo@#Rj&=mA2A?^_8@wD&7>YFCk;{| zzA4~@TP(7p$Fjara{+y(dq=|kNS0F^4J9Jau1MwA%Htzod~opY!+pt@X51hsN)ixe zR}3Xm>qX=ygP=T;_)Pi77%GY_=PM+CJc3|ZGo7^5I(MvNwY{q6nW9Y^i4lAVY}bHV zeE9tcv-bsZQc1hk-sGUKa+MTWclA1NRb$qDxMe8LeDr*NKEHz?`iNu(aT<>tm-|HB zJoBZsV*2xj`&d9Xw5`N7b=3`|sJgP{aM(@>apa5Dj9^|xDmpzH2EyO31seORA2fM~ZN_@EW^lT5q&oH6T>?AVPWdp|r|=8o{XE5K2eqY4n_%nZ3DF z1@lB{UB*Q zBYxkbR{hbZ)88F?Ry2OCunSy7td3EJG;`2G+Lwnb;5s&A`slU(@0Ft#fcz3PE!T#=?3K-L1~ zX)d;4ln$(Pa`WyH0Cfh>bW@&{DP#uM0;D+p@V9Bq&S|h(9y)W$5;b^3GnZ;53LsEK z08Q({yY%+3Y6k@59f!4yWxt!H+0yKz$|8pZiddA#@w~9w46q??x9Fq`=_@j{1fLsE zZ18g)JH=XO;JG_Txxq`>R4Mf{+7`b_#?GtmDS>F=)i*Pbb`|>;1KuZTih7L&`xj{ zYD4?|4=x@ey(DXn^y5Y1>OT2R;Rt0Ojz|G76N3QQ1Q&b;_ff>!vxSPk`a4_L|yJ8?8USavn2WRugH zB1~9HCaUWFi=s{(j*w6{Tk@>m6ee+}q)8_HVswOMoqs<5QRozge7FB`4e83V1? z%Mwf?(Sg|?A|!tmSpRMI&|Fe!vY zb7;{`nvec>KVf7d6jU*(h$8H%3*WHNaO9*GXIWN49vmDrGKrtj!b)psPC&GfKT00gZ7F%z%K~BlwB;b=XpH=cLd{~l^?EnMsqtI>4 zbJ&~fcArmC2<{u9KL6d_m(NTLSYhe0&0ht@bh-LESVoX5A|wt!*yo~G%wdhs5e|?) zSKkY1{AA8rLi7Loo=eG!6e3GHJ5AT=lH4w|Lr{OXaWCAE83?PT9bi zrytHSbv=rmk6*3&b0DC30Kky*=6475Mqo6$u%@xeeTa}O;=@sk&S9vu8yW@?0^p29 z8Da3JAo4j2=v;_shk1vL~%R`<+v30tR$#q>q5DaLa1WK#RY? 
zs+kn{$yPsi)zUq0&n^!7h5v4wF{m?AK@rj}FIvG&Ay{KnBzcKBsoVK1C3!#zsO6K3B+ilT~s~3f%aG50P zl1G!_+x1oWq?q$R=01^~Wg!rSWSOcNnA z309V-G_T@)?k5jCuM1rHBb#x3nr@=*n$jpmYbT3}eZW?8G>p>X(;R3>sZRBsAV4Ni z2y(k?>J52zr_95d(B$yYIqxA=Uy(yt9?*wELmDQ9tq*tfTG zTbxOOz=(C1J8>_vmLnVhHFjS&NsxMJa4qPI` zn5`)yTVGM0_|5mF4ADZ0L)-bsVg>0l?kP96Y7|Xu4tWpAT1XKchi9=U0{8Pwu!gGZ zu$@*}Yp$s=_7VR3iCK=28b1B_JRw}9PZjWiy&s=7zhIz4lHxc#qD7@SlZ4bL+CI-u zwMd)V+V`=PDG!jAsl`uR?+cpZeYrppHBtO#8b$+mT`f){Aif6i_uo9}5lo1e0njyg zYGc8pGupmFUH?;=``4iF;T33AgDLk^3R$J0@6#f4z~@|Srz`es_&mNhCqH@jNci}L zd<1}*hd4>boO}kk);|piD5eQ@ayLUY-g6|Ab+P#oT_|hBKcWM**jVii#@z%cY70ntl4Uz= zaWH=<&GlIbU9AWlZU}h`35&1B3X1XDgyD>Ojh_4}^}JN!GV%_HKOvrL`fAzET3R`Q zwuxLw5}++x$Jn>(UclWJ6ESwh?%2dg4qL;T@2)m+#@c(*m)I{AdY+~0=bqY%OLhOe z+mrUUBE-RwUgqG6-vjFK>Ze=-br6UJ)BDYU(^Hkv;ldB`LVFoOPf?UG~Q9&({S zo^-RvZ?x;+FVrkIKRl$_E+1c|ZZquOQcmGr3VgqNCC9Ec+Ne&;@`YfFy-}}G@;0HA zT}Eh;F(!N|w$LR%5fQg2`U_M>fRRCU$?x1;D(~&Q0j}@RZ@>lO_$z{xu-9SYn@O+%j|Sj zqT>rW_?*b@u!lo7TMN=8YxbFdD2L;(dYk@8cdw=F$Xy-(Zu2 zxM0H`H6lhQj=)CXFBpZa7Ah`O=}cUw%VxM$<(LR1!;41Twc=EMcnHL~=MCFE zK0&7gBQ_u1c+p+j#ZKp~?hVTOQaqDc-I!9Yn0;-39x$a7&++b`(!($fO1h3QhhrJ5 zdr|@`BCB@7>eSK-|Cjm86>wn4*QoyvTKT_s87hTjt{CVX2#ggD-D1;b*QVED%-6kr zi|}Lw8Fs*dgn-9;%TYgxVb$>6>zyi_v_&{G2l9dA z9L-}#E()3a4?BA&4h%m=kfD?tM%Nh?>Udewx0%Vg^`Jv)kdrkN@jhr>?$bJB!@z$u zvHb`AK>Qz<-{a!OFpC z=KeU$8WUxo5He91B9^y8P5Ac}yD%H{Dj=o3?RTFN)a|1y0DE*{KRI+V#x`<(C9j;= z(*n*IJ)l)wj|rz7qQ!K2hy%7Lo3_t5aS-M0O!mx@*NbrS7cTc<`i|$^NHrZ8y3N%D zeImKfKwlv{eou@bvvLI+J(5=zt*>cXw2=fWQcyAf&Kct+@g#EK|1Q0k8WDyk>5oAc zW7DjMu^w=k-&v4^^NWoldq?!T&TKJ&vB424QPW>-Ql--J3rCh&V^^euZ~>Px0h8D^ za{sMut-r>^S#TCANlsKlmM8_TC%&@HhtW<;MTn5(%!Tw%XV`R4<^{=t2$~C?p1lMU zi2_O_AZHt)qR_coV!w;BCv5OX%R@!3J;S>#Tha9?`;or{dJ`RJ^F>zu>VVXPSv2r6 zgorZh`B#0dwsGY7y89w*W$eZ;ab5S=d2ZWE20Z#izp)ky>0i$AWYN?^QuEPXmSfb6 zaM_Ie7ij=l<%~Xor6n7Z9(J;WWoZv{!kTe~p*q;!U)zd|d-RRy>M%rw3qIfFd(m^H z97JEOv=$$^eDc%^A-3`e_>@+6vhthfQclC%BQW`jt}Ly-fC>8t^F$+Q7c%y_^rHa= z35cL1jh(xSbHGO&B_ZMd6G!g}!z~!^A`x4%rS7BD`nJUq`BUa|b3R5Sgl8b?XNvjZ 
zTdlkGa7Q(%;Z;B&V}+_e+0zgVVBFbv4M-;R`63(-9$n=77kg%6wi+F*QY|vxLRnQ< zz#e$)7mgR(L-XZZ9N%HzXu2XIOY`%h_bRD}UA&#eTB)zsRTfDi-Q~@Pv@|;hjc_aH z3@f#wbCVXis1EA_j0>BfR3lRPIgH z^khspV7*t_l^bWkt*zttd7~G<%+FTglrQ->r}fqEJ+vtF4?gnA2wtw7ylmI?R-;e9P-)!T#E?^0ypA^-uw@Da=6ZDGs`q z)s%K$mRm>XKB*iKesMC15QSZV@Id3Q242>oAWjgR)bPrN=Ah%DbEKyienF{VKq?b` z&&=1nsIi%;Uh0~RevIK@YN~1LB*VO5Ew3 z%u=DlLWc|}L}T{fFcU1cr;&|sCBXsmh`Hc?P%Lf4Pgk>tV$j)J1J*fj==dMHVFD^$ zDO~2_lH4-vZFL_%>;vx~OZB;9Z^Bb81Z4rVe1p91tkE|8NZ|E%0gtRMSj_tK8uav^ z2%^~nFxnfxLI}fs`|us-ceL-u2m~$J)_q5ud>IFo8p`yL$>0E$SY6MU6Y!?|cKJ2K zns!hJ+{G`h7mX!~1`c6{P?Yb(;o<0jt{^m91BN9hx3f2ghF-q-e>#|ef)PBPP1KDw6|A$E1p4I!tFlfpMNQzbHJf2HIsMBp$D z6uAXe7g+zi?Ti#LVE(MiElKu&S70~{Yz|1!J*{v6SA-H4`0rB;TCrOFJ>h#eK)+?k z9tlK$PyY_O85f{FRGiOVSPb6O_3V6SeHcl&Y$hK{4$sX4R~{8d0Ti@@XZ#Gv={C6_ zP)o!tjCMC%crskbDvO8$4PZ>Dt#*bNpmXMfdf$vswX$ydJFp{MhXrRlowV%T`w$^w zNhUVK9T0J3{f3sZXidh50J@AF?%*}FmVyn6WCsQkbR`R{UJ2xHc`TNFBw!GS4|Kh3 zdEfvm4kKDHHf&C?*{Grd^|R+^hxN~pm3|$`=iFxAW!&B*5I-Q8DC*RjDpX7Ipd{=x zmkYdzpE&r)+0{2oHEo2I@MS-r>!VZ{0{2}w^8c{^Or^FQhX}}4Z%&F!3w_wuwM%}@ z;4-+zdi5=rHF^{xOhQv8k^@*=i!O52l5FG6i3SCqe}|d%eg@szd@{Qw5sU+Qz~%Yd2bxlchUUak<;|zz z1rRb3Dp`z6^qui9+DoIws9t&;jM#`S0T-iIHQB>nI-sc48R$Q^{kK74V z3P(oMg{vQUNQUo~SLcR{Ed+xAGEMh}K1E;3guH*aP~g+aZDEKv>D3(-(Jd;J3ZY9@ zVUio(sc&=sHmD&#yKXaj%$%RilJv8k^MrhJ%-=B>cG0C8HaVLd3)%l{LDRjN!Netr zawD4PiGC^6unItIHq77gt;f7dO>WHyRCb242P3r=5b%VnOB1B zEMm{$D(x~$6gZ64UbVDU7*LgQV!!^U7Zy{#6fffLHy^@5?21vLDUq%zeqL+1L#FOZLoptUa!={B_3BCSD$E zSCN&EAW~%IQV}JPmr94}oMkX*|J~qmN5t=foDlcyJCGAGT$4wdSV7voTTd2TsAch! 
zFCVXWQ7$i*o;V-{+wCU)+jf%sRFJ28 zBcsbk-ZEd8uVudP$XTes_bnkL zY5?Fw66d$da#@IiK~-_x2_B0{-`d`V`D7vhk)0hW9)UnXEu)flQKxt&xjSVducI{g zxu%8&XIKZ@&uq*|1JHFcf_>-lY`h;t6tAytxf-Ewfshgg0n9Oy-z=O@&a;0%@OG$2 zUZMIi$J9Q263B-ka7K5)aq~N_&3A{JyHzgzc5VtDU=?Mq_m|GBrOgd_QGTV^%fRza zGS6{vO*&W%W7_KNx)w2y!vLVM)7+lBV zW>rTUU&*3R9*oF?sz}}`P_e+C{b+aG3r_4&uh*|4y}!pz5tNhsnceR#=)!mbQXk4* zwLCm5l8&4IBM6p5XgD67gsFH(*~R;lewOUMb)mln8m%Rr0?6~BGx>)t(K1|HXL*q$ z%wuh4SpJLU`P9)ea1^bL$~(+a>vkOZEx7%hww2+J%tJHdE%vYN=%IZo1Mf9|6BFc) zlXnRMeuYP`EdjlPO9n`H2Wt_=-XsufK1N1Y)!GxTlSiIIKOKL&C$`6GX0^VF$2e~_=TzsW~_9WGy!2sge4iROH-m(?G~Pc zwidSJE&QZVgp*81@1J%KgSIT+Jx|=d)D+0sB&ct)I39 zLJp_;P;OtnI%W{^P(7{PiJhFC53p)yScOp>RAOXOskNO@%r2z09G5~8tskEDdV^N< zzOd!0IiJxy(0?9%H|<`S5QG^I!V#Lb&MVFpYxs_k%!`uu1**M~L~})QU#+7os(L&f zitK<7V`Ib`9S7k~tS=;8?p9w^aDY0FcX{m(T(GBiWZJpO%;--ptY4w8kgF@)G$l_< zF8pJ`b5CZ|U8#D8Ic?6p%tyxZj~3oG;BVG+i_<|)ud4BhdVJxGSQ;(mnw##vf}~oc zOx#H>y|n$#^E)mXVV2E6p^POKAXSz}*hrh8q39x<(PKQT42$QX zIYy!J-)<;-JgKAfnOp)lBmyLg&J)ohj6BHm`AL|0W^~Z|v6=3S0WFPlR7kjG6ad^O zb4Y0(2^vSKSL-)ht%2lR?>EO8Lr(*q@Jl8?>fxvyy#RIMb-G`_kSYOJzSGZEQ1__d zush>otXh0Q{k7nJh|Bn=xun$f={LV0S|!#uN5Uk?_^Z}$)o5l*M01oA;gSNFZ|*k- zEkn%%FNtVB<`pLk!6T*7Wuej`1YgeHvHQXUvd==vfv-&)sNUBR9r-J7!g)+4_FKGaAu!#_IGU>fdrMzE5h)6%{>FfF{~%hQ*EE>EL#q>v#DlaZtf57F z={9rZsm$saR#J*|VVAPl*BBGVO)ydx&i_MS{H&?I$O5;aBzE9BIc+Jm9b(g&jBKiV3MGX| zJb=%k6khO?4WNSjb}IYDn&m-L%}RY=KI85Aq;UsHey$wl zVx&8HvYhaI;&D4w#%&cfqa@>l>skK=%gitD)VpjHfUP3rWZOPjb_zp9CL|!hLVaMn z_nGQ)ZfX2~^yGwW*|>0a`shTV^Z@Ujl?`{od(~9>^y)939Y=eLnR{z9sL5_aQ}~*S z!_AT*)NtF2SocDO1Z78Icg#$)NtI%uISbr^)a0)hsO(=A2m$up3}};3+@Y9FAvRD9n(eYE8{e>5E4QKb0!JMqO&vXhh7!$Nopt$hGGnoR@ zHcYN7l$+F8IpWVtGMhY6WPj?T-}IAfvv-)zkN%4Fn(m4cMfKSTK z@vD3*W{V25?}uSgz}qk5acjlSt=U}eH~xw;fH#nZ)^T6$>EO5izie7$gTBRjT^_nU zb(5WZZ29jXKT*K6k$WlTOgxHpxS&h8E9s(lp;b+vG>K7Z1XD3_ag0$*`yJ|s!-UkD z>Go{QsJOktDDY;sEi?G|mV#WZ^j9e`q zSG9hd?Al8xo9IYvP|S(X7@0{1_KON!_}bzW7)2lxh;-E+79UAM^nqXBA!#g_)^At+ zYtjQ2N}g(^n~`0tDV$S6NINx5*BS7Z;@uGk%@GV(tIVah@c&r*zLvIAvOb#6CYjE5 
zO^&bhmX7aIE8@BFa1O_m z%B1nvM;)3U-1P)b+b+|aCW5$49qK7v>O*!Z0Kmsm3mn+R<9nDbB>Kd zvLf5oua?JimO!qfb%%ZxhI7A0+DSTCsm-4zZP_H!malN=zpkn{M# zO4mwqfvO#YWVXLadT>IGxzGIJ5mroS4JW@w^CIRXJn$DOw=Zh#bDs*L+aso$djG+x zq=i=zlVd93`yle8_m*!Kc(A(U1jFs~Ofy@I>ECU6kU5>!ME?_b1qt&yVGuF|bPnqk z4$6C-pv*(DZ8i0qfP&XUgr?_IoaPTxwj)Itf!J=hQ%mM;2JxI^0&~pddj~YkYLk@0 zSTbkpUlgm}yG=hP0brJ~qOCRSf}EWmWsi!pcLSmB0|*FM$oOpZM?Xhh+Gr?6>-u}1 z>oGSIpqqki!3ZB2fFvpNTMBO@FJ$X*ly>71KS8O*ZS)-fw!87ie2W>5l!q9$zrH1T zfK912g^xuH)`BxmVJbj6Zoh#DS3Eb`Q+By~L)w2vbHIkZte2~;pT#dBI_lDU<|E<{R1gbMF&7(sA`Y!M33nx4U%TRcE{xm+ypE`tw`bTjMnX|9W`wCEMv~+hul!|ZJVM7} zG!dFc^kw%_5Bly1o1xU!j7Xt(t&48*icENatHwacq{6lEA*~qehLY!`qQ}%-e~*{h z4$ksOvxM!q-R1e@*0|W_kC`N;)m0@CSfz#y0D%PH#SM1qhM^B(XOU&zLsAN~4LApO zRJ!XIfNSI0UVfcz(LeToTjbQ|cJMW5d05XZR}6L$)QXF&UZDb<(|RM;Uv3`(fC@~S zFSj+%%=k^uofkaGqhFXUC|6$=7xfB zMIq$)f zAQbjAZNXAS-gKD;)*jktyDNB*mQ&mkSct}Cw%#^v{wU$Dk&>LKmA;(cu+rd-&!@dM zmS$M6@NOqE9i=l8YGQ$D>E-0zii5*2Q~S$nK41bs|I5@FFo>sT7dazZb9=3^zEg*` zC$2xABq9NbhhIk(y>^teu76KN7ZVa&iH4JSnJA9DUK3nlaEfGz9w|+I^R1wVc*}4H z$K0irm>qI7pt4>)gI+!u>pt$dDK#uyd8Ah`K(_Y%+;H;sT-s8YSqL-NOw{FvXDW#LQSyR;b)zE6 zZQlOF(}Vu7p!6S%pr2}#%^J1q_mIh%EmO(yK-=!Jq+Jv$?>A6epdU?TX_;uAd3YW{ zS#*f4emvw15wgOtFXaNdYUqe3&w4LQyDT92YMm@vh>fcfA`qR`sYM)R!J9o=>zWay zGEfcsNI!%izS+uTN-fvtr1&>A>y*CfzGqPbTinN%qhO;{ z`;K~|w(zS=tU~&?oe`UV1<9LaqaaKiY^9)ogtr6ukN0&bdI`s+`5;$I> zh#(1Y4Rp57dV284O}aNrQ0H>!=jfSl8MF$;7kEBTI#s$4#uBs$Mp?+TW#M@`Jgl8H zoVBcNxF^yS=%F6;;mM?*IsAFyT)V#YuhSEUAVA3ulg(Lv#(vH%IAH3@&0F$ZL_{I- zUn>ireGcgXk-{#?67e!E7Xq+II>cm=rN#xnvb2`I)*$8L$u*SV-1| z4NTp>F)E)E7jg)4B6#lZz0fI&g64ifbzgHc-uPZ(nl3x#SC490B|!->{rZGI`1`xr zn-M_bom8L%h)!*Cr)cx6eCEU@F-_p}Cx?x}AS0)vlRcT2z?$%B zK?(r+e&WOUow31W?}V{Y^F#!-g10%5+ehHWk%ex1dh{#^6XRr?O1D_6W47mTZMS^BW#bQlf@*`;nwqZeI6UcoQ_id|@#|3lD_Cyhtl?VT0x# zOJ(!E0riMc23l{BEJFzxJ^Rdm@W=J)i$G&?Ocr#L55Io$Aml)BNR4qxrR#lyH7kD9`$_`% zts7t~T&ZxTg%00!A^gX~K)lPAu_|RaQOM}?HS=KqGu*{9f}5h$r3Wl8n4kJ*-*2nk zs@L_j;apxL-q*a~W&_Gg{r~FiMtTT6;_c_(ixGns5Ebrd1})Yu1S0m|QGY(e{%Z&M 
z9KOr!s6k`5Fmisun=p7DfQNt`0-hD3HXjr^mdR>r6CF*R8rE%0=& zyfKj=gSNH(iCNuisOtQ>olm?QW@%s|7_DhK^d_Z#PyTi>24HaKz|0PU9_dv=D@!YJ zGwdAR8T-aO8B0b{$|7N4P}-D{`F3ZP49wbiz!bkMx6H&sg|&z3=HY>9G(wWnDC9Sw;()5R2yCNG zu>gd@!~GQa(7pC#wNc%T80-F!rwX6nc_jmP?&kt$)CNM`x&cz8(axU)usZFuWFB&y9(uDS0#Puo zll^K?8%3TGRvNML;?YEG=gizckL=VGonc@1{h;VTTaAy22luc@?HrNKt5EspeM-Ce z#InNlN6<;PcE6E=Qa~(_6fBO7tnC1{O((;iM+mI92*0Wy^-l|*QAmebtH+e z>?{hTfl13rfnJdr{E}^KFt>r4?DHEnjrixcTx+F(D&HE*A|*PGP4`LPiHVL6)c!14 zi_&IugDcS;kBX4z_wCFPFztNlB%{p$;r)%SN-@t1qPeDom894|LTFmNK#XPdGp)4=Lq?{43i^gZ5}dCj9}oa!VD|?ar(1)q7&YFBr^U-(>Tnzia5{p@hpvPvKI!%F4+qP$^Wc3V5@@DaPfKpwAQ^l+0K~5tM=4CK z4jzgyY>2GDksUEKK0FjN@OLN_L}jO3SO%IV1k>~MlZzl3%dFYTR8 z?tl3MkP9SVO{*C-#I|wZg{c#+bT@w~<1+5`8Be9Ove3PmZEi|0*Pcq10YxJ`H=$D> z5wT=Fc_HPERQ#6)s_uNVu@Qz33{oCGiWCS-s-<9gQJqRxr`vFQT~TiQz^-+hSGtof zT6SOkz%)a2XIkJ0;gkMrh>!3a;j0RoKF61QrNZC%@#WFJw)PP3<_Ph{|0&}baA^Nr z-^iKZz&rv*@VzLQID~QGP$$n!pMt*>s^=v8?c?rmzO4vq*BtT!NBAxh=rgL50V#R5 zmh3c7O5AUaRe7RH#KhHw8H1kCTBtU_#-Xvs3AYX7ND8 zyQ`MBrdS?lrv$CmStE(1A|tgs_nyHq&7a-BrfjcRoZWESGE$$@i@@cM`xpeg!IDRa z4=wStn2Qhg6NbIL^AWoEosn^-=Bp6GOY}^9eKz}SM5I6WSAsVvN$iAeq_$n>^?9yH zNp`=W7*U!QY=JK*cPPpSGw;V!TYO}>2qqt$ek@Y}gpon<7?X~kbhqUgOo7p393BZ2 zyi2e6m2#cZ(#qb#8;SIBO0=muPTWI9y;4B|42!>8{e=J4SRt87kJ&8V$^b4DaQX_E zKeY~Eo4`5;CHJRxRAR2Hx4Vpjg&BH?pntFQy~teNz`okI2&d+d_KwSn8hfle)irsTI*t?%0upvde|$t$tZJOCEwHlGxmf)mXf(pY!5N zc%OL*Ho>6TWWmKdNuvvtuV;sWqCXqzw+5tSSs6~=l^mx8o)-rCVQ5N#=3xPM9Iid* z`rlW%hrUS`epqmVdz+w;aged!wpdIZZ0AeoeE-<^C|Q~|TKSe};NRgRJ(nY0D-aIA z*AB{xnQWNy{MLM}RBco{29S-yN}Y&qkd&sAgdy>J#F^Fee5aat4zAM8^Jf0YX9GvA z)m^GGMdj=2zoQ5T+k6L7eL?5#;E0xhHa?Q`xoH+D4ujHtS=Cu=iqW4%SbsO})u7Fm zu#svDSonz6!|oq8jW@1+6**Sjj^=_A>!VZ5tU93xd@SiB+2Pvs$yoS$Kp?T+0(;9O zE(R_pOq_t_QnCNGxLQb*b5c!^_{+A<828W2N*TcW=0Q$}KK|3E#Eo8K(IX)T{775s zEa5G@y7>{_No^xh#d-(xQ7bWL`KYvn+wO}7x-7uRyW1S}j^!U#UZ^@u0FJ8^VRd(} z$WT-qUX=~pbGa6DI6P!D)_*17T+@Y7L>L=xq!V^w#qLQ!;Vy6=f%0)@Tl zTRRwO8pTKkEm!T&&>55QBiPTzrnreXb=GlsBdoA4 
zQhNz2@EkVO8KXwM^s=0DgG2slf8%%wv6{~5!)_tDc5KTh{JEuN8Qz>~9r4;2`x^YLk) z`&JM`g%R<2@eThJDrk*T@4HJrf8t|@uWqjwy(lQEeB+CN^hIaV zV`*dZ8d{#E*HUsj>Y{b5WKRzm=UcGYvmc?7n!gK*-{)k_`& zT|@#l0=n82%tJ~S3(h-kw*eR*>{WW$zeAbbA7D*z*G6c719)^FbN#+Ql55E1ZyF2B;`sC~n7 z-Bq$;!$6F2b9Y>?wBTaNk$dg4XRTr7^IN91(j8wshHJbB<^OXN2zO!Zl?~3|1PU># zz<-y0dFT!aYk`Qsvg_}#xL^D8eBcT}FdVgfKW9cp^k0IH5*U$(PNqRbE`7-dkkU$(su1N9zEmxt-VYc#x?hMVQ{d*2fP}Q5^gY)J#AQHOm zL5i$!Poz}no-3#I(`&Hs=x5(ON7Yk(|7S(q+zU>xV{&tIYR&xy5c48UmC(pr)Gi&> zHUfa~psQqE$cB@UK{Z@pJynpdc5zLdV~4aNM~4((%`HdqCFm+oOOSx3Y{$%C0bjrn z4d%OhN&a<>IqUeIVw!ntRab}03}D@RP_N11GLzWaV4*R z_3!RkBq;ISHJ7Jg7g+Iygh$$bU0vR>uSO;`*ukFlM$vrtuj|L_al1Y`$#$bgG%#gv z{Co{ow8j%{#AXw%I9ktPsPWX_QcG`oeJ!_TdcrXzpp~6r3XA{z@*~!Y-C-n)cSfzh zT(nySgF_9?+z99ZUX!hVx)BFYEb)#LZ_Lg76w7_9ePuk>+jK=&Q*`ts?(mK^d%R0< z|Kz-rVD+sa)tQjpA-w{|B_F8B;CEkZ0Yz+5H2e`ijS6IomOMqPSSv<5YChy3*~b8# zX0U9t_EFOTYnK~m$-wdd+>COE%Lf}Q5acw znV+MY+M(@DtOzd)pCFY~hCi2i@Z7_NhW8 z(l?jlUfnNIwMeC;G_EYQKd-=Jip4>K1)KOO*9-P2fPa92r^_X#mlDu14?oDK-t2+1;+M0}U zO7-bfKP&sk*mro+Oy#2=n>4oS943ZAEOlU%z5g@$dx*$nn|>4_E+1GzMv=c64#I}6 zuYD(sH%EkU#dj-@P5}X#drYeoCU0E>uQk#r?rsicYJ9cNO|b>h>BS-oW29ueZYd_Z zBa+2xR`*gS0tz~g8V}pK3keL#0rTSDG^r4Gw&}2ueidYS-`fTm{Dym1x(AI-KKnqO z_KZNbIuNp~aK|UbQO9J-da%L2*qE>6q)DFtC)2WWZcX6xVm@`>TJwQeF8a<*Uw-{B zl)Z3JDC|Ob>3_w;PW+mU0f&d6FECBEnyNA&R8NQov`ZKx!Q5Oeow`T~J$XZ7qkn?C z?K*Lj$TaLMf+*M1!KI@LOX20^&+`<;z&z@f48REZ5cQ~tSptK z!4Rg#TUPu+l@X=Gh5H2?I7b6D`IS^oA^T7sV`w$PmU-B7W|WNS;Bs218|wHWKuj^T zS^1?0NYdf!cT!FNv9*sy6#;EGHEZ5KjzMmk1}aX~Sk*fpq$gi{O!T2;evhp$tWY)< zVQmO#wyFkHrG$!Q1#(laYau@QEF}jX-@aqlrh4xK<$F`-)cBOgtMFj8ugbZ_l4pjC zaGe#L-NZ|Wgfsg47MFXs^6Hwr%krtF&4;eoG^z27hlAdiWf!g6VZfhV(xX&QG~sAR z061MwTd1RY?4BD0C!9yY@}!pet;1?z|MphEZT|iGm(yGLAI2-$ACgVj9pNqs`~fd zWa7->903yG0d`zGEBE-?CrIK{$*HaN5kZEfhX$X2avvdnD6##-6<3|%xa3@+W_tqD zt>*?iDFQzq)+<-~JYyfib-5N+oEnbe_UbeY1cXI&kgL0mo_+va>j9futDrquPOu5Y zmiiP&gkay3T{;FQ!p1+KFX#BcV<4I4_U?{dlJ%KX;9B)&n_u{|@BXPtnE5DSOqt_m zf3ICe-0G%QCctArESP`N9S3YA^u|NW7amR%=8%Aov!CJ)QTF@Q8Rg9AV5UW&I(Sa@ 
z9n_U^#`Gw%(8cxB{up@M5!nKPpQ(~I)$l|3KaoA^SdP+Ma=g3rbFn(C_kf3U%21@i zB>e&aH^Mw_o~6aYGku8uw2$JV%xRq&3PNu=DDym({B(4YrvRhxfG^p;Nt8>&?4HoMZgDB!(hqw;K^En4{B(hc}@ZSRY;}wC|&uuZabM zHJ9GMw?~e%2*m+meCu{7lo)rIyZ`bm;5+b}7WgMEP!ppoF_^@#pCKEz>BzcdIfz>o z50eN!{+v~Q1lv;7C?PYn*Li&u2{-Bqq4U6s3xW46ZAT29UJC@I9Xx4LXA zsgi99wp;MRfN(a37GoV$^rj>UqibGzp4VD;h_|1n_}AC=WWUAB(slT3dudK3*D8yT8r@5JY+K%mIo*}k zhT{wBf9Xz~peof$O@!Wiav%`&;$;_&$+p28z}<96N}`nM3&1yPGjUX{u;BZ>ovOp7 zw$*%TPa zD&hNNt25gsk2$xw-^C@Rq?abJ`nLw`s)o67)Or#;?S_;+G6>Ifz{$rE#Z5#pZy7e_ zOL^;Ty{dxcZxL*FQLl7t-S1ECV~iQrj7QH2ybq=b9rkziLNx79@rdFdDqxdZdTLnH;szr!lta z$TJ?Jj7GNZcP{ie<7~;ZK}vnAhJ`5i5mkybN44!f^TQ!tmMk<~rQWX39Vr*rafNH% zXN?hVvcg$jg_E|pjJ&1}h(r2~!_7hhlD~F%nvj`a+=R#tI5*?eHbD)H^?!-lJpBTA z-hWmVzJmYNK&G#mxz7w6c}yTKO!X9JiSy}hXjzpOs)VJP;;Lm)5hDpWcm@g4zklNp zwQgUbdUxa!v$$g4L=)`282|rEpm(9eWTtG=nI68EE?osPk%I_?9(KmfqI5CI9zS*U zW`)3g`7z~kHurjt<hVd$92aA6n=9# zBM(mFUi_@q!(Bs8K`t2~`|1nm(*nse($7DQuT8TFk^7XyhmUC9B?tJFaT;GOz@>*utCyw_lG;)-+=-%T)e7JI`eRj3eEAh$<3g%Kf~{D?4^G6E4RK{J@T zeW8paMB0eWEFU}`Qk>i_4O@iOcny`tFfIImUXU_IuXhOPL_DqE$y4+wRA>5I3e!*{ z+uo2wc4~R^cn?ZvcJdQ@Qsd%B+Lm0G5IoLCt(D5N`@SWsc?IsRJhHU&Hf@_7YefM0 z!H-)K9t5C%4S0d!l%%j~;kbRKw!c9qDmwYP->;ELfEvKbdH$ilW~6LtAS4-p#;|`# z>g_TMjUPQr6Hk6(C2L#j_3L-=wG-2-CE2YDEyhP>Uxk&|XOTmQPmoJ^JgrH00h|!ij4v5ZIdAs&j@Av!0M6JT!=pD;}o$dDScXNaXz! 
zpxefE0*e`7iSzC2npTjCK+>u<-pyAupD9{|pIK&D-`s9I&<~SkI8j*SHva5Dt|z zpC?df_oYkEIJ;U7{xmGh*?PHcdgXgduIIrtbhk8^V%eI!wu1Q^v~GzAd5*z*F28I$ zRkVmt#*q5k@IYZdwfZtLVDxpt|3lYThef%4;lj`$0@973bf?lSjdYhtmq>R=rwB-Q zcXy*S2uL^5-JSOvz;n*;KKDNSGfaH*?X};%@?C3aNj{+84j?N)YsOA2TP`4ISHFrv ze$f12=FdS0;J~iqmVXiq9D41*49P^=_$MO(I?vFJtY!IG+P8kX<-}VTauOtt7JKO4 zKgzl-M)!v zE*lI4V|M}Bo7&-RJ8?X-D4Jp>+6loXQ9F`Cw7`h+2{wAN#65H`dBK*^GcpM7wwxgK zoFbU?TI?g~@hm;bo2{;NpZ8Uq8;tjLUhJFje|`ylf!5ZUJDR-3ieW91c^;E;RTJxc zQ&=AlAKgION^zT+AkFh<8s?SH8{t+2$g?D4C^uxt=deIY*VdG!o9YK4BHH9UY5>rR zl1y%{JGY;|=7zY~7UWV=x*Ey>t%ZT!yp5E)FW`g5ip?=afhtA;`4z?oAKo~SyR^<| zEVQfmG}Tl%&>qOE^Y#9AL0j>->LJFH3(oswSh=S%*o+RB?*KIDh0g9>3OpRuRb^Y z3bxpNW4Fp?Js#0{9cvR#kVNDbq2c)_bxKk%p1&>K;ojFUb-z_su`k)%(;+w(pE`v?EH8FcRC6tDv_up-NnfE-H|y>AUxr=$l6ltKpk2wT zOi+41fP6MS^fLI|K^)o!5%+XmVAm@hR}xY;yMniVbQGOC+r4YE@coNd*KhELs!2O^ z4vH5fL?nLK1~JVjDg44gfMk5fpL0;ETmD^&MMWp&LWONA)-X{5BE)V22G$YimGwU| zw-{^zSf8fuHKz!ctOl-d2(?-(s3xMkNf+U=EHhH*x)|Fv22M3&6N4*g3~y?MbB*6B z``i#Q@Me76?b}W#ls1qRrY^hYdJcW1t}Uy%++p8j0AyaSIp*5JjH*|ZowSi`e;0ZF z)IoC^b1`}-OU<4skzRKz#bqQm43A!?!=FCJl|Jz1=U^ty%c4@%qe--ql6h=5>-+Y5;yzd(fR>D3!|Hq0B|Nj;*muGCtUXn zuI*StZKz)HsIm6*-+Ins&qN?jrs6RWWDM7B*?$W$iv>HLQ`PK$2050e7Q91@2NRy- zgZSe#BRb@A*xS)Sj<^WTENxrYn|;BuvnyldWWb2+*)g#L154VU1)m)*NzpX5!L4y} zFUBP7%8qSV#K=VtriO}v7EobfqxiU=4BozEqidL6Kzaln4vD=0C0*9h??R5`s<}?D9m;*4Ccb5B=*wg3I^qkz5jBSfsk7(uuv>PqyXlG}q4SG} z(*psdgnU&50{D~AD2cnveXjC<UhTQX()0Nrai`oY`q`IOoH0d>;nI2_B54CBak zeVW_i%yJ!2djhnpwq75;5$r%FneIenfKYq)I4AtPmEkC#ll+d5>Tn3q=Cv!*urNAi zJ*Ig4ZQX3QJlQ?rk~>lO5oL|;Bs}c zF>?b18m@lnGRfC7hK-8k0&2$YFtw8}YIHWe{f>gZp_cZJ9kenwXa}{8L^`D9ure?8 zk;?O95arElsw7_#H&V0M6zs({IGl&f0nH%3&H5rq)#HYLxd5N^nc0_m3Puj$3VLYb z4{;z~&D#A5HY};*n{^3#ZCW8u&(6mYY~EYf-_$dzghhh#5#XMJSvQVzFCVCw&TPu= z3r^0C5shLILtp6tQXaHVl*&x>5iQkAP{a?-K-_v4(q2v{GK$UCGA?tvyU^^qAj4;$ z7|q3U7XAQa^)yKS_%El33MwdwrhM6&m9`&dJa4}THVxN4^BiKz9YHtjFjS1i`4G1ZK5&-CKpWrEhk0`UX6BWL)hLsL%>8Z zpesHkc+MRdqjo;S^sOfO;QVhn0~HbtAcOFrMG3~4!b?fwc1^M;!o%;wq~{DIRVPL} 
zWNg<7XGGAVFLSdss)k1l1IZh@&@+lkYydbz)9_L7z3emCpmr4yegEb=&({zqG8ToiTu)1m8KB*=DE=5sm04Xn3ahXDhU8l2p>@n|z@<+l^50WkJ&YmEHG z8eBOcv?2Jf9Z$qn6EP27EQMCTx+Mt!otqpxXU~+|J||&LpQ>q?MdW%#fAI`NPc$Oe zxE;MmZ@K4RJN^DQ1BiP`reytx`z`#s??Yw@$SsBQ(?-uli#bOf%N)_7xp~k|IyeRb zV^J(}nI6uKuI)=yaLeASGrDbdTLl%f-hDi+T;Bo&z^>yUfYk??9=0H{C>!W;Hi5iF zL{{1pzCLMVZHGx(SR_-<}!)UGB5p zAp=v0T;D}N>;GD?Ti{Q?Fr%5)fdrj7Gzo>emoc|z=22MYiiD>dctCkle6qy|PAC)? zu!~Rp#u~-$6Y~bAkj>66C{kxcz7EPFy$G?K+5m0weudp+kAJuxgfLhLL&hG%!(iSX z4wSN$GnGarpL)f$*Od@nYHElT=wK#2j>QYvI(lHdV2-iLuY8D)pn&M;Nil`7FH_uW z(B|qRZT$jd#hhxBz|I^z{pz2WFZY_L35_OdOPEgEyB6rMi(<6MUG@-H*8;Aq-)p2dXKf#8OrdiyyfL|HkWpzpUZ5bm>b^HBPqfdm!I-5Jg+FRTn;GD#Wq ziAGJ6K<^UX;5HFob`K>5IrLV;s_qFg`DcX%KP<^~4R9aA)nuHBD(Oy348-{IM( z@qwNE8PZlnn-%*_N%#x46c~Cph$yI^?<~?6ipIxUf_|#x?}~uy!hvS=Om?TWpHPIL z)zQGZ{XOV{95Qib(8D$2t2+(aO>xVc>~)~T+kobG$--o=tf*0+;&X-_Adx8zH<4?L z3}S3AdWT>f7$@HNK7-eP(}!u`%`edl)))4Wtwe0oY6!x#G8Y>Vi~Bl2t8D01`z9n= zYG%lc*J7oqDQ=!Vka9T5%!wa$&TfY?52OkBw_`N-# zRe!_PLfKrt7k0^*YFvi`H6*mBkcAR^Rk+GKiX{8zCKP!~;G#ZGxuiH7D8}uPrz#{q z`v!fDiIqQy#HEO3yS1{SB-`T#?tooFsHtmg$1B_s8=FH9ERN*$@(dCAX#>ZrEUXyP z-9r*2Qj4&=DgvjR$n4;hz^wO!TA_G_ldh@>%9SifLW z4BA?MD6rHb+P{o81qf0}s+WA}>Sq|x7@OIS(Rl?iqaej~?|0;4LWeSgWU9Wv|X3TlkWxnd45R*F{@smZE0DGgj)?NKg>$2 zNRSn~m~tg|#~$3@J#~2A_m(V8ECJWfJ>xJiyPSeF%}gLJCM=8eu8Vb<2DIeq5UwI1v$_qnMEK6c{W*W2(>ek*C>Fs0@Ui?` z!IeAf+kH^ry#AM4BIrOg+GaRUduR|=%Jeg68pusdXx9#MX9Ut)*`nqdoouOJ>D%@BhCQ*DR{S6@T`$@_g`OvlP&!-wWM*!uK5SRW?` z!39wmkfqiK_{ARYVxp-KUtoR)1@8>OniNkl>=rT2H{6yxf6ogi6z6Mk@AGS_??<{B zdg;SaW$*}V2zazFhSXqq4)p~t{Q^ZEhmdU}mGXP3(LL{~ob7?O<0w~jRY1Z#;${x2E^&=zWPqbSFQJ3@73?DBA;}-? 
zLBzKZ2OtPKsCI)6_U$&6l5J|o_8%f|vJHUjU%p6`hBTSu z^lT4jr^dW9flfM^sBYfRN^7|kNt)sRPNdT>4;dI7AWgUNTqJ>TJU@W888--k zxNnc;w(!a6+*5TWF~&MFp<2dC@Whl34tr>%HCWq`S~ZjuMtOrP;H`EUa#fp)P9_jm zv#){u(|IFZ-NIUUpYVgVQe~DuXtEEQQI!QKAQlT?@E>vRleQXxM>;^J)RF=lL!Yu> zk9N{8LpTx={-U)c(N+S!t5~9-W%+HFi)ift#ZLvq&5I^(e4!R3%vIEn{r$j@-F<7z z(bczO(_^gkPohrC~Y(c=m`V*FA{aOyg@W5)+Cl9d-~#@)q9 z*&Dxm;%+f*dGKoAcCtD7kQWzAbM167a33`LLN8zN(6c!1dUTnmtW(|z4~CK-67jvR zuxF_AjbJLo!@rd!o+b_Wz4v!{HO?|um1lE=G&$f`yqu9@tFQT1iPVeoW3b9M3$~ib zkL5f;7*C&juUn4f!&}J(!T15GF$;+}ZU<|Y7>0648)QCeRzULWlHH!f!I)_+#XoynO=kw$Wf3&W8ndnDA@?T-{#CO*r)-!a~LR@`?2Z{~Zq zBYC#QTav#X(}!6lV6?IsHxjyN(&}P&bthC%&XYho%2E6^Gs(qY@Z5NsHbyZuXC>*x zVvHV-eHD$A`0Pp{I=&)btNhQQo&pGx`xe8Xf@1<{EQ<}BZOdlJfaY6=9^VBm6B(__ zq)&e7v4h)O<(wVFS`^Q_(*$ky+iKI#--D$MKkq9Jmne5DokA~tEOE)Arl(36nW(Ix z-Sr#O*$mIe94zWFTDc{QWyWRLziW~1Q`@dBtoX zo|)CO7|2e#ex7Qi@}^`fyW`{jqQLt)4af?49w^s8Ho2lZH6;iu)$aJ}!*jvz<=Q0wzK2+%P^X&m0$1bEIOp7qW!W9p8^v$eWaShw zvX~~h<-#xat;Zf5X?G^rXqTY=Y(DlK`}F zGnMakb$_JN@jxrHCRg>q*UjY4xt_1bW@$q}(LCC|&F=8`j_4oEw=`)Q8@lixH^hcr zXFm;ahO(RdSf0-@oFn=na)ySP*|*4*qKz< z(5!(Cew9ib_T7cx8mBIeSB|zOj34$$W@~kL%Q!UmW2ajYqPuEZ{8+L+A@g)54y&^N zZ3uM&^+mF;9H;iJ>H+UtK1KddkrP(Yw`y3BD*>Br7V;qiCCn#G=u_g@yWVQ!)DrZBaYZ%`( z!bR0=PP`x%!hW+B!iF)sUN#E4P_JPnE}v@X&|{=Q6666K0Txr8>)Q`2lWOT7}`+*Nl<$|aF-EDeOK&_ znSK|VY$IPBU-W9X4TVw^FNvDyN6?C2;i_-P*wvNmb+OayC`-7LfPl9_ri+v9^4nORbI#m=N$x|X(AYBEnVJ%{ zYhMxkf)0tvJuOh;`_0B-!$_g+;R9UP1Ld|;5CLA6vd-y>-P8#KO-FoOn-%{`(dq}q zPa(6G3U>R}ZkFK&{OlL#r{iV{S#luE8z^=hTyo**V*jg)q zXX9Xf6-%B)T|h&(6l^Uhb(`YLhE92KY1*byGU*@!3tX|u!4WSomDRAssHDI%9 zDtY$2`Mr|6O5T}UkVZCN<4xtR+*u6w`g~K_V5Gp8xmffjQM?#JvwO-qYM$H5*-Q64 z+y43j7w?kW{P2(D=j52TmL2C~0n)rm47><3j27o;5N+!lpSzp~%hqI0lp}E+^bX>h ze%B*5WiDo#rL@;+905uhUupF^dgAAgX1-A^llmFs$UkhD?%(Z>fctObDcxZ|ZufR=Ct^oXikeJ}t*( zF&Z_?JYw0n5(57ekFNjb(J}Dq1fzO>aUWT~h@@5aILyv@S=q?&0rtiekiZd#yM#qK zA>3Z{3AeA0EsXwGgQprm3-TK6=PEoJy6;r+zTm@xgQ=TA)Pg}d1n9UBVF=m3qnFCOazl*_&OQ@EFU<1JI?d^`q&R;*Q|^yv$4F4X&`-v> 
zbIm0v%=Zsm6oc?rir39;{ee1*H{rMdcnV$ZJEU_yO=LZIl-dd-(j&oE0k797WKF^=!%HdDGZ)xgd#5|@r~Y`qh2Fg_`} zw5{-CHW8@-*J-)3`1g%^Hn+&EinLzEAbu=b3KkOimGEhSNm2lX#R-9#vg*!k|6(6< z1Q&G&%mB!@@HDf-J|Zw{_kc@|sXK<&326}AX#qUqDL6Pqm$`Ow zm;G;689|LD8{0vIDUyo)Auc}fGK$??sH~DA+!(bPi>hSkFTcPEnTHNezBp@mpOxpX zCrk32%B%{+$DYbZ)v?r~<1iK&{yj--o&z1RX?NPN+D>Zz?5ZEKROU9)q)2OFcTQXGPuRGq5Lo5j7nHh5|J(-Ez1+{7 ze}iUbjr^#%`C|dJxBFahKDyGWb}-+nuhC(8S(i79k|TMG>c&cMHbXoU`^m5f1Eme}55skbt3KdQR&H zvcnM5u3V~Pyzzkg6a;ER&PSjBMA~Y09Cn&PIfIuJ#)N~#^)l2sLqu`-1T&Fo2Gm^FDIsh;ySF&1TM)*rE}r13VGBkxBly@ zgT;OIT=Q^3&Ex58t*^~F=vtF7J)VmuiWwQsItU%C=DJ{X^UsSnahtl<)e3jsM!;)p zy&S|EEIT9I=Zrjb9r<<3WG-|Q-*l2|$y)7!ls5Yz>gtk2VuJn_0$2*O^&YQ|V! ziw*^E>ccZ+y2+{tI=zMk^u*${pf9-1WuAFN-R-A>egJNWnnz9i_$ zTonqj?6k+wja7hTMIbza2zK13yzbSbhb#bi$e61{t~sseNMWbhVbLUqB@2ltB>~C4CEeTCD=qEQY8mPzz#^9^nk70;{?%bE zX>=mJP*1z@2=yXMNv{mujm0KYu7fLBndf(3_B~Fuzr7PrL{HMkfcZ>*l?pQ=li${b zAEP+GHSJL-9!5}>HT=YYyip_BcH6L}B=Xg*hG2n&4kfOdCE@+14NUq7vr`*&{)W9? zKQ^H`dU4pfYd$nZHW(J@*oKi6#3%V7V33w$m$4lqFTz0S^tT|p+nD3I{QmLwFa+Ct zyrQ-9OEVaSUxFxrX%YVf3|b7@Xm|DCH2)^A8eiW;wA$&Rvl3!=o0rhluu<5jyfeeJ z!zcGrc7{!hVz*6&a%H`0hWl8&50DC8YoBRPPHPFGR^DFjh;4xynaTQm+K>c#%HA2l z_65Kq3SzV1Fdi*9so*>5jB#qJ({~$dt@!=pMK!??o6VY#+dE|6F-ybGIzAh9de#~H zJPF7@QwR<4EL3?%ekCDp6h$v7N)A$gbP?q1@-1SU(&n(>h~;#V%5sZ#Beo>*GbwrV z{9qqJqdlC5O@B@)#~e90`05H;=FQBKkgVo)0D`$1+z(`7fAb z|26ag7UB0=;G6i4!BJ59ES#te7@+8Mt)fkVSnJLf42;ehyZovw(k$%i1HwGQ@!C*Q zwi()shSPf-?{adLKC+JTSzDZCE{YC`+w}8V>57^=@35q+{$7zTIwvr*jKX_<$G_+K zXE(mYkW@UXk*#BtF^NWyOins0#9AKhK&ScoRg>0oIx7$LL7BI{+lRvR7m~A6FW!Bk zYsV*4;@dSCAugU+>s?hF;$U>U82-)MATya)Y1nRGg1$%?T3n!cXBz+N1@5^LX~1ZI zc6tf*Nsg7tNwas!x^AchM@~XDz7)P>Oo2h>WnY<0p-K_L%xwXL|97pDfVaM@;~A&w zCBsglcg8hN`a@#qcW%{2Of!`MUmWJF+O@5tTJMFB?W}fU$^DC(xlbLD*Zq>eZ8TN9 z8@({NetK!d+I^B2F`3Bq%d=Vf<+?9j4VixxH!Zy* zkSuCJ7MY^TZdl7*(?f_Dubz8H%|;&B+i*0cm?hdv*(X&WONo9Oe+XyIvfK>sz2N7C zv@+&Ifyu1JLEFco!?D|@k)Lv-F@}eK8{7PGPHEF4#O^x~jX}*Pehmp*S8ZDTu%=!9 
zS*B3&stIJM6wOAdDva{$Jm(8lr#_LMtnIP>)y`+_26FR>^XT=?KsDm`qfN!{b%v&!p?$4MqdZLt>Y1OtR;t2Gu~!Ol!12F14%$l8c%h3;%`zesV1R}4gN1li7{DYbux3{aiez);+J<(g5j z?3pPY{K!2-4w%U{ZEPO*SY)?2$A-XSOD#^Ks(}>kwW;4Ps_fjr_$DYuc7RnrZYVHf>k)!lusS!Lvx8_bY{ z2!!^&T~@!d4EwVr<6J3%WZS=7fTPz83C>nN)7gq*Q6*$Bdz5dK$rg^!8Htn5F0M<1 zhXY{yog`t`t`Aj*gjFPMtW|pof)9W95jB-lTY-{|%HD+y#lBa|Lls(q0o49^I3~+g z({T;_h*>hP7EGVN9~a-v(|>;J?dC`)_siJN4OC}l-C`qZ8!xGi)+_S%ePP2`02c~bpI6-W-TeIl z!8XgX=F;W}diAv&B(HXftNtq5aK+K>=5|JL#D-l-81o2_nE$9c8q9XAstW@qgw zO(jd~IsG8-hgqjNDEHRC&HhjfM7#Z$Hhl2i6hFqzXSd@I6Hm$|E`{(cf|K>_go{dw z7D@)|YRH(Dd$Q!*!$eO5Ua0I2UQxcq6?#;6IN?C7(wWg^^Q2A(Fk@YM7Ar!&-85-p zHh1H0Uny{K)JPOR2J&MFxd7moG4JlmS+wMk3Gi3{WIrZ_!y&A!Dal9Cs_|l zL)lSo?CMi*6cq*kaR`?%|z&2(+BB5?QTU>A;k{`MO`?eDAl9 z#Aj{;K{+v8IvCh%70OdoVpYZXr;T6uzH;e0Ho!)ncAAr)biC2i@;EQaSTkF%1sts( zZl{BAD@iX$#ycwp;4qz69pjO~**w87IfDZPvM+~!Mejc^047MAISnt-yz_PA8?E$P zB%dljPsosxcH;C%!CPC?WPDv6c#Ux!{5F2O z1E*sKD3hAD0K(w_3J8_Pv@~fCJ$At1QW>T#cXuf}E|nN4?Foi6%p<;5=pGzQ=U+&0 z_^nfPwq5z%am1bvS+{>}+BEqUp203u%#Cx4$zfV3viYy38Joi8jL~uy@+FK)xTfOK z;{uDuQ2BSv%i7uAgQ1xCu0;J@%FGhUc^LYJvo2(;jnAFK?0yS3yfQ!Mgq)W%nRE1K zDv95m9NT%RhoVjY^<2e9rfyO$Z7UAL=<*qQDeI8zxNRqUhFKF{MGWJv^r;Y?;!}w}eKa`nXO}|p1)mFhmq_;+@_;&!ByE$vRajSb z6{3C1XI+noB@_Y6=s}RM0J~8{-6>tRJL93w&_{TWW1$J^{$`q8FEZo5LUL6U+j;L2 zYV+X}ld)MK3Q{fySp3?!Flka!Xk4ZYkds!nI?H0% zQlka4x!A8z%q7b%8c6Q2ggyy1^AU0$=8dQKvbtH!=F}R@|LBvQ(KegUPO4dEY5NjQ z-V!#Bbs3nDF@V!{(q^>V%-}VwXNi*1xVE#EssKU(eG<~uvZv2g^+ai)K7j5sS@HluZ9z**jgOgIk1ym_`# z77Uaz1#FJ+sqYF)7^ABrVvm2dxMeR)spSrZsN1oaRGo&&09}ghWnJgCKQ&Q6oihbV zON1-zEOj;{S6QF8bCgQ^P@V=MU>__HC9-5Nl(ghI~d zyIBav2AdOwoBJp?*Yfn;E$0Q{+TxUdTX+*hYq0C02#DZSYcnE2Y2SKs#Gnu&H@$aP znxk-93;VNW%Nw(jh0PF(eV?V*m^bn4XwB1Ry^arPj^zXjrwN=B@myICAyy9+>e0&C zX|n!6QXBBB3k zX?`y`ugs(di8SmO$SEK>MEe0;!7x?=g8izkXgw`D8r|u=eMkb0E>JMyGrZP)jEe^T zd-@NBThsfkNq50f(sFF0K*{8IgkOts~VE35Ywl$LV zo2uISs$H)ZVGVSGzgsP?rf*=8l+6qjw`E4AMo`smp6-J#n*{e&7SFsR>)}u>R>>7Qjm z1v)ZeN!8GDi6~-}^cfW;q+5PEbAn=78u_4u5-%rmP>?#s%HFukacY_!rr!1I&7Gz2|HmoMzjPm$Uvy 
z{aEyENxNbTU9$cz{_8HqAn}i%DFQe6>v(7$0@QRh^1%MZ*QB3UW4Ev4&%S5PiB7OH zmpLc6<9H*HvF~6)-umXca;8)%hvhU6`};sga_ThBOmByN*z7QiIWNp(^I(jw$k3l; zcI8onVL*y+g?L4}$E0d+^y3B{f*tbco!so7jEg?OY9pm8yHFVfWOpl9(F}|Jt}?Ul zYHh4D-+9z8Z>Pakz2!fvhbslik7o*kl3yrF>gSa-b=o30ZSfWl$3bNT=7SPqWy*g7V}+O>`bDmaKSE%ZaZKlIP^v}jmzO*F{mGEvfV6Z->6f_ zg~)fOC}y3v-lEj%lj0q<-+W0}&XF`B)>n63cEk^9AIBTqZX4*3y^=S#{W0!nY945F zJt}72AIchrp&ww?t*%Wo%C??9v)-AhpN5!h06~cO6XtgsDp{_ql0Wo;uq~TJXUdUE zj(6TSboc(gLRT+_tZj%!#-$~V)LM4(!@IeAFW#j~eLtAcFX0yMJ98i&sMZJ&_fTHe zeGTTZqtgNQ4G(BvKHAc* zsxh-Uu(wQCHn}bG>5VHl=d6iqrQH6H)$I3Ur7;!)Us@)tLLJICx_=uQ9t@y5p^*5s zwsff6Ustj2kCvhUB^;JEZF{bB_rWsn9z||@bg{|3&ye(!P@<<493;&F5m;+VT zoPD15>oR7nAp~D9mog(&<7&9FTMIjoj;l{rcwRb2BtLY79S4GtXntVE-xM;wI%) zn&5po)Pa}kg!tc(EE#J9KOiAtxdocFDnUXnfEQ8ZY@n(li3CJ= zI=R$*fYF~2gqgf)8Hw8SS|VawX^yqJQOdIQ0p(DAoy9eS;8;x>#m3_qD9l1rBY>Ia zZ*6p3eR*)ZE^q=*`CM5*)Ar+1FJIbK0M12M^&b-tvr|8^5p8RN#d{9VF9~hMtrO38 z;xFe7*}^fHXPX{6jvhAoK0xA;vaZ@7&2J`(t6(kPp|_7*QsInkG;({LE0(lF~ER?i8&$xtv4 z1O>xP;(PvykuWN0sHbrJ1RXvHWAErj+=_4Sv}wtAHri8k$d>kPN6R{ndj9za2!bXg zEv+J+jV)4#ALF6+my&;SnXeyDFxiWeWI|y!F7a@@Ptk!C%q?wP;A4hx=$V&rr&~0< zWpU}5a|w+B`;Gwrz$EJgIP=KZ14GKa$R}7-&+Hm zMaaEdKoQ0d5n4fuy@Qw18%6p()D*kYtRZ}{neQeO!c3E-CwedL#C7MM)A|DM$bOdQGRUW>YnvQlA0|MY>e6fR`9Rf8=rz4%NuNQM1EWY+2!{8cR2cEn*q(>tXnWHb-^IHYk{j|JY z$DC^;j&yvl`x~4CL6K*(ggOa8OmOB`YGEb3b5J&LVV8~{upxW6xl;euG)c?2YBpZ7 z*m(85i}ZEXeKNwQLp=tL5lmHlPM>0gd~BeMe%`=a<#zt8?aRrLR^DY|zDg6_AegT< zMC(;|w#dXJCgW8sI@)UD9QxkiA?N&82#>MBHJ~_3+=?-29$YX;#8KH)+0dWav)dvE zau8jkUu$z)d0-Ead_ArqwJP@AbN6%8@Q$Rqsj_#Ap%{${JD{DV=fzVNx&^&+_Byk1 z*AZ8~u(69-LbAd|%(#wmjlUfyhR9=8%8r=PvqY2%;a}i)-9snr3jB^0B*uXO2QdjT ztdqNc-}rS`jg86o6Xt{1jjQljI>&jb!L zAqrfhJ#f6o5S>~zB3n>6lGRMb1dzo{r}q>fvcu!lf%N9Wl|9sVLKcG$fm|~iLqA+Q z^dOQc$Rh7PwuzOd!gq+~a+5BVfIdch0rNK!ll>|3d5fQN8DQ-yP2$*I1$mkB*$#90 z_3BLRcI-^UlB3YAj_=M#u>V(iZ&?QgUY&w$YyY!mBGNvH z{*u1EsYWeqQS$#rhhX;w$YWH1kJ;r`nf+ghw8H>wF_v|Tpj8lZ1a{%B>>P1&0NQ`R zGl2Tw#N3L+Re`fJtQqvh;*au#2Wz)jRGlZTP~{iTAd_`({~!H%ibVjuP)5^b_7m4m 
z0NbPKvIM!AzDiNG&db;4l}P~8g6`hf9N7N7kH0$uh+|q4Fg6zMkZO-e02UVw^9>t? zAkBRzd^P=y$};Zm;Ao_JMB6uEd|Q4V-mC1 zfrsLBZM-Ns`hjrOs)(hVF|XKsHOuvQd*ohI>&X29vcWvhNwh6>nzwqk%QtmovfXSL zz?+ht-U_CW3MJ_4C@xxz?dS*(0@R?Ak~MNS@TM1#h)P-sP-2tz0OhD7r4+crM z90A}OnLrRz*OdHuuHS;JVhh*Oas(C)zn3`4*Z$fPrM7bjDU_KoU_|kTYzqx-SjkiM z?LUH{WMl)&k*%1;;R#f+8qR~kw6u*6#bFesW^SoSN@n%HT#KHT>@(eAN&z{H`!?}?g%c$286e=u4ry(C4Rw6$cw^(4++;HXGWT2Da`o8K_uPnT zTNM2t&*14u0*oSUN;IgyB91los*5R1M5l@VrMQ@tzrp-$b{|FGNw4nIn60Y62JSiC z%d?5{bxQ3WgQ5kc$jGw4s|KX~2V}!Q@Mb!o>zC)2By=ZZYMrafX7I)~m0!xO2O0vG zLo$EM`?W;CbPeEg!#QUW{i5Nkay=t=CA#b9f2K3&TEDCgEu^bG~F%b!7# zN~a3?Qf)J^I8=k1%^cKH%#kjULGCr0Hvrv8`yr_}(Dmk@&Efaz1tBp*U0eB=JtMW6 z_I(EkzNUw!as2nSvOAgNfXqyPJrgQ&4Ko4xBlgR=a|N^gFI^Vk8DKA9;zEVJo7 zP6(UMun}T_K!2+D_AfaI9Ep{E!2WJ1CEbv7i)fekK;K&VFujl`LVu$?e&Onon5<@sF*;}j7i7)tr zw9&z`XU~&!scC3dT!3IeC#0Gv<1g%i>$F3Ev)_wB+(igR(pFj2hv~W!&YBb^KW9SQ zrNLGR>Uyr_vD)4uq89#GbLICk1baxhTbhxe0kl}R`PPuI&BH#WQ(6dso7j*QwiE*t z+H&4ohyaP}u=fiW&q(fJyWQGeZ9+9Xzv%@NB$N_@5C7iq{|fv+&!W?SnE$%=jzvOGjW!4_ri|nk)Qi0A4M!cO`|}B z1?^-yhR0MjGb`WBN-F?;BI%iLRKt6h9(O@$F26F2CdKT@2+?uKlBmxL$#NBq6N|L;l$PyrONjMhE+ z@4(w!;9ohzwNO*<7y5~MCYp-XbbxJwJ4PdX! 
zF?Ldn$H7Hr%cqQ(z1pY=$lAS!q3N6tbnUf~CpMXMqWym6-^+%Ws7-}pwNoq*_pdwPg$mq;70f47pVFF;dBW0B=T2!?k2K~5#PW}I^cNg%ERwC27F1TqK0z( zygzd^L3F;7If=s|0Df&2L>)MfY2|hG4$sfp#(iP0VTj_cc7z%7zY^Jc1>6m%05kI6 zCzmmR3;+d3moklR7>mNjrr#As@j&uP!4N;}e{dj-^|#u_bA;dmiv{_UEU~4iwu$w6 z*vz?FIKbd2d~KQ#5%jvLaaog-8oV=XDDuXDfU%=}OHFq}G zzr``^be57U&LiUb=9|}bUq`;KQ`$m3qXc~?xTi?*>Q+IRx7Dzz=MJ;=7VH4THiC|K z8=ljq)mj$9jiBTZ|K$P*z?ar3NFqH1joOfVU>A(v$xGrK69#zz?6mRySJAU-fYoI) z=%1buFC%z>DPMtklq6af0OpKc6^pJKY5(&X#wr-aZ%s7c3$`wiKTopH>h({35;vZr!F-$E=*EPG-_qK0JdVuV~#l zhBf?|(?L|i8tta8Vp5{y#xpZLtp{*ARvLfeHg6m*En`{5MMDYCILp!|KkSDf%!{+d zRgja%<)F_+VN4Y-Mm$jX2X>J>4JdcCKpnJy)=-E}7G$?ue>KZl^XM@=$bCo@46BUb z4*92>RS+SGv&PO}d+aD!rqQ2qvBam08XzcwO~cG|y7oa23gwYEFe-8IzCxW6^~0156FQ zRyyh|PuWzQ!XHj=p%&qqX8qCIvJX^_8;glrXQgF041~8E(LBEPbJ*LDiQ`X@6%)6} z7z}x+m#OQj@BEEEKm#E25+@(Z@vD2Qf;bfoP#X4M&AhGoH!P$aG6b)*0;%)!KPz4A zZieIHu$Uhz_!Zhk9sSyq^#nsZmzMGTTeQtj26tt}c{PdxfKJ#B z0yrU)#CqG61S3!A@1y|E|Ka7KIbvlbzgUJg(7hH$dCjzo*#&vWgZTlDr;C&70lUmd zuPn|%Fx0$1MZ5;F3CzT~IA49Qt#}>A_uVP>j~j4t$%_VSE^W1kHV~3{Uy8IXe0pLS0- z=K)=|Qo$ZdX+~l1L`Wo$WwyQ@t}NC)p27!5XAEg47PPqVYy)i!a88ZjPof$ln=QEC zIzIV0|G1?EaiBJ+8lsunvc43e83x|WLwSGhX3Ne%{2>Z=mhJfND(RUqiEV&01d;hc z>XW+y{alou1(~hejrie2sL5XS`vU51!{T~Ay{%eGO zuKTk%ccXQ`(q2qQwYD;MbgZrke45h#!`N4bRk?LtlhQ5SUD6`mC|yc-N`rKlfHX)+ zhje%6mXrqRMrzaD2z)ns^qlj&?{~e=Kli@(b;(|Pt~tjXW6ZV4I4T}K;<22o=sp2X zLMisSF0vZQ>nZEsdz#n;c(lH774=_=!B7Z2Gu@H42mj;yMF3r*ZgybO@2N1qCU`22 zMy|pwF37H8Iqs9Ll2Q-S5X&gSvJ36sgJzF8Y(gZ{mMqV!b$?dfSzqH~$yo7D?L(`loDC?ATFDYw>xzP8m1M7v@ja;To6>E(lre{xq4 zktnlZf{{v83kf6u#KM8tHY{+vcTY5S^PCq5Eq`s1ekcqbz#<#fvR32 zf33+d^w+>=dXcj7U)|sbY{7aim!{7IKW&*dX$A6>CWk6L(}O4=bAptgO3wJ>BCQvVwjv3xpKE2HL>LJ#!VZQ#VZu9c zfEXX3VhPQfj`|+i(U#kuS)Q+cNs;&w>WwnMKyEcWe&@q#FyUNWZ{pElFC?C1Rhl)8eKUSo)EOhI6bM?7@otM! 
znzNL_&gZ98_8mfOMY1TtIaY(LLd+_@T(+HR{i&r}>ZVKh6_UOCOL(5aF)62TS8WsW z@`_XM`Hi+7VQVJFS#NXS4<)=mpkG zWT}mWJk{sbAKc3Y9w;uasiCoHwYvGj-5y%9_#C0sJ4jm8$<$N|>{`L<4!f;zo7uvA z2;x_QZti^JHIAUiJncNEr&w&>+zGL|u_Ksix%qr)a2c2>JAeEh!8GU>Bo#8Bep;z& zO2Ug0K-wNhc+e+8=l%6E=@hL1Jisr<_XD^LwV%uq_+mrM9%zpPWgw2i-V zkZ|?mwt41OCiD9Z9oWtDCfeIJeuL;Jmdz{x;$dqZUS;#IKKP$e2>fQh;gY^I zlJa-~ErpBV$yZ8&Hw57+<(gz}$Dn6S`1!)?Pa(`9;MCHL-zyxcDChzbwEg=8>n;0` zr7z>+8~O1#$3%?ZmW|~+-rH)_w`zgzk8W+r;)wy?pFd@$H&kDk!{Ux5v-B%iLA`e+)Jlf565-Mv z!#YeL*ycUcTJjSMO6I6157w?`4dcRlunZLtEl?nbS4J5S6$tPIWsYfAJ&o5Ui#}!D z)MuyeOj2Sa(L(e;%Hvk=T^`vBbb?{us$T-4TUEAE=MqwVAjYtK2oR-&+On#*e;^N4;8*uMc9P=12XBt93ycMAM3Z0_U!{=Kq(ngA21R!lTf zOvQTH;g^j<>0F11gY?d9-n_8w%RrvbsI!wA9k+S2Dg99^A~_HjyjgoZ2F=q5N5K7b zhzm;ZN?t12vweVyeGOm}8=FW0STYh5pQWi*K082QR5H|}risQ^C&Rpz2+V+PXW*U+ zDSged`}0@zyFq}7Z4s|wdHw1o;GuEOnZZF~-(M*7^@adFf%n#8K+&mx95FE&Kwf&t z2=*cl<(jr+tbK6|aN4>M4;tmXMI)3vO8X<_Og~@%xu@vlJGr}gDDgf31lugY4DXx;79zpG}ZFy-U! z{%^t(XiB8@?wf1{4Vqc7S9l>LCI#GT0tg}9Bv*tW;w2_PYeh}75HGGj%CSYD&`aMI zd~M6(7JkV>u=b?Mu=jaV_07q|+Sn(nD7s%3o=WZ7Uk-33XQN*>7Hq=@z*XrJ6%)qx zG%+0N7lV>@FtjV(-IM$Y^Zdu#uB405HHe3Fd&}b08__A=V*FaWp+e?N_T~FYNQPXE zwtnv--cIyAq^sK^0uHi!U37#4kvYVd$P^x77Pv4T#Ze8RHgMsSY-2(~oC|Iv08~Vy z(LM%#CTEO-JcPgCyfd`yFHpm_bN>k*@IIYlbHJ)2>%U67)Ktb}q2pdeQ;l``-H#3N zpoA`Fy|-K?Xm^irSe<)PI#y?Hs>k~=7}tmt`sknl&9&fYcK4_2t*$%qS)kfA2=&)w zo!qB=2hu-V-xq5Cu?4zlep~YIA)ep4;ZHE*-~uH&OfT~V!^mL6;fIw0EcFw;rwv1j zjRT5a@0!{-M$lMdij zNOudEPWGWvwaG6AfFmogsBw+xSn**8buaxDJ!y89_Jp-I( z+a5ImH=P%E19fv$VSn>{PZO1)0Jd^GRyrf)e+rU+(glz?^j`zbpY3biDTL(-7&)s3 z1=x00D^&zs7Fk*rO@IVgIK0#rGZ#ne)MUhQ`6cm3%VH1!T>Gh7pGn+?R#Z zgAJFWN2^tpp(E}ZyJEjGh!W^=R;W8A`=FGG@O8-Tw*9%4%TePi|J>Pdzs!v%*nF$k77@U z{%FQg%EkM?)Ch{laSDp3oeYTLi5040D=++hs86JsK$=U0_S0kM{g<>!gamY^&Gr*W=S; zZ!`v;4vHqTyaNW)9h&M$iCAzl);hVBGlLM{C5ocLo{d^0OC{GNJ@N@nQkD)Hh&AED zS(n2kN|YyOw0_>KhdGQbdNqSnAGe74<&?eWc(M0ihpZ5lFNZ#F_0%o*s48!0>^1X~(&V8I4 
zj;8TVs=I~v^&i-Hy)};{&wKLJua9YDQSRkq<5#HVkhLd}s2l52atEM<*RtxT`l(wx}P6gzIjw5LlbLK~j&yoQ-Qr?fjhvsJrwPWV_o(~*wa{2P8#~umI1@G3nA?sET zOBWB3@CR#TTO8oxzcyewS1V27~9>IN=)Ed?*J zX0iI|l`fVHp5?YR!&MbQr9x@5#rGTO+N8&mOcr$RFWIk)z&T4g33um((F-lB8;An9 zp!O&ZFf_H#*76jK)>Hm1eM}OD5ZMEdctNg7w{b?`%%(C2`MfZ?j}?pau-rU`+;zBo z%_LghKd_Y6@c+4efg`B^LSZRu@Skt~9c;ft5EKWLp{2M7%mO5UqBz7{31Em_gj<<2 z*SxuZY*83SOkCrk@&vA+g>}^qo~wA)e5@73r?v(LD(Od6n5S^p0-Wv^+arILdq=1* zRFy-OnQlCD5O=E?n71uH@9*Uwg9W4`Xy5Ft^e-cW|4xJe>2ptMlWm-o@HfE!UBv&fo__R? zIvl;n_HNU$2LALO`VV5z8GJ}smc5N&N)xl_GqD}2k2Q-a*sdy#j)*+%D>-E$RjKC9 z3Mx%gY;mYsYiImKKa2J(%qt&~&Fn)NpDmv4X^AM?2+3b6E&HU^)Jts}#Nj|c^ zx?g=4WJi`ys79|BbX2i@+RTjcykHYJ=!N&$z|(&Krl$oOK=t7qKNkNR>i=V5|NbXy z9ngaZb-(-} zxX^ckJ5rg>HyPfb_xIZQO+BjdmO5yv{~-Q<@ArSCQ4keC5-m|R1T)pZa>U+Jh>w&i ze)9qvwt_(EboSf;&YkNb+H#<+!iS~W+Y~Ej#1_lA`h|Yv$fE7^sNVzVN9O6_h-oQf zR!_Y&5X_ooI6tRMXM0vuXFDHE4;R+f>59)DQg@PIVuTw4WYxe>bPSg%UzxU(!!zmn z5t$*CybBuMXOZiq`O>{tQpET=A?%C&^eMKQu!)*YN0yixEqpLVn6|M-H|zFJfF&z} zR9^FNlneV8c<)Kl-`ye5X0ky}a9F_^sd1EKfy@kU2dl&Rwk z5I)cqM7}qRZV?E6-P%xFjp*>2qAgY4vKCGQO%>D9xkbC0Q!3_O@ulwuQ`+B5B{Br< zzVX_PakUB+)wH!1D^;z1EvNFD$>?=!JX`$v_fr<_@y|PbPy`Z_epU>A;F`!6 z`pv160-tjyc@u+C^|SiAegq+wKP-kaMruNizHRpzAJ$1NJo_;4+(8@Db$IZt!)R;v z<+L85>u}xBfr-(*Y#VC?z8Zw1==)@B3U95#>bq+gqgTpXaSJifVnu=$b5H%+wNK_) z#=1=xRyUB-^TT$99$v4QxZt~v5JbjT5Ww%Tfy*k3wP^5=VmHznpL}H(hlhtGCxZ#_ zMMsAc&HN4{UNHSGCr0s~?+X9#FJ|Ttf)#Bo+o1{g9_qBex8jW18dilWmREj_sNN26 z#~Je*;H{2IqMCRHMG}uxCAZ1MCJG7=<^^X+2589{svtw=$MJ(r^NuOW%a%>7O_W(+ z7SvBt6(Rz3bA#4ltm;{DGt1T~gRHtV5lf+6X6$;~1se4;&c#$!5D_5BMzLM;? 
zN(UNfAv%xY?TA8VjbE$vdtHRBfmKr@R2RG-Q?zdyqbHQO5L=p-rQ5coOY^!;KohKG$9SkEP^* z4^}M4SEkN?49EXmy1xVU=aUK*EcDMys$SQGf^HZ)oJ8iGm~&+h5)g1NsdFaRdJ@X` zj3cDJG<1JO=GVGNbxB;0#v`ww#{9Y-n$K~3p69S@VywGo4>f)jjh--!ZHi&%V$33I z`b1}T_myZlaL)nM#lm~&FvPxB_%GYfKB%{(h%8oI-Jdwm=2!YItqR8vU*n>as|Ki< z7tXP0PZ_kpFBR}1U_yz45`Fh?SfFNR%^<$#WsdVq(mB^WNjsJ-7VwrUX*R8;uI&vS zc>6=II^V4MCFx2@+3;0=k#)L`;~uY$Hpj4nP%YPob&F>_ zb9w7yOu%CI@1@Mde~dlkqL?@T+S$kl>G8k40RMXM=fzCXzsrsM123*t^&4>NxqpUUB*&}-u@22u~d@vyOCUgkL zFTHirHAm(fJG#ku^|;c%&?dBAXgS(ZZhQ8eAL_4&HKpVID6E?H&Sqxb`SB}{CI{nn z@3*Lakm+q ziMBFb&Y|`WA_urf!KGI}ygRF_ovzXG;qmRmAX0@*uTDeAHU~2gE4z1^C{@BND4nFP z{rt)^msk~+y)8Rr9i63$?jG&y$ju6oA zx?DkH)9+HmWlYLWu{zRj6=5jx*Xm03jvVJ%6QDdmO-AgDlHhQrxQbwn`Wh+cR zO(Vy{9*BD!*WvjOhTe5rKOjIZmKON!mt4i*a4qgAk>#lk+8%^lX0VWx9p#4S+&LQc z3c0mV_;BVxQhr1jihk8%mc7_C50)y_hF zSpQzfAu?cH-YUkY{yR|rwRpc@fK=s#1KhRFH*BxgQ3Bi*VA88?JCS5WnIPxS^`bPs znm@y-atLe{J~LJpdE%{jkvd+BA_WR2A9Z%VY%5f@cmcl#QpbW)k#B8#cfHYDLc%5& znf2-$&jS>NXJ?KHl+81@;e1&`iPy3;hk8BX*GdApqwKZ4eC)4B`C>LHTCL=T1<*>A zcaCAS)5%vBmYgP+Ti(g_7Qw7NK}++J=}%8qwej>PfoaNJH&G~SmPbKgxA}fII3v(0!nG;Jcsy~5Qp|d;#`$9H%?vtLNQFlRFiOAZUlWu6yO-Z{O zMH@r54W!F`qDcJHth{}Z<=g#6SU4PFXhlLlEzwP(1ebTbmKZv{lOw4*sl)W%9bNI! 
zB|Eygfy4@g+8;1544dysA&H(&Bl|VUEDdH8e9yCLz88R4Do^noePi>blvc&b&c4kX z%KExHNQ>xxm!2h+Z{VV+aN@INC&+GaJmTII3 z;m^|%_`y1Imcp14n4`~%UO4kkFbPtPcElt^JU#kKO`rku?fRB4xO5qHuZaHrz-)ca z+@8*bbna_gI98N^KJruPxBPR`r9L~yk)CeGV}+%Jd|)4>0G5>Sw>^9I38xORGtp0t zz;_wG#~}sjJ@O?sw+rthYbguCA|4tb)y<9WH!^%fd*(Ih23;cpc_Q=YfPF5@8`YDr zl*GqkyGblATR|*x5y#QC+Qvj0VSo~~N${xZVd|qHLQIZB;I+8pESXpvw(IaUO(D%l zYk4X4ByX5lF?3h6eQk|zG(vuEJ)G!r3c?2;0+9pybC2^VqtnPk-_MvLf>IR z;WwR*w)aY&44UeWJl@Y7xR$Qou}yaQ))-cAxsT$s`n|s8vX^^7z{qZA{Z-?7U1Q+g z>r4;43OQGp;@*faPC;~Vy%;{m3#)d>_bS@i(utGJWjDhg;O8M{Yz&x%b$Gc;jm-0N z6O8r7y8Etg5ON&4HN+RJ6SqD5z^Mj!S}y9m)2%5lr}&c0d9m2Vm8my{{I8;NhcCdp z%w!1nQu>V72k|n@f1<>S2dFCb8`?5SA^%13W)%f0y={PYFjEc+^yl^Ujk=JTN0 zv@_{0d#?`1s)grt-$W#8RXgN>uRY!b_d?VltA_b?_NSk|YxO$gb4){8i7U{&+3>m}5#6&#ZtUx+8t$u>+084)oWNnp;RGyHSDSZA(FMVn3`1IiC@vP@J zsCKA1gyQ^15KH4Z*wpLrA)BB0s-;;QRY(Z6zi2AJO5WKJ<@t?@g}-dwEuMZZ{UZkE zhQ#53pt;GZ!+b^1@8ZDD2;UVaKu>jAm4zTuH+GPb!TAtYA>7SmLuMD*0r*>nfvXZF ztkvh3JY$4~!c!qm3q{m!<;2)TT4x%|!-Akjcsz}Swmt1$uq|Uwllql!>)?79PTe}6 z@W?Es7pbzqtzE$od6T#f@4*~*5?D0fuijsVeianWTKnYk_Rvf@8^~WS9|nEKHi}D% zZPa#a$2TJRg>FAh)GTsXPg)$_eCt3>8!#F&c{K-eY?A#8w%t-Z`x)$;U+Z1*QAVE>|HkaDQrra5jASmd{t~2`cKS z|049rU_Lyirw=g~pY#TV3oc);@Wj@MV>WLzxu{DNDli@+LXRUP4uiTN9duw7(Lgd^ z&bZJYx{H);b?OQzr6ESql?icLqC&LEq(H>fBRs=7>NI-XNlD%~*aL%l&s=KttYa9J z2HziiIO@9gcmTV>vY=Xi952sQ08%V)>cIq2Vh1^(rL!m4|){?KVIB1hM9i?h{dP zUye}_-_j!l&vEKC3$^@?F1O@-_W)U;C*G*y2e#r8FQsD6UhsVOamQyg8(*|Sb~vLm zpd2cZUvkV8F5x~v#DJc&CjPhyf}HbSl*G9sI2sjy$mXH zA;C3Y$BIIH>jfd4Ze-yBG@puMqrWeuMWb>dI^3D|ck!j-+I6CBqZ@HsrnM0_v1hlO z5fvU4g(WIb3o!|IUp@}LX-#OEvlqn3fYVP2;GAXvPwL<9sHl&A?!w64R3xMd2k_gs zobS>yliuH}A|=0*7T zMYx*bQvhKpq2G0o$-UuX-Enc%5eM}myW`_k#f1pz9ayy&f2h>j_{en8>~g_WNdJb0 zwBNXJh!54Er%ilkS!*jaCZW}~xMhaj{CDfEMF)woWBnOdHSfP^B9TdfPot3DGF4Xj zDcV}E@XVu5F3_emyYiKw(Y7?aT?lJ9_=qRbILFsN$T+V~jb z-jd`G6&g^nkLdKQ42>`3C|d4zVFfjzwLk+^hXxt0=d95rLN8mK_As1qM3#7?Z0}CW zlCC~YP0wRHWiTR1aT-oBV>EF~4?FxkM21XXg zq|F;#lIej?XRi%@ERVw=dhhv@f%ykjTP9Ay{Tp1&BPX4*=pE4laSQP?(I}U7_C+EB 
zJKzb>HP<^ z=|)|fXJ0{w{pk+IorHLP2;WPXnZo9Z@0fF1465CUnrNJCa%g`vSeFW+t>b%MRpTXhW1ul{lK32 zTlb{BU>ZyMtHJ51NRKqoE4tmNr7T?)dGtw*lNwgqIS^%dvyS%Rr*f&{<* z9f^M%G4@w0Vy%N@xB3^+G?~`-0&AuH4S{5W)l1sYJuF^E5qGslw5oi7$LhtpN!6ku zDkzujKu`lyYde>|U%$8!0xT(E22d_MyD!&2KC z>U|8j06bB^Z#AvfVOmS(Q?Zr=a#-S|psmYn?$%9WN4E}=Um0SU;tTT5pTw0-{es6` zvd4oqt>sV?r2zhchVeb}>oQ(P?uI;AMvOacO1ZfW%72@`Am zkX%1-flG37WVn{A&{tkM@hf@5U1FFeBT%&)wr_%nV#mD6;8rDQpvd^n%?&)r7bLGg z95;Ns9EHV84JX`9Onczy$l>i`UGItkNRASkRT->eY${U%>c+peaGWywCrVX+w__e}Kd<{SUAi3cu9pCy@SJqVR=kEe z0iXl*YvQj>9U0tRitSWRWju6&9=Y~pYRvBW2~<5D2J=vA?`_Zx>1zpDV{*O%2~>z^?HU`+icj>_1Hyo@iv<`DW$Z zc6i&^wC@AO%{vB4)Q38ta~rmc=r@hYXKKGXdU{$9A;|rzMUQ-xMeK;vWF*8>f}>E) za!~UAKn1F|w!Ah~oSYOQECVwC9{ntiq#Nj`YEOT5fuGvO)|ZjsxpUWfgvK}HApUF^ z{jNvaD!5(zTjqZOR#@jAadM4LW%Ocsh+_G-*R^^LCE2O2G!}Np{nT*&9hVs6w|Ji7 z#9=EL@cgFBT>=G=NHhH{byq}7E6M28%7ygjz9Laofo}$o(r9<(@|u|mwU<+E6DOp= zcZcYd-@Xs79(^^@srELFVPv35Vp2(d@YO`e z_h3Cn*%-bU*;-BZ(q)`m$bl^SR+h_HEj?rAc*bW?7#MRN`ey0%2nA7V$@q1I5+x_h zxt_#^yR+~1yA2!|it-k(jXF7EVypUhxWc6217aX9>GV_1aZO4Fd8Vb*SeU|NARL(MyeZPo2B?sdpmww$_H?cn9 zM#K-wD$F_lB(mzt2adz4v@Xe~D;GvuQ$mpX7SQe`+d9;1I8jDN*bvU;*+IEoSFVKv z8m}c#?-Vz_KKCB7M)s5>O?@URkWH(s>cro?rA}Q{@Q~;w!yv$UhXp05>Qhy3>`J#q zKfAo;f%*l4)>V-l@vD7p0~Yk{B{@bQ<)C;1c>R|lKs$V`^t%ewR%<^WLWyzVvMx~Dk-m>Ha>ZAdZ6a16p6yP^0`4i@>KY$jI>&LV(@Bhv2 zlEKum0!;aQ2lE!If&?x6t|iO*#Y(8N-n>Ny9+d-na1`Vko)!8grz~FYBi1Of2eAs~ zt%tz{4Awt4fz&p}^)yM;yp4CqQJ-t3dLP8MFcSg^Vlgc=Fmi`13d;>7v(OFOvCmNL^=rI-m6RbVHz8e&wWC zcKjxvh_!{7N^0oZST-I?$|E4 zZ19K@3Hq4~s<;z0sL38N6;@se4?jn_2WAlc#yWw+nNYQIfUc*W0}BG2AW7a8WlE?h zRwQZNJ3cTA3_0mIs35ra4gh{JR;}Jo&2N_$<9>J@Dlq94kC!@|o9H-Y;FfJG+I*1{cy;4azJIgB zicC~MOc9gj59jd@?d&%_`3DER0iu_K zn%a&AMmE45ro>oyU$~tpU<1A~^_k`ua$Ryn7br7@C&W*V(rGv?BMgr6fgBd`Y7N)1 zK1WZ+i(^=7m`))=BoyCQ;S)c&-9jqb4k^Q6^pdidU69a`XAI1RU!c=slIx6wGd#+x zin2$&c0+2k0*8>QYaxQ+52V7kEYeNr6V7+?5M^yi-*I=&IquQC%3bMYTic2Yxu5Uf z?88uDwq9IjYR`su*LR&Iy;f6YLbv%E?TnN`^$~NMb8Dr!S4}meqRHgFJ$5+OP6@zC z%|F2=K6y^4Eg^{@2qyqPp>tAoNPNac`P( 
z)4@m18)I4g`YYlm!(+WPVPvE2aV>lMeXq8&Z?%&O6eea^l?=`}LZay1_WOzR>arES z8oNxBeUmvOgj>l4InYc_t!qh~Bh(cY>iHiXJlQBw4?ttRa(Wx;q$yhkDemu1Soi$5 z-b!&O)bTP`p7tbZYO5!|2~D;z>sxl;Z7YD(791(}Z_v;)Ux?#JW{ZY<7BYIMd~-Sa zZsuH(44@j9m5Z<5i9SyqYg0lmB>HQX_OkVUxMBHTqP=lYU@eIyyo<~c@73S2$@KR? z!yv`~Ftsh-LKcawG1)4O#FuUZqfqPjDvWcJg>%rzI_A(O4OBeUc-D;bP$iGM(sm-P ztb37433u_W-aPsx*e-Z}z_Lf@^~e*L?C;If&Ue$eWC zyuY3PXvJ7XYyJ6$9Hz)T> z5(Q zUJix%4oIgv94HQZ{Grwi-J2*UEz&f$QQ{?aXa!YQOP4XnhYaFvGkjIbOBdeG6KfSu z&4Z?zeJfC4+9zi3efx-9v_@Npn&PKeAa63IJek-+Uc6wX8)dn1ZNLaSLFpF>A(2Tx zHHe-0V)*T6i`jJT+7=0q3s6THYVu#qzK=f}<;NOhRxzK}B=+Xo9~s8JpvYRDBwHAj ze{+YjBmP;Ozw?Y_3Ww-BWRs}?n^((3Jm)OcXL1hf6up^60DG^+9y|F>dvEq6hHLRB ze|cE^_ym)Rd?ix4;eMH5|5Iva;D-P}RyAC><%H~=Zkm-ABPgMidW3_Zkr?=<9XedD z8Yp!Mn597pf@qB@e^TiGh=f^u zYm)GoVtTB*mr%{^qpY@)#Qr$`mIE*1g5YiV^=sbu-RIdu>c z*0~&e;nQbaOn{TTFTAi^fa@yXwv>1K(%m99pb)`<3HGz(AR{Efh%e?6KK#h-62@aI z>{8w((ZiJ5n`ujkLiK|rE}3|PVc~GKXdx`SI~9Q_GTXfh%ruK_pn|YPi+8>lkomEM z@{`N3@(76OezX-X*#`L=ouY+51|r7caKEGVX#Aoq`eF?V@{^=QcQlGoE?KTXv!^L< z*UeLWMqG=V_KO-840g^D@|Y)wwkkiR#t!SW=nUK(d5Cj2emaKkdFIvWrQO(BZ2GNc zZW&Gqp|5}H)=CPD)QdTuhLWf@L{C65p#@TPH#ydYie$G8C_{T=xryq`QiOt_>%E^l zP@9sdUbvX$y*B8+6{R`4Xjm>=XgO+@&@-ex@LR8+6>-;!7ad3!Sc;CeDS++`&pXte zXA;D|wBbHVns~6esJ~ez9OSx{a%`8Z_!}xyJwj!w#N6ONZ;}0fVMhh$BhxYk6d|E$ zjAs*CNGs$}h=V_>GmCXX!VJ;)4t;RKcOC71#=m<4kRGt7rPn_j(X597XAQe%!yk0Cn4j6N!Niuf|_zm2#yvl<6}%b%J31AFWK} zwQ>V+Og$mUst4-4LD@qV3_($^6|P+u++V9q%hG^JM*`*@?fNwskLIE#dpQLSi*7;k zD#{b0;%RR>G^>infg%{k5qu%peD6gQCKfN*Av(Ez%=<)(u2F+{{`>)E#kXKpoL{H$ zsd-8myd)U$G-RtvvzNU z-lAnA^|Q2Z9eY~Zx;Mm8;9dm2+tZ&aUXIs1AQPeA>$BT*o;xv5bnm1N>^Ma(k}Z`K zfe(#!Scdq4yE)Gt4`=cYyCa%N>YA_6$&^PvG^{oIfjzzTg?rB1zk)Ie875{kZcuNL zuzui_>6)Jis++cw6Q(=gi6OYs-EbANrN|=B2HYUcmOY$uEp6P_MmAltRo0JR#2$Fq ze-Yf}8XA98z~iI~533&%KnONQh_%PPC9e~zb3QE*tPrTm%+`mS2Pm?9;gX}K;x%wH zy+GADZ5omd-sZ?H+e2yMuLP&ietToWBiXwy2WCrt%+hMh zWVEkzf`-~$CYopav+V5zOUiZ;ef>z|1Ze%B?OL%-TK>fI=%W+Ll1i@czoES0At3oltyN=#Q>@OUg z?RfJ$+#u2n$a35PJg&}a+k782VfeMc8{ShE;1Z?_^d4KffxDS{BhMi= 
zVx$)rM)$T`%JNlrJ)jjo8|{=@!iS*i&DY~NYJla(Vt}kw(Xk0MQaf{%XsJ>XHUo4K z!+F{)=Rnn{6!k|yUOU)5d`$w+(^-j4bJNgwm_AJZ<_;Br@5CRRmXXIRu1tA5@I~@Z z1;&2>`%j`f7%W!h5;$aGqi*#e#5O%r)7`XINAp>?=iK(t#nlt3{uK`-Z6nz)o0Xl> zuhv?0usLSVmtu?t_pXryx&fl>Yxru9fzw7EQl2(6b}&FkMG__aNCk}F~=6nJqTF;}*H{b4QF((&%O zNau0NbZ{GW(4Ww+@B;+Mb@~5>1ql%L2TWJxUxe#V96QM;-q`4n15QREW{*hChNX{U z;5|-t3@E4i@a}fz$NK(_W^QwtBfV&Tm~sWm|D}u9Hhv-?GtYE4Fbogh*0+-D@(UKx zj`Mpp8sa8G=OLj1Aeyg>n+t)~skC@<=Qcra_HzIX0$N`9ZU)roM$dbJP1`3r`+5{oytcP|o9*46#vJb_RN0eDz+(vq&rMN;R#ntYg=K2}wPrjOuAX$hN|(uu6I0=Q z`j9;90r5-m7j={lS6^BU+1RAfCp}b9&V~gKI?aj~L-Y~OKn(qC(!&qpH=mSJuTsjS z?MEfYC9&swSj@zPIvSOz402RBiwX*%@`Inp-q(M^<5O$E5FeAp(-BD+{|MVHl{jbf zJpmyQhH!it@Yk|QS#)>LoqpO2ttj$_mdh}KeKO4HB~+!;&UN6{^Go)O9X6F-5@Il3 zFy%g=t}5b?0UEd$A@z@5qo=gaUi?R?cXIaD;eXMRb8L@|n1<2i20)llalGUenOu7n zYlj48+nS$C=R#9F9FYmBU_dTJ40EYVoP*fw&_5^S3h^V3MRW!>N&5bcG*RPn<^n=H z8gH5ymTqiQ2MR2;KkrVf+XEO0jx82bBuUWp1?s*zvCJ*4+hMU~6{Y^W9;TKIkxTJ< z(QeQHFJ-36Db9;v@7l6eXe{>c-%c)OS&SgNtwUMONWBitN(*IHiJah&wNpd~o{mQd zPpHqIAHt2=ph98$+EA^kb$nFC=uibE!ywW^=V;5gY`fH^C9d-cPjcPqM(#`=>2O3# zDojc@-v`2Y3KTIv+IeZ^6AAq5Z9BvDpk005TUM~0;X~=Xet9L?kLV#*xAZo!h9!7u zXeIlF9WFCWI!CcBGtf`ccVQjZZmj~^gpWsv;H&R0<-^<9cZAD1r`@w%o-!!Yly8I2 z(?KcMl12{FFaBV7eY!{FPDdw$^5?C;Vf}yZEd62%prw%38e#WzA~%p2)z3wV+(^NyA5`XOy(ZCxRufqP`>ziXPGH|W?cl3N&ftw-C1 zwVS+?P!^7Xoossf1K2|QsSix(Z6zsdG~h$P?~s1f!AabYU2tIpsg)FepO5bt-`8>a zwCHdP@CQXOnl;}qLI|QS$LSVK9}cU9Ywl@is4X&&{lKk=*VXue1nFVstKyD7t_OHi zu`24ct{8-IySs}#uvDEMsIr=Vb!o&2Tt`(?ufVjix?DYpO^7z(hqoTO0tbeR%Tp-= zbV0#tC6rFgFBrW2=Y^enH8-P;hlfHZL+DdfxF`-~b{UM8E>5yK^~NVTue^%16yxpg zqF9eInsjeQ)QY~}6h1nB8@;}cDUj{oaG@V;ix{QwsyYecj01mA6f;kkUpwMbyej&~kKtB>HC*ZDkK z!3=+4MdnCIJ(sq@QRGK7{$7rac`((P#m*OzfGgWP{XE!>GOHX(ilR@$u-m0;TjpU$ zl_qJky|TB<8zlz_g_VUKYh7oyn-urjqqE+n<_BX$rp@{xx$Rxx3k>we3F32R&K_u@NZ(zNB);}jWnz!koeIC1e*v=w?w z8-KEX*MvOK1xstkmhOT}r27*;?buscsfQ<2x_@X&GgQDGz!_9*j1BdFx4?CVk0g62 zi#JxD4Ou`zu0^2VGS*m{eiQA{+aCsWFw8)^ndXO<@>~{_8Uv>LuGWdlr^k&YyUZ!+ 
z>b=5K88*6?`XIT`@hf^ja^12ilQIBi~YXG0fQ;T(srxGXV zL&3W9un=BA&{^2{nLOUxVLpP->6)N+_FSUnII+%5xHnfz_TUm5Xnim8q*S3G1T~8_ z{0x+))I-a)`^MlmOrjAw6Q*{q>s4hM?HqlDhI@~8$U7Rr@_S8W)$$>xJyXB9wTQlB z#aim~5KS166jA+OI4|dOG!EbYU6xP(nV71V*`JIp}riK2XH!{bV5PL0|= z5l>SXId=JsLl+(E?B}e93f()b>487A_Lp)X1Xo$Gu$?L};qYAhs8vHtDr2^Z$3*)f zl4~hQFemKI!AB;uq{%hnLhm=>QTKrd{RT`IlAle$#>}rSO(A6(9n7ONljPDp^d48C08A5;^!S93b*!?hZcMNG|;b%4Fz?A;&h&e}O!u;k3(o|1Ny4eFvhPcf}iD!T&lc z|7VDHyR`3Dd}Gncb~A?QWncgz#vFW?7wK0FJ0 zE&WQ580=-o{JNBIN=d{}pn#0@%jJbLU4TWI#JP%D`gdNmoG4!cY~Gw~zuSuf&uylD z#8g0-WVq6^H)s1wVi?>2{?UIiFTykw6loF{>?uw&Q6TQUKQ7|xdO3tD<&&{g!wnAC zE7Ueu75qZwogL;RROp#D&}Pr)Ct2%UyV}`H&TO;MB7ezCKYAy`D_{6g$ns#yUL?6; zX^|3IZroE|MYRFfmtP6UGIRAJ9UqFhz|`Nh$(ZNoDll~Kk<7|YKLH(8Yp~d;uoFIT zaZMn)I{(dByV?l>Kya{sYG9$OcU4Djlg?9y8~?DS)&m=dC?>4(lgt$R0-x_xiIYt` zhHFPEv%W2oa;-0tCP>9jn>4FSQ)>H&a>U5>XBCiT88aHvx{j&nqDOxZwv$duR)ZR} zc4TYKs5oBdZCv}fDE5w6^Fln>!2t9r?Rb{)cIuJz6X|Knf{Z}to!>TiM6SIzV_7GZ z`-x6u7LA*m7_h{z+kX#nq0ijM(91G^z4c1$m;rsh&BY~MZyt0J5j@=Fn^scf&n4uK zyCZwPYliN5WBOI{&i8=nf;QSm=Kr9ZR4u@jt=lfv^?!0@$M3tcJ4z>-Ko}8pZjRXt zMDuW?5p=#GP2F{gT*JnnKU}>wm4h!_VBbSea|<8(r&r_ghS$e-D`YHP$db zno};4DT7xh@ELAvd+=7Um#czlvc7tfbI`^iRF`J^e7zeNrwE$t6Z^2`MTXsuz+X-B zw~ViU>9=oYFcrvxj&SlI-=8#`gBxKT6S*g=y)Gx%$Z<*X-hEU=ZzXMjtLL2I`P9DG zpaTV@sM289h8mgjl4qASVK#UfI{ClXdXnYATUC( zf+gFERZ2pmWSbwb!b^6nr)CnVy4hz%hfcip8I%an`-@tyST7O+-4Gs=mk;6HMoVw1 zJEwlf)<>{idIUrY}dF;z7AZmJ>T8}(qaRDnyL6j$>9F@0#Mp{C*n!Y z@FEnbV^aX9{$4S8OG^06w-%}Fi>|f5?_8Q6I>H44kC2)jiMwwE)SGkHP3ePGKiUEo)EYijeq#|oyrqRFXNd!R>utcM}yJA$rsEi;I6UF$cmGbZr+fphyy1| zotr*l2?!k*zN=LE9L&VOEu|K(`zua!sPunHJL^?v2onZpeDh%l2{b9*$#`_6QPzH2 zcT@{?W}5$7Mfb3KNwIP5Ed41XJ^rb7DD26oChTC3K+M>_?Ht%|Wy~D(DmL%#&ddCL zzUX2VVdPIP#Oa|IG#Y4XOClf2z19(xgjafk}wX@Ou zb;+J_J$JqgX>zNqoK55g&kR~qmHxXw&c>U0yL2-J8{O0(;UASoA zUeO1ytUG)AF_uaRi3|iHd~{GD`7#_bf*}@)0OIYh~{6zd+yUubD)bddg3$gylJEX@yi7J-NC1}e*V|Ls7S@}q@C-mXXG=h zGcwtVU^5N`MKsR$ReIm{g|?S@nPGp7V%EbmkIxqvdR7kqxc*6!r%Oj$XNTFSQvU|q 
z5sx$YsS~;r>N#FRe4jejVN7C9*Uwei4z5)#A32i$Zg@d~nEM0plk7dhuZo>#|Ao5D1M&O?LSE*PE@=Url~rxN#HYG6OCwYl^ZM&SkIA!#LPaYi4*CPL zT74eep+CT$NB^-8FIU$6o1br=(zfyo`zVzwDhmj=&)W2~52MDw#(knx>jyY!%DtI8 zkH%|#9sHc7iHUyEPHdZ`%^lD-9o4pMMlY`@e?L~^N2^oG-tjq5AYvcCKb`PO3tKA* z*fTykhuBRgkOO1~2=uqx``UCAMb5>pvU~W0Q6>Mgl`u9%#A%j@N2kL_5j_K&c$io7 zm9$Ik3|%&fYhVS=d+nnYdm*@ciUn!6Wj6Q{-Tn>EZ^Hp3Vk&v(g5>`Jwj6(X5O}u; zR9)x<*NOErRK`eW7^y4quS2S1ma9bLK0su*5ASHKw|w^m46459K)fL3Mxu-K6P)T+ zy0EAIqmuYn-It?34*Bd_79AD4g_&Okh-PRe`Pog`Pc%upd$>bWTE9T$UJYw3$ABVz zPbTz#kI)S&2fX|=V74vIvE4cP0TOr4aU%{c`8%DKf?kXs`})hkvN8apI{o_HA;^9- zKwW4%92jWM^}SlrO4ibqYS_yqVLc`#UZ^C0Q)Oy4*+}=i%qxdmp1Z5OUHqsx#Ca?y zHZv%wZk18|bazxhv+SxmIjD2~WMle$5$HV8b<%|?!{Mj3_3;|8TZ8KyzF^thJpE-f z$_}6a<@Cgm7WZ@60q_b_aHQI`A0)5o z=lDzTm^m546@Xy}9oXX=(66SIG3f&1u2p-lb)>%LKIc;VSqJt;|LlCkgAVCwQ^lpP ztBZO4oex}?P2H}TDrLN@j0^1_uAglC+Sj!X>HIEYN2)xj^ zQq9IBZwcP(fE^F;@m6t%zE($yQ+p>2$QBCO-;+-mRZ^cs z15cdrc4E=(k#_02gi#@_eoLo|lW^hYp72KVyK`sa2B}suga<^4iHcgekn%Zl-ye&Q z*EGPI?>nN9umAW^apBDW$vD1!1KgQ!kHXmhbLakBXg&BpS(iHd{Ymm;CFLio&t8gy z4RsYruHIO~3zt44MmfFM9IrR{(dObRP7&!+IJnpZ;pReW#tQAK%=@y$C{&&w0fe+y zZ=@6I#WueF6W=>?P)s|sCR`#;fgp?c3VW1~=xBf2t2sQ?;$R3F;_ACgZvOe*$96Bc z*6NIG{X@^c;!?u#*7=_&E)FuWa2*)&K04GBozD3_>QHBWd+vo@Lc#Ls_rZ>fZKQ)J z@%c#0JZ>Lq>5jtSC0W!;u>o zirNe=(sKTarMW@2ze=i!ZH-7~9&Ta*y8$s9h!uFUCy7hxD-f-3zm`Afs{ac#^frXz zNV)DZNk;)EQY&{``fM9|;TM1@kW!nb4}z}z>iR%@UzEl=G-biR<90IFq7S4R;;WB*GFnVR|`{ck#m9KFh%?|II`sw>a zE+atyl%S?>J@sjP=4OHw-#z=~oNYBr$}_;04>^#0X82TlJ>)|DOO=ZO{Xc34duC1z zCGRvlnY(n^kn~)q)B)R4xRk5tHF49#L^zX8J znJokmdD(X2iT{4-|5u6s{>{ts*ZWQX<0)nJp<8UEVbaB$>P7)II^*KKLO*Avz_k*t zF{Cj+I;YGjo9T=^3+sjybqRrQkt`M?DWN);%9o{4QlIwGR4K=lE6c!C=;4T0wLqwb z8!3_@78Vo9sot*Mzz4V^&L1G4s|Yi@`}o=Q>*gN?p5>V3^Ll5cgUBSnzI7I@4ybtj z-`_mwxFxVr1w`k2<&XUsz4&*yNUZ0|P_w~}{@pczUiFHO05j|0uY&{Mt~8YgW*)c^ z#*)0@UO#`Uu7cv3a2pC(p}Go`b;10}h8h~6_a1f%9cxcqQ$4lc_sm3F+sz@nIn$m!EgW$&b&7C0Vh#!&pl%sxMK8%nBx4v#b1 zwVkdV)R?g$MiGuKr+7)h-}gpa>ixXxC^IXy2`>_@bUb4a8a?)pXs|+w^kKf4{9%cs 
z`Q%RxrtI*y%t?>SN5VeE_SCnhT?iR`I-Tr%r$KCIJr6{Vk;8wzL$h)nL?%e$pg!_` zz4)h1$jeskR}Dh*g%d5k#@_D%z4qc1^K19ep&|#P7n8||rEM3V*!xhw18=_jILg4F zz7@f8IcUNy@XAYKK=^(~6*+KBT~&yD?i{uP!S(&U87P^-1dLnESBlKk0?TYqMI>@6 z#HX#zrw%L=2G=zx*N$Z z0z5IhY3SL{tN(7(-+%9`?7yp$y8r#se|MIA=h&aj%;(?m7C5IIU%&coqYP}ik)8he z$D#7K(FROu){a(QS0XIlrzsvxN>mc@&aN|ge9k-gxdyCq+EoBOq=0fSv-uY3cKme^ z{ZwDVPl5gucgPoQh$piS4Cvc(HETSTbI_xqzvg=`I}36$MbBFT>o!nsX0Ay-jYH*$ z=pkmE_XyLE@8V_W2E_yHPwfcIgr3MdA`2r^gz2;-=C`kSnC{!0 zW4x`=piff2-Nt+-Y|yk<6pmmAEzpZ4C-%yU^&1UhDI0QzQ;SVmhwXkIrO}mHx;&{* zflP#TMX#3afd-w zwD}|B6t*)Yo(x+8<^lBEBdvb(6P^j-PfLasa-oLsP8{#mAcgLi^7abNxI{Q2^8xvF zdT+8N#XHPOJEr)rqzC<6mG=J0`GB#d+_p2|Ilhg_DuniX6q98?EuyhYr(ZSc1YuzK zbW_>z2Xc;jh0$Uu_!aIT-S~RTv5TTEBYn^@;-4JZSM=|U34l2vEM}6r_Jd>$#p_p) zAC9lz4BX@Htrzf+@H!NTT~EYX&iHpduAppMm?BGSzYfApQ)e#!iv39BDU8uM;H zw@vS}r?VU%zM{6&7{Wy4&%efN-FUN??Pt(zIDL}~?F7+> zS0{i>s8K7fq^@acJL3Q@`THfdQ=@d;#@Lxxi*w3wUa9+i(5AUiz9V8Y= zxkxzzhWu~K5HctCHUAcW7w__Ya;9;goY69qp5CtPh&;g0s^qj61CriqiSNy0`044G zX^M|r{<#Z;>MGw>rJerE5;B5)`I;30-${AC-d`h%%LT=!y??e+yiij6-12?0=E+^S z>fuolj~qY{xzgM;{~N;;{eT!%6Irk5p;XQ%H=)ksGIdt=t6qM-OIp1#^Vl1n>^FlR z%hwNC^J1)SRE?X($(N55V?CBdU(nuPK#bLV9Z^V&-Oq2uh%jfCZa5`(C-raf3%2lr z*NQVL|Hg}^Ty88!bZ0F24-aVK)s{jgb#8>pU1NJW0bTYlM(GZ_fOvaJ{vIwZQ|GVj z27}$##bjXZ9)>si2Xx+u_m|0)+N`|TpdH|akDrO z+UMq|d`@NlJ;f*EpgqFvO{I8=JP{$RRFulMSfJbJF?lPj2MVll&c;-e*- zN!Cbz+LkpC4zj){#UDd}a}z(#gV3t?p$kJ#({I^!XdXs^>Qhk1y2SrI58cWH8u>+( z@&9ShhB;`zkZ0v?hZlMZqD#R8&%@1!3u^FFw1W&Ci>JRpx^&NECfgs)8@{3QuY;Hw z#=tmWaI78hP99np*LtsHW>zkWvR0_c>q+eAbUL>x?&Z~Ih0FMh)*7u|?r#IBs(5RJ zKjv4Gb5`udDS(~`C$lU(TJLpb9l_5{z5hEh)vgu|;5Fo+S5062wZh8oGRLn=0;-Qn^19eOS7XUgJ9}L2XhwhWgXl4zQ!Ll7am5JX^3D~1i_OpY zZ1pj;{VL)3@tci=GPK8!mJ3pXUH0d?{t&37qxd3urv%2E9>4~;@<&2j#7x_|JUL)a zh8Lt!KL<>xFB+W2{7GqfL%9cA{phL(sueVd3PCBKU2Qjz^}^zvgSX3ql6_l=J08%E?!Ai^D*Ze*2ERE%0r z-tu|MbNtuR;r4^wptw7GLo{y9fM;n-Vo=S7u#rJn|GLuLw7F7RC3;-1+i!Nj^7nhIis6QTpPh4Y(D{bcJ^RGEJ7-9o(l z1K_4?-7pZ9dt8kDwD`o$usiFeZENBdu)XO&I>vvw+Ra7w4?zY7 
z{-j2hjdXCC;@L+MceXmmnd~Uh0+?mQ4m1}pVZSelPsATP@Z+>XhJ1xX}0ZW`@Zy;W}9~~%a4{yvp6VQL%@`HjWQj)3t zo^1D}9I#u7h=&cVz)CpzG2R5EHC9ncVFOpr$084*4DP;ai_p7%=&AEB5X|MA;GC&-hrsGkm(%GN$0Xp~w{hg$CO9iYZ_ z8icRboV|E{_{zz9vGq1l1jjeN@%KJ7%L0$nL*C*B*`|~m`&lw<)V5h!!6TsAA4oh= znN#+1$XT}wlbRM;eZ3x_^!G`@0Z-j_m(Z;X8F!MTC7sLTxd?mZdFmDq(np%u%_R8r zs#b1)22@3<-5IWqQZwY>T7lcG_}kar{*hLrNk%nKVwUA5I(9>2OA-tv=DY+uVkWi2 z-wi{tVn!r6h1gj*K)jsuR|T*li=~xasbDOJ|*Vf z*&A#msp(cgdF;P8;(kx!+@%z=wuzgc{XAy`+Zjh+n3DVmraq3qS~N*THRviFm_jD4 zW+*G_^+o6fi=ye|F3tj}p}~6j>Z|5v$(Ch3GJl}Ba6ay!Ei3+z4t^7wZvEY4`;n?X zTIAN?yKg~Yvf8yKK2 z_h4kS-`8kc_6u63Q$n)r9(Cfk5vJFetVZ7TIL$B}U;FS#`PnKWSlDVAB}Wg&eme%1 zOlT_GHg~{V%r<#*ud8-^hqP54Pl+>|7-nFz=NyQ;nztw_kW)%T_tRwUcWI^^rB#TA z6S(eep6_Op3l=InWO{leqFqNToh#6-9+APDb4ce1BaIHf(ToQ6bHxPt$-d9MBM4ZK zFTD5Eh*)udiD4Lo8>?Nkg}t#p`_q09$e9k92#oKrOAs8%?9By0Ia@KvsVi zucX;g&lSWaHC9(aFjtNOYnk@N$TJJ<$!m1*pXkL|v6i5tF-(G=RNgc1epv(+%7quF z_K1mDGY)>|m+P;zh=rmuM|w~5ybmsx{z@3wVxtH5IKH{2?&}VG-O2K)Bya-REt@iV zg?r@8AEw*fze@M4{hUM?r+IdeiF(wO+*|Gj0T`n+{*_M*H{pBWAI3S(I#-rFihS1p zm#+cGmS0QikGCT0AblNwCc^uRDlai>{Yh}M41Hu~=mqAip8rJr&P4QM}47A;z+@j7n?rq%Wn_axCp$8%qcx2_C>7z_{p{| z<74cRhfYRb|9I+CMds^+hfeh714}ZU9AACPhLMUXC7GlOe|oOuwh}P}wSz_rR}zf~ z3#Ok-S_KXrIetdq_S36Jj;z0(;$pw^;=sYbFa3`n%0v#{fv@|fA|a9vuY%&IiDrr} zkbvFNC5s<)NeK0=Vuzln2RoHxZhUxrvpkI*n!lk%yH)EjC*rMuE-!XHQ?z+>&963y zs%&cIs|^JglKRGYq#9=G)58$8c8%nivG&T}rW_nmET^$$$L^t#>??h zq-BER44T1cSW04_tWF*s53%g_3M`Zf3atQ_`hSI}qP>di-!&Qee`pn6`yKT4ai6Aw zb=cQ7^7DZUYlzx9%*3EG6|X43+pM7RZ5%|O!No@V>$@iEoyUw!0=O) zyL+!ng99c`T?;a{?0#iMA0;CTUTl<$#OsZu?QyZiE!?2pR7eI_d{h4k4QRGf!4}ae zhSC|!gRc6#X`Xwi1XeiBcSpxar^Y{QuO&3J`hxjcW-bI!d+A&qCafy0C;JTXF?(f}Hkb$_vx6AL zwRYPE8V9yAllG?Za_@QT%X=>NP)OMFjp=2YVC2Ryc4J~tm}ezEVP(}@abs2s2OZR- zMJGg2T4~+!(R=105(p>tbnjliIU!qF4R%YlJwG?KAlFLO{#driFXSj&& zy}XySwK%p^+o=A09e35?TYCx- zLn}LGF}nH`Sxmg%A%9=KB^lSxv@5mEZQ}UEwH-yI9anBr5ABWQP{UC=sDr zcd36`pA!+BvdNfx?2KtnL_flMX-v_1-p5K>$90C&^igb7(ydz>yW4KL zCb*x7lDM0bD{nSlzB9MldENhVgRdmBgc=kYYup3@hp^xI?U$#T0-Mf@L#9c{KYlHv 
z$AjG)_0x)zY&73kNk?faJBifw?z(6NnkUWfeHkvYtV1VTSxIuFOizOf{WZI(iZ>uF zPKc0UJ478?p`G=5NN8(E%=D>al;$(jg|(|KcrmuZ{G#;FMia?`dOf1egWFTMN63jh zYtpJ~EFu$Y5F7-pdEAFcBE7%}-%zg2-+G_U0H_QuVjU)bWisj!l zo9#_6bt-(8bq>@Yt(DFR;!9@r4-{*ZmOS4faR{>a*(VH(iye1u46{dq3g{aWA)GF<VRJETQ}hYZqaaVB`aT$q58!shzFsYN99- ziZH(}dqs2mO+&{TH`9u})sH+UPxwYvB90MpUS|NRZItKVD_f8?=H61f38($eWH~CN zO%Fg1WUV&_UZ+!C=LibBv{I~j%y@B61{VCMUB38EI(3{>-rR8h1kgPHN2mPr%1f|> zKoPW`H6RgcI(Ls4xfxJu5Il1&sjh6czk(WcU)XRgEh^d?f+!h`h59zJ1p*(!r#YyF zRyWjQ=MyWE&#sDg^=j9p)hN1{Cc-_cCz@oVth21A41D8K-O&Zzy_j-sO?~_g37AL7 z%8X}7V7!N5^k+ky1+APJ#otTuSXx}u-gJV`sL{C7vLHIKz7N!Qp$<5KouqQS4{CA4 zbqC22m?f^LkIgEv7M&io=|$9Ctc~D+WR=N96DPbIkK#b|=7g248m!!Tx$%m5DFaW2 zLDVMI^2!*#B5d_#a7Bo1(hXvi*22OR&~ws+`yncZO-*MP;Qor6>!`4r&=%2C+#UR% z#=UIv)^e&*sGP4nUhBAy1%COu);W9l78T$4An%PYbqu3RKo2p+4gK12!v;<u{Q#(OpX2MX(ra+`{zQ>`=5;;gY zlDh(yxUw7>&~`R_1Nbn~jk>?6d!je0X2(VH(P5sNpB|k|<}p`;(gtlqqITQHItpks zvJPrYVfCwDzsAGHBu%z^%5~bN#blqh%6=IaL>TcG(2FREKBSGtTUkUqmo7?P#NleW zgcBhN*KXz${A#`L4v2p1F_PR{ zbh8^ugTULx_;&?J;Gdo~}|rb}GN=?(w^TVZ5_wnG@8;o>6nLE|p z>#r1j;=kOuYi>842!$*^;zUV+`?4i0j_L0<8<#6dXfg&mmy15G5o38nW)%}#7McUpB|98?{a#|X9~ksaqtn$7~Ik6fVX^lnqniUYn~X z1BtagPAT3m+dWo&gRNNYM5a7~SN#7p${XGLZ7^heL+^h(hWpKvS^f4yPk}NaMQ5N- zN@jnxd6AAA-+fjo$5s%D^;Y*X+Rlvhg+^{H#SdPH&pqyj^B3OHVnZdnqbKk^JPsbR zu6VD(6Wa8sy#d$6`HJ6NCb=8GtJeWo#|Y|Q7IQ%IUogeJ*r19IiZ6AF+>13AYEL1Y z)Z)nCYA!``Z&0h(s_;L9%3Rwg8@04p5rI2v*?K`nro9&F*ewaUR+^#y7`InX?-R!L1dl=M@-i$RMo4B@&wLeF(;(*z;JbX?C8cq|}ov=X)~?v8YIP(|(UicwQDb@x<{BC-`mw@R4zL zisFfZyy6PmeX_Fj7DKSkJ_a`ou@|PI=ws89PdLaWMpqt&&W|TZUgJKy;M5o0S38J0 z0Uv?dp8=a!=uF7^m_uJYlj+b?RS*~>Z-=!cAtS1o|>E}+4b#!Z8_R;sy6vFzmu z)1P^~%+%RcQ@w(slCSzRv`y-u9|zyad1f&zpySltotp@!&l!wSui&A(K{K|hjZ)zD z{$q1~>x4#~-3H(jzG&~@IT6Bfu+ziIOB-eu+F5ny$E*hsALQLywA}sDp%wPRv3-_@ zrxHiku57H=hOS4}Zo{GW4?V-OLx17Id?A@7C?#=C205SQiUuPexLycPmVc!DQ<#s> 
z0~6FEo4Vuw713$!m$L78IKdUDRP{v<85i2c5E=Rto9eJWU-kU4tTB^YJx{PsVMf=gb1O4SkLIG+9MqWEs>Fb!@X3kb`$JAkw+VL#>ldpVs;rl$y#7&Aay=xiNUlEp+$VUlPjqTI+^W>tN%e(9TMvGlxDsN~WKSFsc3k4P zY~~>qZ2{eR8mkF$)-10@r#sxGW+LkSM1OwFq4xNs^zPu>W%*y0(k7Rx9AyG0JKO#Efvx4sb+H1 zns)iA%{Gg-*#@miUPh$`Fn`sNo5%iE=ZyfMSbuPvzx*$Bz!@$n;0)7biD$89WdoM8;F z#p-rO>v$QtPPfowPX}fAZ1wwK?S`lB^?2~I$7Zfbn#Y(C^{(}dKI=NrmmzJ9hiHD!>GeS*DR6M70sH{v+d|eN zhhE0G!fG^#7;AYnXo&?nIBpt3&!-mYr7KE(66AwUH3X$UbuaJ06v9%RsU`)Kva1`Sl?k=sD8O9qX7pt!`c@X zQpYwH&NT;NCvv0*J7TGH;X_lJDk2Z9~x5Gte7uXXcW;$78BWoBwgSUoUbTAy|_SF-+x7C$wG>-Z< z2Ux2+G@8$E-bdy&NH_%t1J6+T^T}G)=dAe!nPqaU%YiWw|MCKCYzCI1o5c7NtT8Tb zK?Y-i-QeXu$Sr8-(B4%=Cqzf$V(RuXaBX&mm4nx9T9YX0-VpjoLN7e=OjUnxc(1fWw)7oNf*#rCdgh(e1q(3cJvu259~tP)5lK< zj`Ncj>wV?AipBggMQ-_T z>3mrAto3u-&cbI3X*P>+;=ysPtD^9R;bg{`hs?wl4S@I1B6_+wMyVo$h2DwwSJF3Z zb1!sPqL-oEf`7lPX2ExWcMP(DCOw~j$`QK0oAI~!xzb;JnfT?_r}N(s#`_}R4Uv4i zFX}9(OlyU_ca2;%mtGxAypin8iqDhZDrDOdIac$1s)j?(NJG!u$=_lZDF*KXK>c+O zJ|(ys9SW7z^IW_r`)5K+^<12JUIcrbJ7UCH-&GC^Q}GU(cG>1|on@~y8-@Pnrq>TJ z`p1)o+RHxz--H>6)W`w?*q_!@_}_Qt-+kTQmm}g~5zu)WD@)~mJ=^wq4rUa^PqLxV zB;P;Eg7lP1ZpZOeyi>E})GOM(t+fyl>&w2lh7&KaS}CA(1=Y;6Z$(;-P;C|^p%-gS z6D~qF9Ve%lu)?&DLKT^=Xr>GGS0B0Phu1Cfr0tSMsrR~72}58pu5_WjH4D3;tg@$3 zt*ENg!;p88I~%h4j6@T=Aq0ihbW_*lZQ3~wA@hqZHI6jlX`1C=axjOwlXDGRUY&;}H5qS*ue&-=@ zW7T6VG!+&Ychq;EL94=dcBb$WNoghO z=_YPAjA3**T_ty<8uGWI?k{WiWPWLV1Alv51S-3C&6~Y)OIvvk7)Lh^A?ZaZB~^{E zP}bmv=8RPoWvps}8eD%RXVYyjvHMN2;a0ZaOkrW?JaijR00ReVG43Bpk!$1r`X$_7rXy;$4KpnVpa?2(k<9mH5R zB@xrB{!GBX0!|($@#=@Vp8MbD--Cz9h5+ogg@Q}L4i253z`BSJ$i0^F4w3nVtbeBXC1ZnZLGr=`m5Vg>)L?$A|1dX z@8E7Bk2~}{+*K64*?Dc5Hn)=rNgUTu8KL+!UHT{u?SRQZtIWoh8tSPU**1rQ^0N@q z=vx3C?0HWM(S!ys83*DrmpJ$A-UXfpX?@XgnrLk`-RmF}pS`nwe|ZF`sG4J5MVhbF zHAGUa016>DoqZsRC#j!ObiOhspAY!+_Vn0PC!C)*PUn5<@l#e9d_u{xz(k6n~EQxladeiGN?1&Q_v?uoykwm{B zZ>?oI-0ytvKWtk&i8L}$fQ0k#WlmWxY<$Xmw4uamStD+G#WPLx&`<5=2q-59Uo{b& zrdnH;$`mjF%o_T0_dR)koy8Yq#)TB^tXCuq%IwS`{X@4(kQ^D`gTm5lz;nBP{d*IW 
zE%1fdPyOTtx8RDLRp4P;q8`LEFt)kgZK}O8E}R>IUD+9w&K)Z@Gg7eJqN1GT?iuy` zk9Fd5YbGI9-9T;Y3b&Bx&zprcnx54eh&7+9*fa=1K11#ndbiC)i72|fN*L^Mgz6)z zlR$C4T6+-QW+95JjEg_{7STQw6D(x7Y*`$knfd^PkS^R1k)7GCM22ZfD&&}8MKbw8 znaQ~;Gm#d)-IRsZkV*m_HBn985fg>OFtipv+0dly;_By)(259TKb!+Wn`nr5SB`Ph z+j{tz+K&`5=jN2)QrE=lQL{jjIDZf19QWs7j|!o`9FmDXaAelY%GLjCqlw&aG(2n9 zJbbEER!8(Qh~90&KEN)DN7suhwejS9j9wpqFM=;VYlwW>&GKZ5emp~8Jq+v2tq&nM z)hTb@Rz0N(YbIYxyx-n8l)(r2>4KIZ*HMh&Td#P$9u<0JcwD9W;zR3}l)U!b6FD{W zrrr*#_$S|&j$g@ov^r`{@vzvO_1LlKB8Wg2x8v_GNW%L5>M7-RzbE@aSIB)i#{A?nxF+mv^HdpdW0u`;mFQe=Tc+?rMh z2%$&=LpuPhlt*svad2n)jLSmpwlP|IGQO<^6{CUoV3vViQ2>#2XZT~!k8P;#&0$n+sM+sWz)v!+zUnNzF15S_j1TzWg~vm2p=A-E zD|L%a`dY(+%(%&T!alZHSstYVkD!ES`D6Zw9!P>BygNeXV*z{Xi0ta-Z8lp~D&GuK zDI5($!Lk@)@UVXQvv>C}k@B9!wPK>mzvh;PvdH;$K4e8I;>@sE6yyp0_$3Lr-#~^l zBf`nLW~&BIGPgQ2B~imc^@Q>U#=(MBtymSqD#q)4+(>m*G-78OY^l7B5G}C1>cl;y z@w<_TykMUCX<9Uj_}#WLZgq8slQMvk=uqaYZ#!yao>tipvcgqvHTD|$Dl_c$mIv6E z2ek6yCYtC|dl)F~z2J?l;gmRuEro&pG_y)P+9Dq=(a;M(0VBgG!CS#@0`oDzMHPsI zH4j_u-M>2jM0BbcWIJ^7unsh&G;Iy$(@j;F{l#m_zjOFP4;~v&J^IK4Ov@M`= zg#qTV-B_k&p<|j>Nl~=Q^T+T)Qw4q{NzAc+n*6WkM;{jYas#m9}kbM3QE-^%3 z!YZ6{pY51;oAUND->~aL>Q6q&P)#tYNgAOzJ{+OhCWK{+(_y1&u@&+eVX|pxp5?@Z zDY5wJ4xTN3vBYl8DYE|#F{E3r;vG*M=~o?WxCrkGLXGWHw*8@*HM6P6TBF%w(G^Y> ziYFrU0~Nl{h4JoW8u_NDyQ4Gfc>FQ{Ftw$U05Q8gXL9*}=FHUG*K@&_2^oSF5-ou>+*pWe zq_w-naZyX$jznQ!6x@BZ_+JQy@% zP@g)cLeETG5-NNrF1@FiA227i+Dpw+om`vs`}yRFE*-x8oS5jTA~(Cc;;;*Y*CdC6 zrdY+6x~cum8&i52FSiYMvnfF$kkvw?HVt59K984+`oPOPculR{5P}!pi5{^;^@6~p zwG}6z$_lp)M~~}OkM=cdO5vHZ(5q2`J%(M-dp)mEw% z%4bIzu&`b*nrHu{=*O#A9}yl0S@N%X(Q%Jz&3Bsm{dN4WqRm0s^Cs)bxz?&egZaP> zL|_S`SFiFf7;n!-*3Wc0bz{XE6hc>Hqg4$2V%HTaj#rotj?Sg8ObcnU*{4PV(>!H& z(XwgcJVBWjTB=4W^bv+se;h&*ktI|ZkhG>F1uUb!j=nU;qxF_I6Ie4W$7ngZJ=?xo z%gyMDX0S~EPNz@#@wY0rDYC1gE0w*JiB(c!7}boV>FKSP*0X!HJ^t6ErqC45fs#*6 z5*I7N6kY>v@TuUIEp7dz37m`09`FTB=Fx=UBp7DaO3nbW@k4Z+K>2LG>yID>%o3A! 
zT4j66TmL1BtQE*X_Q~AG0_oE^ejZtw*PH>MUiQ)UI?*D=xGR1|PH-wZ9xG=u{aT;3u@vi{ z%+4$8k+wyQ3>dYwI%PM!lLW6??L7U;`mtboKyaynVlBbRyZ+P9O$|_cs2JNc$o9I% zMO6H?b~6z1uxL)AVY4X5wXDC`tRfq;^$&QDh?Wm+vZT+gWr}45Xjb~L^lWx**K(?V zbNbBoaEZn-=hLm9g>7Dd^k;9!!^ zjDB81WtV7QFWhmPusvc~gnIUA-Emo=aVBP>_+x>SwREU^RuXT@0zm&Zn)Vk5K(=B9 zo=2@&J8}>4D`j$N9#V$*xP=b;DY?;HUGWN-joLLq`NtUuY^zU{CXpxZfhcDNkG_^@ z=rzr6IrwIqu%^)U^2!N|*Qa*2P!a2Plf~8nD_VV1rcSbyF0dy(7EO`Bgw|ycLt@{`^%_85RrLX@^l1n}1eNny1m+)4H5Ii$vU< z;NO151hg-?06Ve&qB|ZuBysX5Lx}>fgbUX4bAk90SdOZ$cCb53csEvAKB-s9UzYsl(HIEDRCuH+ zGw1+&gi>1Vh2a$-Bt~t(Ibs#B5XTsO!X(%4Ufi{}{6fz!Gt=sFJ@fQIVF7-c<@0ZS zaQsH5iy=?Hl~$?@E`e1vn{6wq7W89-ZBEX^Zby6%C1A{KTlNQ9xsArrne+UO{@<{y zQ3=tshSjKFI^9x`&0|6XN1{T-u$j?6%*mezu2+dyQ{Y{=;X*XU4A@zF! zl2#*&xa8cpFRf;T$vcLqH4?QH+c}Pw1EQF+&BcG%sP|<6TIuA2$NbkpJpAk#Gd@g&)Bz6-c`mgLr5jdyJ=o&zmn?ZL0b?TQFv!yuoT0OPEJ zMS0z*dT~K3*u(PXFfNre-I25gwI3LOg`$!Qqp5H@`WELuTHKdzzxeLO(dq2c(>vr7<<7SJRfTi4fqj`0d8_83J46ebszOe!Ncf(DA_R7i zz9pFo+PW2=DyjO7hjFK=TD#m0KYr%1(IHyULahsWaCI4ZzQvU2782%(%Zeywe_}?6Vp|`p4l1-_WS?X`|fb6 z-~ZtTQL-u=$*f3aB|9UtL`3#>jIy)0l%%XgAzK;QTgWJdGLEbhLb4qr9V7d>k6n4| zd;Om0`d-)bJoWkKe2)8l-><#z_Zgq9$*?}g!i(%;v;9~?Fz*5fV*#2`{B&<;I+wzp z1*{evxg!C7mGWI}tHTC=mf*_@j}SGf1y>r$gB9FqdoUG=GsG+iVi^@p+M#ToKEmsk z>c!?u9uZn4!IL0F+=$KdEh)}WD$0~2p1P@&q*nBH8S8-~F&)F69}x+c zN{RP3(dlPt1?l zWaFz7Gdu%?*9Y2_x}2N)JIaUcE1vfE+CMy54x(CeP_!+=Q7IiSzT3pXwaUQ^Re)jg zNmELwP(fP{3?z#8Cz=KU?2}n6K-o&(u6;)1+J}Ms=Ps@fC5d%;1?jr@Eo%mj;Lue{ z2mHWiY``B47bJtv3uFy^Wa!MCd#@TGWatj`2aotx*GY-E#QZ#@t;Go;ZEGxgX7flD zazs~0-9!iRM6~rFKXb>xBcX@0Rhe9tg=}%t_{2k=l4sQuQ2m1!1V=DHRLamKyiBdN zl7E3Fvvd@Cu}~J&vzz#q?F9cwz0>uFia9-=iSK+FE2lCIiH`U^S@H2QCVm}>vPxmEPFVVLR5?jzur&;R80$~P%zK)1!lCY)1b zb;Us;?JU<(Y~oRWn`ecVU(3i>hNf^bUNdePNa2>$UM-jHy%Jpm)aaJ>hCFqENUe~9 z9CF%|Xz;R-Y&rw;-PP^fiI0SQX>-2PYp-}eb=-)fT zL96&E!rwejkzr#K)}wm@r~13Z-VvTHX`jqfgcmmYoA~T|tZ`_~E=c0y4nGk(=K(=f zBAR;d+h8OB$oA?3&*@n~YbMq!=XrU_fJ|iRVh}=Ws(10Z0lajBa#()%%fRT$loz*t zX0E)SR(QuXr{;;o>St>!+;f5d&2PCXRlNBKX2N!pvt{*Qb_jownDNmnT-<&bG<=C~ 
zbv{1%!rs`}X|%O6Q}oIvbz$W0$0^gjqdO1zPrguu&zVc?dndlm=QBjW66}t=vi|>n zp6b0Hci2I59RyOckV4)5Z(r)zBV5{Y6?+~t!*Mr{qNyizf8dDz#0XxwS*zfiX=TU7 z{JMbJIP~rs4Kq4{L^WJ*yoAKoOsv99?VEY^CukEHF&DOh{Q;a+UZgx7^nnN@{$j$0c0*K)<58<)`8k znGNP>Ywww9%|hoaUUM*gF)3fk{P?(l+gs1c!7x|Dxik8BA)qy`k+oi3W?xF!kafnWcU~v0fb8kcFO9Up#;2b`^6_DVy=)cFUZ#d>4h_FcSPU;~TOj zQZyM?F>D%o(?+^M=-)nAEC8G17ghXZO~YwPK0iGppI__DBVjTk+r2UiMleJWB(_1D zudndH!~w$Bs|%%i>0y3e9x}4S^b1J#XFl?D3x|J| z|1~*TksKFNq9zo3;?U{}cMcX-1#1;*m(Brp{OWT*iAhcEj3tf>j}EcK(=y zRp`*#gjVOx5zxTVl4dklGxG66>c8;Ccwr2(3kkQQBMtZdhWDnxMk9cry6oeiTwRgT zk4?WeYTj10uyjw}+@-v~9l2m&x#Q?(P+tsMQhpSD$}oYc0fNpT@&m zCME{O#;#FN^k`}@Ccm`)T=etEfvrey;)6jux(k4@NXoIMP#I6L<29{E_gcm+wlX{Y z^adgCkZNZ_+on(*RMiHo*A)$Kujb_$2(B6ZY~81k*822JpjO1}4N$I(>z5=R4+T}P zmg)`Dh9#~cBvE|LZQo}>D1y}60W)0^_-svvl*{)VR6>Ja$a`ef2Ga1kXB@Ak z<2J1Y>=niomU#Vqqknet!?9TstD>2Oz9yc-{E{fv_w!v-bYdl-RW7MufF$0LpxM@rd> zil={EOw%!2w+)V%c+Ee-Nak1GuS*CfNF^a^;!9f~T|3iL>?=ru66zv8haKm%J^k`1 zd;rwm?a>)QsRV1A0(8om9yK<#7^i>aS$=za_zR|9))K5~Gl9f8UwU($I4BIgJE&59 ziT|{atKU>!5t1FZ|DYDUN+B4%I~aPT;uIE7Tb~68u$miS=BNch#Dq?g8q+2gEqBU9 zUHWGke;6OvO^KR{KgI$aVo_3*)idvPa-Rm|es0+?eR&hxwLh)tV#+gMrjxFzd959< z>;8eMRzm!t+pyd>X`oD;gMOyIS8*9;mfhdo_b zhS$0?7}2TOvocYeT$kT_;j;lfYUBXEV-y<5F(=^{B9@jXlb4;Psvd&HliEhG{Qu)G2Sm+~b1gGN797m*?NHZg1xp z03`*luy1O@@A^!DiS+2!c`WXNqe0jnnE*LQDfF9X|AR?ubn{dPr&SSNa@Y`>-#(PJ z9R_CG$uM4bQ!Rd5vckg@%!Kht$Fwzl{sS#6+_Q8vi32yG(=7}{?w>FQD;1rm26dV? 
z3C3XUZauUe5St`}(E)bQ@*a%a{OUJJL5To>UBaJhK(1IhAsW<{G*uXTwn1p^!IRjI z`-jEerFU32O1Dd1HHB?e{cRpQ7z0Fri0a|vZT_~hrF$+DNT2c$GE+O_IH&;fQzf>Z zO(Xd`DjFCAAqI8MDx9pCGQ@M6PTMO`mNVsXnZrOt@D=7MV=jPX-eWlaraRbHu-s{Z zucvq`a_tZlEi$Gt!TAz-+is{8s~dj=BQ{T-oLe*5zp)TQp8C6hKZMGj>-p~X$ZGpy z?y5}ey)FQC9BVirL%9QU(prsq`+aVg(EyU}i!r(iDWAQVjBJT<7Zf#DSxk(P^O*8H z1;~A-FmjG)liXPbxH|#ZmGf9BoB7*s2d7*CnqcFQ!nv8n{I+5hA~8&w!UlmhQ1QJW z$)?zqw6(ah4Lew00C+?kSwBLOu_X_Pf17|{^SVvn{>W%D0Jdh1541fA0y1`Hzy4p; zt;HQ2$IC6MwhgeWBIb#iIQdy8Dz-DFVL@P{XuVoUHSx&-3ocQY*6xTHt1ckWaoewX zko%RkGFe*Pi`!aG{d^V_G45F#2SpD@9$DhI){Q{``dFaa(HNCKX{-rFM(r0Em7n`*x+7xl#K4hK zWbVz^fZWFhqwuo0V^~qbevKGPG*&m>cM_=BDHSxNZ#fPeHCIEWIr=xs{VL!-A%H+$ zl&?l1_enkuAm*)8Trb;f&)G6opc(k(jERsJ5|r(LM5D$onXaXgP&V%h@Tv}$+n<7g zeILfQ?7A)w3GfQO+Lr)3JSVlc)W?2fOo}NG)m#_Xq|MZ3JC+Ht0I-X1kVBe8qEo;| zDINOA3Z*)$=ceq%xH=23yzLHImDMP~itiB*9zu!6sz9)w0vlcY@NBIZ@Fy8zH^G7c z0oSk{5z~GL6i9re%oK7TWlFG5^R2uCI}4)cMWE2Is~7R1PiwSQSZS zFp!K0TrXt*Nha1^0G{2^&mhyBRZ@tY2J-O&=lNMEerTIw;;m7%Jpay!ITC}~r&zTz z)IPU`fQ>4CEbI9fOGXJaBlWIG4mQ+A+CKp+JjbQ3Z^`?9)zTGhaR0UJ_UDSSOPC7x z$6~j%3w|S2IVc%$IUB~(L5aqy?Kmh2Hj>Tihm1GHF$K{V{$2e$BDQuV z%%UA~A4LhUk4<>0@Nc&AS8MF9g9*1iF z*oQ_I*r@aqK{{Jt+y*9P9B>@!E0HOrGlU5MS?%UKOadvZD<^2nf{pGz=3rSzb*p$` zR%nL-@zMtLOnW6T{;0606_n46UMG!$k+sPV!=O~nAaJK;xoj>N+ zazaPHs0g9PY1rgzd6n)sI}Xv@ybA7R>E!>+y zyQ3Gv^?3w9!==nig-qI$F>!-qi{f80g$dyG_&OQ1U7*g>)&Q|!YBfBU5o*P%#GD0L z;=qIdX9|m#?-_5r7hqi_{uLSjpC~eZT{awFf}iE9p1)l>9x?IbIE>>me%(2%G48yq zg&RWeC2!#Ew>>U;^oHuoo04;PUkJyExhIKA(#UxiKTzFs^KhK|(|wZ9#EtN9PoF$# zcI}?DyJryhwM)X1#!m~zBTjzC!Zner6r7HndrN1+Za2;o=~U-}a1ucKpB#6JnsE?7 zgtkuAIlF8PAnRp87DO8ahYE66vy@XHc}u^2@+f2yR0WzFETs{He+h*(K*VfwG8r;= zVKoAg=e6{%?>mCygNcOo@VFqisrLt!d%-%ho$H)J1RxLKF}X+xb(qjCu;6`alMvK` zRpKaP8XMi)J2FhpV-2dD$lXbgciyE6Z~Du9PCvbD>}C!N2xGD_6*Q)irkg0Ntc&J1$iQsBP=BtVl7@6P;PAdvc=mgZVI3M+I-sY$ow?{Fezn#lM(U?m3o~Jjkpm3)V1Pc~y zV~;~ESS5}%=piAWq!dB{U6u$$LcG|ueCMw5IdE72)0bCJ+Xx8)bzS5x`b*Q82UL0c zGO+m}1>-$%1SI)#dv_*|b{r@z-G^=?)PttGCQhOxWGK3V28}#GN{Z|`6BGrtKLe!7 
zJEO*SmYjNVAYZ;uv!Q6S%iR;~gCBDY(wMB6T%p}y753RrxKR5Dxq=0=qoNW}3s%)} z5LA-Wfy0quSb(?W1u^xV03qz4W=uJ-B{_PSHVq!F=^+q83WalcfP&_~?lv!L2ih^YsKpQ-Z1vdu% zws(fn;A)#D1(P8ZZFcX$2m2fq=J`vfcn=`q0p(cY?N0#}K+FY!AMFqPpcbsszWxaS zM=5IQ*5$FYBO>jBs;uF9tD(L9|Axr_DGWzsi#zLe3BhG2~}Bu z6sBN3R!$`iHvY2FX~t31{`Ga$xJOP|*n)Va#qfrVYd%ca-#ofo)*94M>%u{ohz!ng z`USB~Ggsz?g=HzMe3mqyQ+>A8b6}r?6z;$k#z1>mIg>_I(0(rm16u zd?rS^yLHm%$(o!zs@wXj7c9y-r#hs8;ZpJ@`OZ2&4xts&H@^SnLx#id7_Ztefmga3$Z^#4zGiCXuLI4VXsJockK}$nSN9VQ zifEs|SgijlH4PQE&?Px}pmUX4;NVSW`(!Noa>A>z#X!P$IHoNEk>$r0BW&iCQScWYd$9!bEVnBCKy} z?r6~<#bRYgmHpw;j7(^_+9FI$cKqwtub%I9Z=n?9G@yg^ZD{bTl?wxgwmoOCZ&*5^ zQ4LjC5`*CD*m>@8N2ELUzX-~*XD>SYY|w8u=1FprYRSEBX_t-xs%^vze#DEdle40L zg)4jcK?uX0ow3J=N`{JiA0qJ`3_(RjWiXL_`uTxj4h?+;1)Ij~y59q^N<1I_M0w=Y ze8rrJ6n)^?rEkVzO9cykU7Yw|fWYqFLn{SP4#Tox2>jsdI{y6~(dV=ktJywUHo7X8 zWklGd-uDDW4&J)uxb+&UQ2!p+F*-E%&GgJNSSilw#F>mr_^k9Rzz*Y{go=w>aP=dB zISCm)IM@HE&eGLQ=g3>wa_pO6(e)MX>A*udKz1%yY0XLnYy#Z2W0GQ^$3aSJ^^!mv z9_}FKKj$H|_@#tJBw_kiUJEhqkuL4}N;xIj`f7*17C;l04H77}_VlFTew@yQZ}Dt` zFaw5+t3ta5Vuayt`Qa?axP8u-g-;oFC=r$ZA!xAs;A!q@lfO*7odp)38GTIszzE6W z3*6=ut@N_m-wp7Fo%)&#R$n0b#WegXv>32KVWiXE)>LktlD~^(v;-g;Ri?(y!tBlj zoT;h9?leee`VliUYn2}VmuL@DV2Q+o0_OIA(f0!}5TXFMbE?)s#zk01K|z|s!}2e) zBG^DUNisx_|LUl>P-hx9A-AEvUSTvv+W#V6-f12?lTQSRsc_d>^%L|lcRsB0G`1}=??Qj1X$1ZP02}? 
zXf?W*{-t6zC|}0qI^Sfw_NIzIgq?=>(Yaehy{JH0!3~n`q>ptcG zR~ZD=Ga-e)3@3&HW1wFMGlkURa!fWhy=1*}?BdP|Y@lgQIHZ2!o(HxRxRbwJKsW#r z@H5&E#J6MPilF}h33#CTM(a$3w0aq$JX_Ai^+0Pm zct|$~G8{UH8AJU%-}smL<2E2)xt}aIQtLmU{?~^4*M{3hSnV&-jIF;Hfad?YmK;!2 z*YJ*=ivgg1g*9dQ`3@D7ad!U(7CcmY=?=AE)n;d4ZXA?Dmo@%1hyFE()_ucPMf=wr z`okA(#uM8AnnUZE(4SQL*BsiaN59eMzt32l6(w`uhL< zNuz&Z^S|k}Kcxbf=->3(FGl27CHoJi*Zxn1%FnQ-bbaz`4YDb`DLVf$xAt6&9 zRcH7&-t}+1YukwB-*^|4%G~}p-t}iZy75ifzwxeLz3*1X{IA8kl^Po|v-;t`*W&%x zUW4rY{A;iMYp-qR`2Mxmwo5zLtl7Wz+72%7-z~S_YLi>4TmM>Wo5lV=sPwO;wnb8Y zr_;atR*)v=UrX%|RoaxE|9MO8<3aa{^;v*l5BRU$y>h3&SWNU5b7z85J^e4kQn-_t zA*SPb$xwdVeF6-{_!=VVw@ki)>m)chI7(Wt?I%*fy^^-VG(ipI6cc)OA<8WOO#bh8 z7vY#Gv-S$;cjd0U3kom$%2pVNl9J~5Xg%DPwI3Fyxj73&mF^=1vTS*T+huMNZf9$& z`Fez@;%8(~L&356ro&srd=&I*+8lUR#^ATD+1xk*Q1<#jyET9dSJvi@Skl%&nsdk3 z&m;s<9s0^Qp??6FmaiUrkQ#O!J>$hQon82}Bm9X?5+b;QfpT*Nho8=Dy_%Z`NSQC@ ztn@qn8$k4dCef23wZJX2h>wEHta%r^3)8gRq?GL9f=XyTGOCi{9pFV{2JT5ouID|z zBNSl^E;_|*f|LdugWRw}Fzi*&mFTt=hpl#y2J@chv&h)3FXcFJBr$22gVrOv(l@WD zh>zcHO#QTl$#G&DRw4Js1Q|hFJG&WqT8}eIxo*QvFD1HeRZ`9zw79V4SFK>h;MhUx z-McnW`t5@d8D=gqF=^{HF!yv)a4v%|Zn~-t`gKr)+W@$S2L?)#k_rVpzN3OsacL;? 
zD8#@xB1UWLRn&USsB%Pb<8Mc7ro-;y;4t5iV_OIP${t|HHx9`(+bn+UIq)8)B@kBI zY5(oY5fFlw?E^>7wbOcNsml-d2Q_4lA1ZVw4cd}wM`P{+6Wn?N*;A}I1E%LXJHo=Z z0O5)~Ju~BGOJ2n{)&lD(eO7BA_}N^-4BRd)o7iLAxqGsj#Zf|X$ccPQ?~wHZIH-G6 zsuBvf)tBvNoM75`sFyCIsI+vs7uF+B>iz!S2fZK0jB$v*hH!x&WPrQ_!nkhcVhQ6i zj4n<~@`I0dxwGM3SD4Zg!9{4Xqs(6frBUC#Pri9WG6>88Dz%B%LiR=Vz#aGC_ealf z%<4QUDC_L(v;r4O+C^|>?3I(p!4;t8@8F+&sDE_vK@4&2Xg$WYK7G6w)X>9zLmu-= zHe3Y-7g`UCzM}`U@-Lj(M0X$NouT#bZ-#ABa*GOsbyZ+wk7ayuaQ$8H^_fCny1JGt zU_DW(sJUA<8q1xZK7C5JaUS^kaeRDyxf#~urrG#Yx~Ql~&}!^U0fNx1Oqil%y3IJs zVtH|frE6%2LKxSs&LCS&7}wj>Pfej2ai7RlTY*qX!>0LVyLstiiRtv98Bi#7m#Jo@ z2wt1vH2Om`U#t*4W3SP5f@YR2TVD(C12mSa4as;R8^;I0D;3V%2o+ynkVEupQsifK zeLWu4LKdtC&oH~SNG<5}++ZlXVcP5Yu_6SQy;4kE`8}eyf~DUcp6Q&N@OOKCtt{ro zYbJcV#VMrL#E^{JoC$8X1a-?>l$>OGr<|JBYg;$BWhSo5sxr$-w-IO-~TJ!H`PHFyh2FxV)gm;x6wx5 z6H-tZ-%opC{8jKfSQJlKt!uVefv*46gvL@?Sa9!b0}Qj^(UY>J zrKO>cj*etNiE$^T)D*?%p*Z`2cDNJ0%~pzj%@{X%PFu?vCx`4kY`JIOS=Zwp-`#=J z;i&joDknL>DJ%Z~4iG3Da(8n<9|*T%j5E&1&Iihv4d|X;C|L26GP`x8xP2Ib`Gq^> z1J@OD5Atz2;xx>Umo6I%%zltr613a`>p}42Lgvcz4NOp!XC(sW!Tqdc80I0?Bksxy z&BjzWrr+!iAe5mGBpV+Yakk}o#}8!lz1}&8k9)`Src!V^O;IeKL`yYs-Agnc-$M)2 zSSn!S57|n9Ot|Qtx{I>1-(!5okxWifi%j^Hn|+{ZUQbfek|l0;FsIs-ck|c*+Aq^( zMPy5I8pzVea&zi9c0O6X&!ROGQFCDf=-F|Ty$MN4$a_Stdd(d+8l^XYM>eG>F2LOO z)B*Y9Y>$y4LF$-R5zRqrlKhex{Ra;t?j^SZOO3$1Rl0QAT}Cg#dfIJmZCz)-6j;5i z_LfX3W=)bG<;eiPJS z#i=aAxJT;73vD%I(}AAX z4WNTf;E72|n#96CgOkVn*p?8F$4LU zy_2)Tnd1E;qoe$G=R(ikdeNU8@+WlQ8pU((1(8ZOcU6dU!1)-)D1zHK1D%_Hu zb9T$Zl0!IopThV+h>&pvqkKSL%`J~_#*BHzw0AkcJ= z11FSH2-_Qp_+o4`MWoG^tAFl3fcxkEWAV@f;xUO0qKn`_h1Q?R;|FM8-^x^e-3?SS zn<72!Ya)i`x;}#Z!28{3x>R|vG(UgF2G~4yCiSN}g_ZY+BJ00#nEJcg1M~A8L@4uZ zB^mF~08?sdMt)O@Y$b)M5u&#^%vLi+@{%lH#LdUROESvjqLY$XJc!av5+}NY8uB?W zptjy4A_tcP^>YrU{wmj7yhk6%(Vk>q3B)WY`+!A2K(N%LoB8w0ha{8UW(iskJ0?#+Ek9xqv>*80U6wI-LTb1#*_i&keT6D*SkgA)7QQfwLn=7iTw?{#t znBU_&Qbw`wrxMb}MGd!mn&|D1j$8q*P^l$WkMFbKlJfWu5+L&w@P)fTPd+e@DFv35 zV;{wuG7Civbcv0{>~V5(l6fXYUoA9R=#NYY(###MKOFV*UAn{Mb7~h64P>>Q%)A;? 
zXYpXx40CDA2ynJt;I#nHS-T5qJ%6GA%CvDghlQ0V`Lz@jx?sXTer6hYgZ{sbqf4$xQ<(qjk-Mye{+}kpKn!0=Nav#qnzL?MwND>9JQmek|v?QMm4% zSU^sn#xw*qwHt z>1sA6-6VQvs5xNwH0;=eMr{aCBi-rinhb}n?Gi{nXGxSZqA@z?yCR<#^5rS=ow&?2~9 zl*0DY$5XPM6Y2&K1L&Cx4M@T%8|f9jG68xuRLw3hbXjI@4mc5Z5aibUK)K;|e_*S) zse$xYi+sS=34Gyz2rk)EgtZ2eOxLH{t+OCI8$DKDUQWm{p}Its4cz@gIQR7Nil1)4 zY=+Y3hJ$pXEFK$h(+u;ji@d~nz~Zp~9kq(K!^D^0Eb}X1ZhF8M>WxPfJSsbiDI)1V z=;L!{T6PGvduC$fqGM<{&@M3XKxRR)k*wD~Ehz~lf?MiZ&^cVVR04D@iu2R*dM*>C zfSk(K_36rTMiz700QG1A3%WVe$>&ibmkgl)E{;fTHX*q9dYY)HHN==t@{m zrHMZOAcah=6FTHfe?l+5$aT5IvCyH&*%qGgaCpaDhm&Bld`D42>Eapo z>`|KB#1Ae}dOigoic1gV5O!8r+fr zUaB6NUAu9GcI_d$y$iEuni%M(%NVxeo-N=0`LXQmArS17c1W3EuRLiWN^{aehwm|^ z26J$!;BoKr7etqc>Q%cM?|j!#?BVle!!ayQjHob9)uN?);GUv7Q$});XhG+6i^i!V zkx#4pn5!+uO@w~j?(Xi+mCuZ{O&OYBk}%SsL`00`ie1ZzR%bSs2~%R6LU?G?+;O~l zo11Vqu2ksl@%c(NG=AFQUdsXoJI7sJHt8^&Ex@|B(u2Z|NzwDqgaEfKfB5~QDJ{qe z)xPXIyJrO(UL~6Jprcn*UE}1g9Y20tXTXg3%($LAcs);~X?xyiw?k)S&1;X)nG>-B zbZsoghIyns*3Pmrj8m?3j%s%*N#Qkclf!RGL!}if1f1$ot&O#4i|J=dip}Q0%rbjFCSAY-oqLy_mYy54p0mw5uHr&NYh}??QSJ% zn&%{@q`)~vr*vQE=fk00%%tA)UR6kqLezt*I2~9d2PNv+Tb>`PD8dI`Q@(hG(%q9E z;xz6$s-kjpksmq(72y3{*cY|$=5S*mmMXd}H~d7@Iu7;jXE^Hgy<4jwt#a#z;BHU= z9gox&+xqQ_)c(TGnmu5h!5is!q$@76e&Tf(xUS+oCFt(&+~`h z--Q(~CuHpM&`W)cyRxGqrA%(`w`AsPnk95LL}pU z%)HA6nd_HP4I_7+u`8M+G~l#qcvhiXYPDGt4ToNBQ7gj6K=l zOqp3_Ay%otZThRor$+%ltRdNcg+y$7*J4!PHOk(@4NUtAXJ6Y zAi{W$D$4F*)qD0=<`h*$9k!44`j<%px+5d0vduaQ-RizRQ{}Y*IVbDG*eHf09lAaa z2Z=9VVYqrq@zjxN-ToO?bTwuFLI7Jo?)_VOA`fi8)YsPws#nPLg$T!dH$kAyqCWB* zGwQl1DtI`cixpj%mX>C*I5!nWQGk?usrF#-dkD{`n2+5C+Lt5_^ae9)v~ltB7BaB2 z81$cd7%Lq)z@GuH^z)_QGIP#2ZpTqgnwsnC?dAMgU+vnF6%#prE{_C$;GS^l=)o(Z zgr%M%c!|@UGYkeLT)21URoQTiHY_rrpf>YGGw;$lZ_3L+9tH%Bv~z)`qWNzozloj>rNqKQ}jXWC4m%vS&Sd85NY< zGENi__#iRN%M3L<>{vZFh2lCXof?!C^GPJ$)nS?I?xN)d;#L9&!DPkc($m4!sH6Lt zq>+wlZ2m2C*NF?x#OKLs^XK*Abot_}R)Johk<&8#i1|L|qLCl-9L6m^c}A{rLHK#fJsxr-uxNeK|p1tUwoxgn(hY!G5PS=m}WacM2l+4 zr=U9}^+{jh2HV_=cgto53+)YV)6u?>W3PMcn=L-yKAUjj`l=Y`S0-oxRwS2 
zC;Odwq7c+gA2T}3%5JAACYv+FE{B_3HKqiHBDW=3v=?9Ay7Yd?tXu4x~6th@$8RPDtN2e?qqAJ#18JaTi~9p8Q5v=(t@1T|8s zk3ZyL+vsE`56OJBs=TpFj@Re3w`a06uZWTzg_|hlOYTJ)l)NY{Eqzqor%6l4$!emN zJY6br@2v@Uhwa0Ybn;GDJw+$yGcz;866US0_1EgZ7BXHGATFry&LSj-n`Agi62vw2 z7~Sg<5jEsueWP*hNO7(@^YSZq<6HZ;+N*w_U3VO<`W4~NcQ(^f31D60OW%SOHPz1B ze=1Jkl4z!uTxTY{@-46n0-qV@$HvFIn1ZW|!@TOAnbg)S6dC0@Di^3Pp{|c#HeQq> z&eX{HR(Mrf{SJPXMxR5c60p_dp-EG~nVI+lR0htWr%ye+bDzp_&_9gJyt6H7_9=^2 z$&LBx(Y(O=*vm$pH+^`x&6vAp%;EhaYPnAxBM2WNitNBWUFP6+Z?C5y|_Z7yX+P>iAnGwFVGD%S9&Rn&@Fr?K^Sk|L#<5`yNKiTMS`cqp*m zxV{u?vlr6Rit1`^o-!iNoJsqH%)lRZX|+woFR%A})INP9VW!Sj==6~T^Nx{*dmTAn zwAS2`NX_oQ0&wTzlt0nkcE4!xffxRP(&z^~X?p$X?C8`uxuF>P;hgtzswn@5Rbh>! zfz^GH7ZjrMox7F^4VK2~BdPg<-CRHu@QlGNoc^445A;^%T{XZNsEM@_Ze>Ph#H9sFilU?L*K#Fvsj z;E+er`uvUn__?lX2=K9dCg?1$`{=P47|2^sc6l z&nVT;qz7fWsolIkmE}e~-31syP27Q~p@==@c-xB#QB#B*LmwX2c`aTj9ha)}ZI%xh zIe+D1N1KtkCvxV;V`msh@*+seS)IjOKEO58U!t4Sv)u?eKTr25))ZxnGubb^Ri!z> z^dWJ57GO**HDyS(dS;O#bapUg!q|sxI``8pi5S@sL=D+FG@t70pG8KLvn5T9jp+9; zalh1f&eV}Qaj9Q5c5zNU)4@f-%-Sv7{+{_^Y`H2WX?}+zu_Z-@aA3<>>z?H5|S~QO}dy(y~Wv*&&4AKS!VFceg33FPahyxk}`*Q@p zu#&jz&L5n(FpkdU0I$46-0T@K{K$GxMw;UOFU$||25hZ;U*yLxvYr8L&;C9P`+3;t zSCb@_v(YO7UZu$di1n=!jORVLd3= zGF+JWOIml840zebahev81#S(Le&eB#+5zBYpXOtG-&ygp1VC#S*_pr4(*=9#*tb}l z=(lF+JNfwfn)-xQ+bTRjiOY{4GTeK^`&` zL|9$M@pmrzRD`@@_Uk2TDT*cactblyTsJ2HQ;|d zwlqHVrE7_a&xBH%dh3%zEk!D#0!pchZXjTa{7}Pnq8S*CGRuO6E0r8M)*<8bT~dQb z+rDSN+xHMO9cOg& zrPKnI96BcxRypI+etE*>X3sP3(hBtJCm6#jZhRxvpOl#5gh{ji?iWn@fnMR1p53{e zXP$Dt3PA7OySEwyB$bDGtqX&4JEoZl?tEppjZj4yi}vbY2I}8HgX^jBu*gTH(E4SP zVsd`YkNf-R>(0<22+F@RFr(hUwz98=WcPyLV*kg83IW>9#9^C2H$)L6LGe#00zuvZ zjPcn+m*(#X5=X#&X5U$xs4&{dr8c%J9qNCUW7UKSbwONb$MN(1AO|qO&L5v~7>kE> zJa2JyBhKvDpD;ocuzaBWd+dY;JWHy7eyOj5HJ*|jR2b}~c%mk-?d2!i zIGuh%P+}RhrzUQF^eq(>zwA=g^35t=6(}D1lzzCsP1mQD!@JRh3OwcERfl@S-*&s| zvJPyi%%(VZplZ&;FCb5B zL!X||dFPlo&ce96m*FUUq^muLnvpV@ih$lg<$9)aIGI?0OUe6U0kuQQk+0}er_f@( z-oR5ooys#ZVe+z9B*Ka^JCD^oG#-&^X_7^7CQd*|bbw2RVak%&0r-@08A@&)A$}>N 
zqS97`B>Sxekh36ztNUzH!(X>Nnf$4$B-eiLl|m&6hG2^=scNPf-?8_LagoDG^Nkun z*SW*8;7ht(ryG5knfy~jg!R&G@njQOoUIwHJQJwAT*rGZur+-8lb2R6}XwSU4 zA_+I#K}qL6tCgIgAKaI%%k<6lmnWUO1Oq}%@5fvwG!V$9GJv^tg4&#IhWBXQQ(m@D zH!DJxZDC}5FVfS^XF#nw?B=kA)jhWUTwf2~AUU13qsh%lr8?!Fmh4o(P?57SPFm)( zRqPv4y=5=?(nvVQlg}wv_tvPQ2&2szV#-c4)Y1G%CQ<8RFXv>SITm_)dSNJ4r4;2) z#WtuG3E}WS4M3=eppxlsT>Ih{ITHi^(F9=Zr(EcGPj?r5Zt#=6i6VFFoLN+uE;wc- z9e+DT?xQUtn=)hCM!9^pB7~5~=gKi%kxo#=p{9*pHpx@WMk33(%umFgGETkfI$l$V zWIQ?!WaNVf(lR+AmsasfAHO=I%?V-`jw{j0&E9Fq_CsQyF7{c@x~W~$Vw_4-4V0&- zGo6~?C!Ubwo$S!|ne|tcQYR!|e&j`9nPMmiFV2^HsjlDElhd+SbW)Ny6VdptjFo+v z5%*4?9~%4~-xjX)dtl<*Z=CSJts&B%H5RFSOnl4aH@gxd03;%xZ;J$?=0UdYWngq8)wHZvzqYB$e)L`itbMU^)?zU7rv(!+ z56!{y7X&spt=hip>3)9EtaV;y|2~7Uktca?2n5W>YX#+abEIdcJwL9$)GjjNv~Cg^=7ee9NGw|}y^%;d*)AjsuTj5| zk&UV|iej$_@7CpXt&LddY<=N!hS{)dhUE(CJl_Y=y!M3A&(n4Wexki9HO`5sJF%zQ zQb7X2VY=|`y|>-a&`{?1lwq8&4YymcM!MHo-F}Ai69O40gR3deUAW-vUml(*0_sjq zF)p4pRXXKgL9m^H9oQZ{@FW+(!D5#(_2$$=X6E4Pb_Tz6yK63Ls`l4J=gL`+-3lix z-@-l`Runc-Q`l5KjKbrW>dV$O7}Kgvj|JB4x2f5`;y|-6(xir6J|c!kKYwoG`bfvj zqJTl3`+cBsOUgu817jCd&XN-P#10OmDtn`X`x(NnFy;n%wHT^PgofrnsEU+!R$0P= z*Vv??4Zb_RX6rXkvtjU~fWhjUjdFvm`PrA5aqlc1jevi%k$kB|{Au^CsS5}v5W>tk zCHJG6co!EJ@2N8z7M;F2qf^woSkS_DBf=}?WGtxBY47 zD_$pIhani_T*G6W8svT1k|{!!81+SX$xd7#3pO1)rS1!wD&vK= zqpD8h@a6MQy>nT<{7D3nMAT zK9#bgld}c7#Cq`nJ0iUv7kUU74Mog#wtBc#GdHW2iirw@$9(l4juJDYkNx-)Wn$O? 
zf}OMMZfvQVAlrZg!#RV^W4YMI!d857X*t9d1sptnkl%(+?iFz9_Lz zo$t-)+06I*4&SA*IB-VhM~Xl$DQ8j20~ZdLo5^<^RgICw%GqRC7cJD!~ES7mqwT|nTekyl(%EozyQ zB360wotK*(_l^u#(Q=MM1u`8c{i><;RiKZ0{WB-k-Q3ywAB&^7tRlwe%?Xc6IReQmYvJ0lbrpMV_upS~Ba_C%; zdpRlF_Jy%S7tTT!5SG+DjFJ!+55Ln2BBgnS(&nyq#Xw{`G*v^!@#$aW$rUs}ZafUr<@%_o**rAY#&AHNE{cRKf-kQ2$$v3aY#o(94VY8Tg{YCxxQ@Kr)GEpWtMIfbh^7Qmv`t&fiwh5%#A3Ze=YvSC0TCLx{KfU|7F;4A^ zjAzw!Ihbd4kqY6wx}az1ay3T=1Kj|qhp+`d;GHOL)Yp>d zi3f99oDb>R71)@xbFIh`Y!wUjx1=3F1*^B+tSFGGBILEm=SF!+9y`TjT%UlC_kV$X zfI%%}kr<1ZM@F>n))!Q~vp{J+3Jr!T?7A0b(aS=j*p+j(A){@EQDp7I5ng0Xys82v z)8%`L*wIvU4aUnSV>=Jd1ga7@pU=w3ckMRYD3O0alVQub!sB_>w7Nwv;wQ{xA*R+ zt7IEA&Q_t`G^jV_G|fyW znZ%Zg7@BapCRu!r@%qS07bmMDo{DF!Ct`_5Oj++!U1w~J$PvW8_#Nv$(d5s8X*JQ_ za`OJtB)ONL$Mo9Isk67rQoME~Ce|2kz~>CECILf%cXN37CW_fjW#nB1;L2!jxDWnA z9%@F7DfO=92Hulb)e)o!wBMO0lKFfr=MA)1-Hs`POz@1iOmks;$tV4*aZQyE2Bn$d zD0ruyPo{=!qie6W;fL4b^D;X9T0Av6MdE~q0=Rt^gK1S!30_|K#Pn1uU3P1otAE6j<1R--2JKFtLk616Z z4$QSZx6_0&At~iXb6wfBCWR z^EtPrg{53$WLp`${Qmen89SP<=Kw3Z@(vAme)s-ouVjrEg^3FU(nJNKkRl=?dB&~f zl|TBb$xORS7UM~68Yl!u_g93V)bk&cvWCl6+KTW$4;a(Cg#r?(9pjRhU}IHO2eT4x zD*<)^!)k{TkaAoE^AKHcx|n-R;5AA6THqqZN<;zp*#eecRb-Bij&;iUW*r2<&mz0j~6sB4|-9HuR@0?|MVsXw;180tKA9qvYh5=O#Qa zUtx?BxFVXsQ)iXI)_=7!v*&&*Q6J?^#JP4;7Ep#cZ_PV5H)jimH1bf!Bl9p~Q?`oF zW_7;SYSyYa`&P2!wJR<@=@W}sGfFNE~s&~dh49`yf*VDDpRNA)I$P_u8HbC?^!xaN0?{Sm1MysAi7-V z(4Y1rxv`nPrODuF`xK4sgV(s7NY!ARY7Am|QrZTC^QCcEpNcafYS>4o;fMBRqqZa; z1W|#$R>FLxuYNxn_Qgjnv-SE(epG>nYI2>zGRI6(Z!1WeN*#FU7@2SQ@_N?ph#F9A z#1vUogjv(sE7YGl#O^0sDYswlV2CJE;V63g00}DF0}hdfckKeHnvI&fGIT^*BU5l3 zMn8Y93zkk9Ux`Q3S3{mc7Sq?G1V^#ng!W6-MM| z*6;4QvIEb~=2BL+m323Z2EkzZ{fK=RAHfD2Ro$e3qX?NfTmhSZzB`n!gXF+HR|J~M zwV#^`zj&Zo*bx-Y2pju}0_^M~?chXMc_Ny+-;1ZoM=#N3rB~-@$Kw-5MRIces{a4O zzF~k+jO^!X^o8~qvj)({q{QPg`||UvwW*5n;sN5DNq4*SXp19Ic0V8gbn;=#6NyB} z1kD-BDMbP?0R4?CC0$=;BCDLctpsOdD#i4@W!HSSxf0I#&N&?Dx{x+|2N=5XGvtjL`R@CGW=c}K%;vKR^} z)o@t0LTpW_d`%=0u}hwL1fX#Ql+vlbwG|$d_krZ20IJ!^ac_8`wo^-5f`VCWCm_td 
z139$?$&x%Hgo$rUiGy-z&K=gBQHNIbMw4u%Y>`s3`e%GIWWFD_!cCn%xf-ixs)pt| zWER7}MJSs_111ESpRaGY2O>GD-RtT6C%Zmneq|S*IjD*Douvk z=Qh(~L9j)6 zCn9Cjls|yj+B;xI1>sgDUu`5*Ux9#gACB*5E`u;na4}TB`tZAdrtS9V^q$rxKJWSg z+QI_|(e{3|+gu&gpXhPd;@cS6(#Eb!C*8MKqzPq6mGY*KExH}vw|SqmoI7O~8N!|G zlu(5*kRA%B1PCtOndL95OUFZk5?=`i7aTZ9H z{z-*~SfvUgMH=MRt_8yF{)hnS6>ft1+_c`fRy4j1zjb;rPkki+JrxQD2^8$qpK6ZM z28utYpi#+vvA=1zs4v66X)1>k;dTSyuy(Xbmle%+6iH=Ienh~W(u#7dSBZRS+{bI5 z#p%w^=vR7Gj{z6+FdU8b$u2IiLUQryS5C%U0WQt;>a}a#i7g-`G_LG1yBq9YGQ6Js z;#dzyLoD)QYp2O_g2IF1&s{}r@s8oDj}|oZVQD>;ri0_}=t!>P&z9b$<7sSLE$W4$t5PoV(d@H}OG{&s4z&Px0EDQ$1sCmn*a%Z#s#sa5)fa(b zi;grl{h$n)D~DsaA~dG5=Akh~40kAbCd0f7Ps7ReCK{NSCXa!cb9O+r9pQo4ROp(I+JP zk&P`!^{7)YCNs1Ng8;*TnUh7ppV|6$OsvnNE-* zhEN$!zWvIUUwNfYlT8R-dOb%Vu zn?^Yf8pbH)s25J(g3E*0n{}{GxyvSjd-_q2tUvgfE&<4HSF9rS|z??g&SJ`4sT?KEI)G6HCFmKC*#-cp z5;uJjWNrMU)j45!5rXBxxnT9Q$8FTbURmO67?=p?PKk6OBXt+nw@#nXZGHi$M=@Q_ zCL!F@cs33oE_+5YT+WizKA)13@{R}|(bHy@t=5mBw~b1u#2SL)jD3PWrs< z1|=Suu5s;VGfQL|2(M$I1fG}Fvt3}9c%>K4yjuWC2;e3`KOj(5#*uPq1J%59B3~*Z z>$dCbL%0LRuPhX;x2X>DDl@%`SSfSKpf*@QhEa<}DklM3G2~n-8>B*2&B9&+#kQV4 zH7%IY7v<2E({unm;q@n41rTlW0D&`W8TOLd55L6Y@qznm#*V!k?Oeib)N5}k z!LlkQ{mm=OZiw@S1c}?gR7K>HOnfkRkoMdFH;ad369uFT&``?zZk)*|_Q>$pdIr6i zj|#7kSxCAcT+ZL3;Xdxk!{d3mh1bzsW4$(O)eY_Mlw`}6&`5+)w&pU#o{W@M^y83W zLu^Xcs*!C~-=;gnR{!mpoUFxJ6`t}2Bc!UFtTG|#EHOx+9L|lekGCQArUVoHH7sHH zGc{d(3NUZGrh9h@(;^q`*Gs4k*j0^=bJSW5@e_l=Vaf52!RQjMunh zzoYFxW1u70i_dZTULsoE?@#U7RNsrsIz079m1CEc^s)(v@XFp!$o=(Ks);?ysTt0A9`iH z+0q;Q;#$Ikv8Q>_#8d|1lBN+luZunW{HvTkeNe?Aq`8c+k5g{OW>!=zn`vo?eSwOb zjt_VPIFpb9r?(^5&KOuasK)~;2I)pv%RqubY&jQTjTLA`@bN9i$w)g<8ze@n8h_d- zYXb$?46v)`Kh7b7Laq~X=!SIAsfIMi8}n#2BA$Xurvp2wyU?ppq_ba?T-q>{cOma57a%?% zle}N(5j2DGC9*?e0DEO0)gxOUJz!6Y#ZGi~@R%D{Yp8W7o$4G^a6vPbS+jlnHhVNC zwhA8EY!S@xcS^j#r-z!K8~ zG@LYf5kDdap%wx6X;!tVK{r#|?UYgqs#xh&rrcQ!M!;COLB z!tq9~S5u=C>8`Z^9cw~dYiyk#axXlHvPvrNDL zdzEymQzAl}Ql_)JW$J5gm#JyC?ghyBb($FKU6=vFNjt}Bc1A{;j<4ZajB5M(j#TUh 
zlk6m@P4qL0$MUIl0Ss$VZ)55O_m8yg71@@~IlIx|8Ye!3zJ=nN<=TgF7VBdJQqp6j zzWNJWvV~?%w$=4^TNN7o&FLrfWz7mC&{*C0i$}>8*Tx%3Z#mrM7+>Ftb(VU2R*lL) zC*r*!a$v}Ap$Ytp2BNWUB?R@l>zU6ZH`z%ODEI;G18d6Gxl&|&d|Y=$c)1s0F7+tF zqfarm-9xch47#Ec(_ueUVRP+o@k=wsK}!7L13Jrfr^IDMilH)ZtbWNMy-|fp_o<+ zr|5SEKY~$5B9nk78UWwwu2_BtG`KXxcJT|0L?j{Dsk-ig@{-!APByZf+IlJ!dkmIo z_BacKk|}sJ{FH5h?%a5a|PrVfMe@9ZD+{Z8=iLT~c{FALf?jzYUXKYb_9MN3F3($*migHyD~9 zB;Djeo_<1VN7h1ob{pP$@~2$98A02RY)0ZlObgl(ZSd1LwmN)4uZH2<8`of1JDx*vgJ{PXG-PQeN5BLM=K)eLk@iagFZ~$D8r=@5~~V zxh|&8pEAATN8;3XQ%+k?Xy{#U7M6P=h3`l0Y}lSXK)Tc1XZMT@5U6wmj@!H-i$($f zOL?w#n2N`wspy>EmWNhG`J;+_hA5-;*TrMEr7!NpkEw%h8rLx(NyoCbv|^8R|B<+r zAyIyU`pegp zLxVZ;!+>|35}wyYkgP$tU|3(S-KN%U=F{hrTBeHp7hq-9A1f5++UCDjy#J;2@oVY% zSA6?W#v4n9hPnfWgB1>-ZhS1$lmw^#=J$?ITZa@H9Ah=BU5W`Y3>O2}S909OJz05# zd=lrh<;|~!`%Wv%x$~z81uuSEi!D;T=p(7G+m1RjD*&SiQv<6ZBDl2MQK3Nl;l;(z z1%-V8&v(+UbXjl8s%&=BiwY#NdstA-BE zrA&&YKlAES4Q*{TaYLcWfPvE=yHQs(&y+|R76VqaLf^brgOKC0mQk?m=z816{w7}= zxx(*uhssy%x@J<}=5t7RIT;11p~iBX!~JX}(y*3@K1N8pqH9fsZ*f5lak?{U7zjVJ z{Rc=sr$&A;gbeX>AVxr1Omy|Os5DEl=&XqOTMW_9&SQ-ksha66SBv}T<#tv)@=(kcGrduD!y$9HvT<58L z4_89}W)qYEAkm71Vjt`yE1npu6Ml=9r)WwbsWkpGlEaJ7DT0|FBxd-(q=&d87RYYV zkuI9-mEz^_U0AzDx}j@SlMT##m@`4gRNFvDdH#%D_N0hl7nNP+%uug>Hyo{?WZUlSl}dtS82;=XuIy7S(o>JL&0P8S9Q zvHgi2cVWl#4oTPQxha&h2q454_4WHVwS4NU;^~MfVTwV4%9eJcBnwTGh~PE zxP7lXLHvvZLMKb3DP~m}P(wf4NNBz;(2m*-ziCBAx;XjODLqm!eQlUX*P39q^Ly=i zG(6L~$0;|YHZ-0fWo;Y6ZKMl%Ex4)w3%`ZF1Kk{?@zDR&(VK+0UMkon9w_`kUq&$9 z#!DqColbR{SgKtXF%3dGH=={z2v!z!Em8ig5hx9i4Y$7g=mNOQ?Q6nZCL&DK203!- z%VEWJ?>RV8j3K09rChK)Jrym=Di@q|JDJZ?M0RP|*?{#C)c1j?*qonfSOKYY>+}u{ zDeVV@U%GZ9f{VawLeueCOU$PT!DH#Iyt~`sjPfqP+b_oK7(8YUTthctXw3;`?KErP z6snc?nEG)Si$@Wj9pY3D0Hqio(B8mcp(T`*>%?9t5jrb~PRSpzGG+SYn+r7g-eMBD zI8Q>w-vJlE*hS_{ zJ@pH#)F(m!`CfD)k8kVQ;>&f#7!iflSQlg1mH}4C*%H+9x~cwz=5uBZP$AxnCQ)wT z(yp+Np%rpuZ$_emKw%uxj-m)?f*6(TEwVzi6Or1tSfh5&l~VaBS`+IwBan-rDL4r* zlUHGVA%pF{#dPWq;*id{lwsAhx#O1w1vlRn(Vd-{pVxfSj-BA|;p9Qm6 
zR&ZUq9EHO{;Lwe=K=}eRXWSNPcU3(tbFJ)AcDi1>(-{LI#n5fv0!X6cF-}wtk)*We}HT6@qkxYpm({9W$5%OEN;)i2REpMwJOFFUa86ur ziFBYN!-QAs(%OI)umv<3VpMl4d57(wUSpbTm3#r%$5dHUf&g`9{E{h1TF+IS2o8v?4rffI`x`GTso@d}O;D#K8h6Oj9V>j@CQ zcCttox_rxVEeOQA>`E981T*(C=Pvq^S-BmHf=NDZpD@y`<=-$j=y7{j6*}I0DY65Lx0yoe-u+XS# zzG8fx15BAN1M_5ab#=8o&8$XGpgW&3pq_gdI0d5M(RRb>`0j>9;daDUrLRl4Y2W&n#~^k5S^y=;Rv+S-RFMAmrO1Y?&59wlGS>O_ zX&Dj3wm8STQX!sYMQOg|Qf7ndFZFc2#}R3Y8P!e|3nqdM?7B5>ookE37T20iq#M)} zU71UYFsFxI49%{!8XvDSYjN}*p@#rgEVp!#wzMNkI9FGCO+oYah;yZ?k?89?tWjEK z081jqyhnJVB05Q6X$Yj(iA4Z!>I`6tRL(6*!csoj=+SpaP9)TS;jzq%orGKfemntt zbpKU;`24RhtP@z^BiYbXhm*~}>=Ob22}IuTf$x6=4Z?C8*HytMd^YH-X+Qfih9YaQ z2o%lI_Iv-xw*WU`rM2>tX+YC00E*KufwCwXtj9CE!Vs9sk%%1a30zI@8F-7>LYK3o z($0ZsvjbHnL|ZZZqOagY?1lkIoD9Ps9%oB4PrmpCe-TiVU)+4r4FQ;UaiV{8dN5!u zHF(H+x>3DlJAG2=oA9!XY;J&!gRV;M$BH6qI^+OWQx3|~0MSgXYaM|P%fBnL4wz|s z6Duy)pMJp3CjzjP0p9Tlvk-1B3)}KnF=qRq_I2v-EbXW^t|&R;5V}(PS!G^6zBM7NJz zE65)JG?Cp6e9noWr=nsW*m#K&g`#-@ALn7%R{R>^Mf8)F;sqUsc>hQh#eb`s*WCd; z*VFNVCb!Klp(?%TA6PYu15~A*$4=||=L-f1l~)m0h+aA=7D^<_5|eI#0z*baf|^`4 z>#G(Do?0lbmsPLeI<5Nrh*QvH!5P}Mvjzy_LM;H;OGwvV$L~z*DN0Dwu>|aoii;wq zqsF4L(gDX4v^dv4gRXejf-KL1d=~9fjk>lO{%%omt^xNwA%Y{lDPyZMc-gpsE;!EK zZ~TBZ-Pg1bAryhm;%X9+yqgtgnx}02uHE_^0Lnd-PF3%yP$93zG61}&W!}NK$jnnn zh#GJh?w{>cm%NXt>fv3n`^Ua6Zc|F zw49p$z>;Pe|G{CXT?XhMQFcr-yz_u7wAskwSOnJ_X5^833i??p8+M(VHOyb`v2iEy zYE{k+w(yg7#;vXgc+N)0_SVqSem?wvMG4XQLY+POI_<95gX+tWN}Zzua4!<^*~>&myDp&QrLgtM-hFf}XhxSl zRqgS3;~k&p=#ofN8n;nELP1eqZn3V0(u$~~@x#`Apv^T$KaXJDnvgU>55V^IypIWJ zsi+2WHL{161OjN6FJ^K;_(55>KUywrD`Fb-0`Av5ObB?A&R|M}r**G^y9YM0X1pOJ zx;t&)MW44`@SJG)N;9geR)BpI1*p>@_f^ko%u@zj@1q|*1NcRGxfW?XTLCi`HdNqT zlW~W|p<=!~!8E_VO%{mldZePHeE|2x68X`0<+OXtT&{ym!Q99q7tkrBn@F-S2D%q| z9RPz8kofvRmGWa53gne(=zQ#H)*v^ihc4N@=4=Obary7UP!*J@vYNCGD2El;m8Tu$kq-`d z&_&Zf%}&hc>&orgay(oAn$Ujd z(lGovU4N%JqURucN+N!jefyw_@O03xy5t92@Y)%?bAoN|-rUn-XB$gKq1mfLMO*BU zcFWX4wc*7<&ZhMpuuqsxRuVB*QRGsD6*f-Nhzp zk?x6Sd1X)*Pr>mHdeI&FqexCj!RVFAPzA*HXL{Tvv(<_y<&ub=<;gkK%-&Ic^MQ#| 
zitc5P`ZnW=J!>M@_`6cw-R^>{b_bo+PE+(28VY`@F7(;5aAryCQ6|0O7uRm?{N=M3 zC6xiEO&mDlu8k~geb}KQ*vb}h#z-_*<>NP(nC_&K?;=B@2BE}&^{c-2x>!_8+ESrx zS`H+a&(6AnJ~U}*CgZr$zT?Y+kj+s4ZG8~jnK6ml0jCCFS?oD|#Cg+%$4N20W)&Ep zsnbTQK+&-zm`NX7s?-!rrlX6F9dUyH&? z^rF^%@6*e7r=9pJEQh}^#k9Cr0CGuw=NYoxLU=p?%|Qy7jhFHwUbwxoDl>E5mC)Wa zf2)R(^93`+iWuy#oMxM#1u)2;)3l=F5eA@VXsD$~g>ZYCIV{e;1~BaF$Yag44Y9_c z{XX3;@qG~Ga#we^fi4a5JwoULBmCM4W>9)>~GU$oHk>1iVyTm)+-wni;qa95=3>Ld(?OG=N@1NBc04Z>QM%L0KSspqK zv#oOEd2Y%?i(uJ>j!L(UuQJ1>R;5XF5pSDK<_je1YsosLAO(;7*REI6FA9TWQY6u5 z!u<68|Az|E+1j$eP!}GLjJ#v}Uk7j^UnM`^8wvndv!N$>qfXrA6^W*dKV}# zj&{VXfliu~wIMM^d8eGL`NYsKEzumq2Il7Gyrkq8;~&QnHxEVK7wj8tzOpb55+NZu zX~#?-9jDTi(?&Ju9;Ul@!A=^ZY*y?!vaPMH9YjOuf4PDQbbpV1n&^1S9H_`1(^6}8 zR{?#J4q2}8v?o}U=~w3~wA4Tkkx>EX14PwjTrd3?!H5tR0O;SzRH5k-jEy62!g8ZPMmsw#pF^{ zO3NUHQ4LpMYVE1c@mOaL(ot%e&UWA*eT_us^z$>w!WMCzt+Utc&b%}cq8Ntm5CYem z0-xv6Rt9Y-r^C4ThpM|oO9N(#=y5GuE`O;dGD_=V3Yf{l%P27)uc>ZNW3%7k@mq8} zv!&Dd`3#S4XtIvcC9OjlJrp0^{({9%s(A=8{>#hc9dMmlt*aOLL8sw2{0k10%>`$S zJj}A6kM!zOh2AUAx!u@T$-L^ESZV=y@XuRB0oq!==Ci}~K4evRnHn@v>I$xg(2e)^ zBSH7}j!64(W?w??xJQ7lbTHBxupa=;uMbo;Dic*5SnUUgB11bh_Tv4uu1n`;DcPeN z8B`TP#+pf}S&}?`k@SifK|Pzs%}fqC=Cnj{X)>WZ(QzATg(5FvrmzL<-8VzIr_0!R zHM#phcvqJ}UhF>;4jPNUG&MDq7|r+AHyWS1l|;w@0uP#uRGR*Kfafw;0MZo^P3q8j za2`P`==)4WZa~6~r>l0ZU7S64|i zX*u5vAUd7i#Gjz>Cg_3kzxhx=b5oy!w)do`|& zv+`IK=XRSGSLma6TAG>g;cq4A-h)+u`&B7Qn?LjnGkCL1(wSkJplgwrXIKKVBCd9n z&UuKEkrB;=hg0+u8;*r_0Z4≈#<;_cc?dx;P#B>BB)jh}J$G+O=qkJ< z9)ShYOh_tMW5}s}sx$9dCd&OBONvj{H=b%YEpIge$7)fod?N|%x>YFA9Ol9ii%t@qNlCd(N2gLV_GuJgB9 z(rUs}Xpr~hx9=2d?5vo(v~AVwX+Z?vU}MEOt3fuCxu+|6Y&@Y%nDi5zdO{nQP?sPY zfWN9If>~)M@~z?xstPpCql*W09C=Ba!(a2>)B|p1lEH}%o}xOATc;Cla>iIViTy&P zh_yrSl;xVCgh#{dtBFNte{un!wBS1mewU{Amla%*G;dTSc4K_Og)J&@Fi+omF}S~A z>3qT`Xw>{2xxr9twT9L(^tlb%v}x(KNeLR2V}|qCkfHPS=W`wwjw=EUY~vSGPh8USWa827?Co{nkc^}5fu088 zy*M@~N5QA~a22E*jmom(Qlv^+^iIpDfs@B-`EKL(A0|a7M*z~m{O3x);-hEVQG&rK 
zA<~W_Li74d7cb^Pf9^hlSL@by$l^BU9$|@fQrTHGn%}43ykk~v44%k?O2Heh1VFP9nIog$m6qoemsX-YM8Q65~6lW!Z3RMwos43`%-gWKmdweC@j?@`|w4C^!$e!g1-ce+=&70%^KDD4WlQ zHwYh&mHu>k2o?}MP4_G|82t#JcsRj1SuL>}Ps@CMo*32kT{8dSHb~saz_}8y{e?sP z&$S>t0kFaZvAtHVHFgYw8%);3k|rm8$Nx5Xy93}Yla4n*FAFu#G8aUNDOF_Ntis#h z160bEhxJ2XC%{1A<*YZCA8xup1Y`Uu3?1~QLAyt`v^@^bUez7e(WGMRhQs>M<3z7+ z%B4$dxc}_>hg|&6mvuz|t>It%97}S5>OeEVg5_dJHJc;>=wBvi{!wHCXOF?rX9Lj6 zN|&_U9m)Sd)$K=gmDW}&{{~v6iwt!d^NiA$_4dfZ45cq$mv$w&m9{&6)rjahGwZNt zzzhIa#)k|xU|5I6J){Z5y;dUzqdolYpLq%`E8zY4jR~;R91O4{1srV!4p89#6wL+k z)~Cu@?pV|xB#Dgxw4_|JPCPu%{$;}Y6>YmyIPt!JLKep`ESSL!X~s^o;~m`arzC#6 z7T$(FL;?JZ_{LmLqpbd*=Oz{beL4 zNP%TLOWVt0d_W(UM*r(sVn6SHpAMMn5K>2v@jpx&f=&}VTi>bv_$Dp=%^$RWvgir~ z6;ID`%{Qljxu-r;el0ESBS2sz%7*JD4?j%g{Kv}v52x&biKfN}WdEF~crrNL@;URF zP|er=1it!ZH782E7$x=E@X?1XgwaIM3%3s!0%I3L<{$f0aA2(tT?asH zIxsEJ{mpQS&@E<*j@R?O^-|VoA0`H?{$+ z;e%rx!Wb|XX4^(-Rf*u`49Srd{{CZ8dBYswsKpQHsHmL5OZ%|<|5-hTx1mqMv|A!q zJ^Ii!aU68-lmtJ%ywAZ3L!3Ges1dPF-|r3UPq%}f-3InS){^6Jbrz_AzyJ?8CoayP zg3)}%t|7MDnDH3jUyV#q3(Sb977qK5^Ai?Xz+!hDqs%&aAU|k4aBoSBsoi(i9{6!j zs~0-&4bWf;YJ&|ah8xApvLygc=QGY&CjUE)weS|&ZXT%RGaW5(BlPg7Yq4NIoQhr> z-u-M49jVyRTjwHzd?iIL%JZaha99?Cb zdVjk{Mq>Kr_S|y9#YZ=97cfQ#9K*n+fB4HEL=>1(xu7Lq&@XTQde!q&U^8JgnU*+q z9Fs)+7+hM9?Xz8*Y7?@7l{BqKUMQhW7vg&iBZ6r{pp7R|+36c>Fb$M8xt%kOWqyAk z`p)oU5+X~-{}{`mhe9!rd$~x78(v304?J{+HP3Z@sqfRL)c&;2Mt~hqUBx7c6^5&3 zuHq;+_d93~PEQ-nkGU>uCsp+pLVkQ}|EoX#z{MXXpIbke!2T@${6pmhWWkVp3Giq4 zhw|;zlM;)q(KFy!ih*@M7Z5^@^wB{dg>aR$a*=7d}JV>oYL94f&jT57P-U9(oMi zZyb8VYgYISIWv66mPc!xXZ`1(E66`2iG2+j_C*7o#0Z@f)?{(8E45%v(;1ny z-!*Mu890Y=n*kVACOE!gK7U|*U3hu?#(Q7bPYwO=Za^)K7$AWAzA`9ygk2FRftjzE zQ&azt&TFhTVHEgLk_Qv;9F@{fuU+=Ea8vT6kHNp36bMD>Mk?2KS&+-U+RN;o-ERDn z6FErwKOr92CHlWEcpz|QM4QIHD_NWljBZo>IsQ)_ODFf5?)XH0O&i5Vx$kWQ^P0!G z#NbW;P<$Mt#A2_UjDP6}|T$nD6K0$q>KRV+nC6q>4f)w!W3ERog|U%5C|kd;w9ylB;yK~j13&jpiy z4ao4@Gjg^A>-qBnw1;3pdkh)B_sRzXixq*(43`k&F+^9GFLnc}^S!l_wosB-&f|?X z9@;Dp3Z*I_q1#_%>1M`R6n`3TtQc?zD5vQKsy{#bpH8mQ1DnEZx&i-TcTvEms$^Jt 
zv+s@(CL$AX7%*^y)xhGidql$TsQ$hJ>74x~C;YEUoddGXd@fD%7rFm$0Ldx9&VBVR zUptTi9DZ`3Wak0HJANM$raJ^xpPa|waK&alT>1{!L{Yb*&{|X2d173K(A=4AP-OIY z^`*}Ke?$MroJgq3!EM7Lc))1?RkH|haE-Ho;o5-{(mi9V{m^lHQX=dTE#hB zaxA)w6-!RYV6R08G#|aF6i@hO^aOaY z(FDmL#u1m^NCb-bAVjA6x2^lvVGzgzua|d^8ZavVkl9aVOlyE^wCKd*kI>fs*Wp*l zfXa+<+8qD3pZvJL-eoX@tCI-!BgA_!#<~l@S;arPtw(sorBFMxK$fl#k{ltB1FO5q zB7XC6%TFiz*LxC3fT@z)q&{_oI1c6!dlT3knG2L1|A_&{6plg6mOO7@V;hZ+NwO)u09avq_w;HzK;_n*aqqj7$Y;$L3HqGRd^ z>yuf>|K$R-42%@P?GUsqG?G89`d{zL3f7{{m4E#hd;n)aM<^JqQb&m6$5#DYIQZ-0 z`yrD$d$i=1PzJ02CAo*`c{j8| z3Gd0)EVnO=VPjlo@`gnh^MR8YONVb)u9@%}j~;qHq7WSuiMp?a9+7rPq1WoQ8DbKv>(+%R^0 zX}1Wu*T|GFL<-bwy+Qn(VciR~%!|nMMSi~XmuW)D&|dx%)ga~%fg}9uk|pnZ<0cWZ zT3;!C(!HXuF$8vQLnnPfVVFw_yz@#%s8@r z=^u&y`K0S$O1_vac?Z)b5ZBXsQ@n`Thnq<_F+mN8?f-luT~V(R{^oTaJ;R$tHihAUd>!=d;8o(a5+xpNxvMmK=c^j3e_L2oMdGE_dbx65 zm?v=D&jcT2z<^2x)RTj{TYQvy7{Of9KPSU30`=r6z;9s>;TBq;^1$z+{<=Ww9^pm$ zU{v}XZ}EC~2j^Lyl;M^A+uCt}#hLRssr3jzp)L>1g-|})hE!aS0&^3ulsCJlPw8dI zoOz*j<-mzw9|WF@iJ=m8Jk#r;5$=;?^y>$}0el(^^l5zOiNoUkN6-haWf%S1{D zHe6!3KF$bk>8cI0{a;qXv~;p1mN@HP>jxjb`2Bfc0q3T5@{sC&zqj~5foSlpWOjOE z)d%N4^EI5fYYM&ue%$0WtL}}re3`GUZoD1Z)Rr3b^WUNk@<3Z!cXLDc{f_qWscY3r zuf-l~Jloj6_f-bH-BP7NPpi*+HHuNSx1OD}+jns0=ycZc+su(W#U0>1Cr#-=bx-yf z*Y_2h-THDjTt~sS$10GA;_h&L>{?&iku%r)1BpHRYBh_b(R*jC7~F%^omkGK=S}A= zAAt?44vA5c28m;E=hN4BUHSLM(?57#`#SE3dT@S#2Pg%q17tF}YV3G~RAO&|Yj|Ap z%+2O7ze-f;efi+WzCrT2hn5llD*c1K9Kt~TCvOTQ{F=U#d`wo1v-wA=r{Ma&Y<}d4 zXYH@sQg{$qBfcmhv`xvkBc*Ujw>`gg=<+Xn_HphtcjD|B&(fsLIghE}-pCYJiG93> zozzEWh<3DfpV1NjNTI&rQ6~N#ZQ|coV4FQBFVG3!ky|9@Dv9Ofkn_ua+y_B!y z61a9_vx1d)j3q7jyUxgY%cRx7y~wTF`#L)t%(%?&>&LfLFXx>4)Hq%#M2IW;b#$V= zb24n%-Sw@a{}laACHn6h@?TLA6G-~)v(FQ0viBv_jE-b@{t?!nul|W-^l(WK@kwm? 
zpx{%h&nsz)vTGb*`DIN(@551JP6kx(3LnAN(gcKPbUBQj1v+;=e^pLZ?FmY;6;>%F!sA6Z4| z$!QxLG0Z1%bHC7hXxzSd*X38!1AxW_ePR6oLzef0&BT7XZ88gQnF=r-_?&X=oL9fV zrIB=tn``?Li-}r+cXD{*8g5N=Cv@Ya`)mcm@bsAm35mjYNw;irizIagtV)li5_q}k z7Ca(39>cg@Ta;(Jxy+c>v#Yln@$F->n>Z(4l|{zpnd2q1RY_TAe)VNw2V0{JX8pWTQ=G-_P;wKJ z9o)|o8v{){ULLNet+m40IvItDVr84<#hvz%e|2)2dj7^5Ip1S%&s$_7v;88mF}Vb= za{>h3zRKB{=dD!epKVNxeaNA<|H%aqt}@#sWO|zN?!tWmSusjPg6;$3S0((!{7*&D z@$f1-l`TzW{Ka+-Ee7(2EhH~fJ!sCip)HU997GZjcgJ1Y8iCQ4|>Wn%=2Q0(6{gb;71M~PPGyc{G@2j0X zl2!}XKO6gPfCF)6Xd2r|?dF+^`rGxpaL_JK;`r_tCv4&w#l~wRA?W z(>@2wYrXj;D-tWEDp@YkP~GgfU`kbajmw`gJ1Hg#1DmnOgYjJf)a#zgo06ABIwa5J zNmliA5(4d9^x9{Vh&0b3GEbH;;&+{JH6enEeEC%KN$=UsTW`cIbO&W#jj73cx+gbf zn(ErT&3Du7bW!GdrE2N2%tbaZxuYphkGF&s(nvD)yjdsvb}Bjrfoh^x{JiiW;Sm;g z!ym-VcvoT_Fx+yj#*Sry@lC9>ynGjImx)VkzZ+X!>QbxI~Dlv+=?-rDP7qg5n6I~Ro$ z)ToQQU?N*Bu)?H5rfa!_BcSiwnn+Lj!3 zv$wyA&l`KOO}c_bs+~!r_jUT4lDqbw?7Akr`_;ftnp(Q-$(T7T9kU>5!pBUh&JHT} ze8kFzxpwtsE1+jRtOGw~J{NL4f#s-(bPX(%)?MntdLxW_f|*;qxr6E*zqv3FOOOpX zS+CZq)>zi+I4iazqi|4=S6}ON(qS)!)tU;UFqaAj)bEt%yttQi&blQ*V^L3MB;#fn zry9dewuu=01w&!1my&ro1(JFNMun+~)Ta^Ix8v13MQK_IZu;hCjeOhHC6ZJVCb`c} zg0tr3@f>%Nw9m{5SO&Io0x> z9(IU%iOErMova9-`)Ve?OtCj;)DSDxl3K5hO zOf5|?&AW+|1FdV*)B$x5o-&KWnwZWgNh4n{;!MWVsxh-ejoLLOvqq$0R-rXy1Xh{u z+jk3~d20NhDYw~s97e0_hmW7%&(?w^L8ieR8Fy4pHI0VfMrg0lP)L6Jq93KqkKv-l z^lMZG9NT>e$)j(Pr!&>*L>|UWm4W>(wM>KaaWvMz$#QOzKWeLDHP%VMUyVO_JFOa8UOV}>3K8+A$Nf=^3k zyG@hbg zOy3gwPZ4U>**sH8>-p-4eKEO|R%G*cL=c*Uv6Z7CLjP!lZoz{j>RaHOs&!@&?3K7nEhWhLwuGOdu8cb!|v{o2OC3?`}G3^Sj05Myiz(>sP0o zrS?V-={zY|=BJD<{w0b1xeWoI3$C$Z60O}LT&hqu-E)+5!TdJ8zME(FyP`_YlicYd zi-vJb4;RqGR8u&J+@>(hb}(z+uBJPed~LRydQ+t}!Ij;kcKxYLz1p1@@)%aFY?v3D zjby6lR?U#%!&d!u`jhS|cSPCoAkk_=Eo;w_$AcuA?u*GWPc2tZH3z*?ElJ=%=gteS z_*j`;dhPj~%cIq@`)JQ#k51Ow>u2)Z!nl ze%HBl6P~VLt@l;X9e1AaTPzhi5}^tJ<`xF!`AL%aBdi2gh=>$3) z*;>D-{}^~ad5rDJW%h)Nu78a6pLhlyK>CvGF61RYcjJ4>Gbek>$3=W!1nNlHJ^cd| z62F+Y{JQsqWYbPHYnvf=_AXnSOra_cs>tqxN-7&>hb>{Z1MkU&DQ}|>dgNO}M(@in 
zH^nNF;Y@VNId6#Fqsbgm%lVYDl&02@nbSduLmo1sEecatPvJtCL?sia;-p1o7&TFCC~r4YG?QCqBn z*BVv%_0+RgGFJOz5<=K|nmN>T-1M}Z;+&1u9_EcLBytF$>q-BQY!8RGHvr+|VOk3e zYi?wo6aPiv7)PC>Xv4UtZao(@z|70QKQqJq8l>7)hY zXpx@I%Y>-d?v+e03w~nFZtr>LKdXo$ zE_76W8I?=Y7aw9pL{<_sVf;$&)sEb&sn=<|ud+&28pD%v6$hvb&jO#73%ILR!0^7&|-Kh2^N4n|C0a017F&w;<-ZFo5bOFI%$TSq0rxzi~@!?F?XENuJ@bm z8$J;M<2$exH6xI&%|@@XwoS23`|N~-yQnK#<&h)9i-~GKX$jwEV216wRpdxM$-Qd$ zcI5aYiG~8ZAvxBmlT=0%-1flNXPU<*-u9qpSsSyRdT)kW^SE34oODJ-J69IgUXWG2 z=onuq`1H8}1$s(c04Q_M?>xYnf6n5!BHs*02V0Ny&D>H3j*Hil`*Gx&RiExX`S5w+ zQPdyqrLO4YLZvD14Qq1NF6uWq%yp`&wp=bXu+Fxu7co~CAg?bQQ{3^5NTqtx;-zLTISn2wC-?hFvGx_9nJKq&BFcZ)vbdfj);CiZ7d?(lwG1^& z`OD4T4%HaCwj6DH*|_;B^_zFqapk`&FD~nR;94?Ljm5`VBBkoFHFNPPXTIOZ2!%Cu zPiei~)|J)ciuu-k^28ioowbw-Tq-Qn`X&7Rmen}{gd+4Q|DJ=QDCLVDWJnDw{`8dm zn$LXr-Kl6cu4!ZS!Yz(_3qqMFp1uo@kOFP5l}*ddJGX6%^641y(b}s3Q$EeKG#@xxVu{RxdAz?MPkR-IH=bEA|aGY7K}o^ zAdwKan%ZnhtW~|XTD3l6T}N5-M0u&7t=+J?`{7gSfgrX$8S)9+E!I1@HySQ^`7}?nCkv%BHdy&*r@fqgY>M5qN5N&gJc$+L7{KeqT=hJ# zmmkgo4)MJ?%3ez%XCFegNZvQDZ|_XD5#tz??g9 z#itWlFxk?yw&qkQmubIG8H!`tVpP9%k$qn!;=0DZRa2|s6N9-~w_i<(MVCM$9L{B7 z|I6Y;Yuo5Lh=vJhr=MA-_4{3wgW=A$&T8uq|2{c_+h~mFgxpnzvXxtvpR~HPh`&ym zef6o|&==>z9kST^;%9dlWXQ9UB|=fO2f)wQ+x)fLtnb zp3Qk(3OOY6C7FdqNEH?Et<-0zX|LOQysu=RK7gwcCvq~+$fC=h>0IdRFX1&!XU_DH$3%yr(0uI;8SLRJEp zIovu&Lzp`FDp%0vI3jk5X03q4XC?wqQHOp}KN3L|e7a%AU)h>$tBoX3o1-bOz_?C~ zE1otKi}D zf0G#h`KJ5WK{l|{6+&~5#C>B?Ys1sKIQIlIc_x><#NUxKDA7Mb1nb z3S?7Vnekl3Q+mIg6!T@Sj2ztV%-5K;S_D%^aC4uZWh+Ayi!qJzSeql;ejkfLiJ;VE zB-TNVzgyEgGKvfJG@6X0F#Z+A7>T=(pDTy?aCOM;9W}JqcLsr8wu5wp|N7{{C*R=}p%A z8JrT-cVm-Udi|7A4RDuST^z;F_D%Uj`8&`uhhrEd#iz(@&Mq3q7iLXmdAD>mSs(H> z(H6J+I=1pY+i&OWN$o5+Fg<2Dtxk$vZ+p2~TV9Kp9pTZ;JbfOOeOdFPYs3ji_$A}> zUljSEreFB=yv{G4m&^4eZ0esop7Mf)-^NE@Cirx{TlLBb8UlryxL@(AV*e{oxe%gA z_a;bbFde~z|8ibm-m(QZwm8#5RF(uSO=rv<@qU~%o>B4dK-KU9*#@Joo;nQqN}*Wv zN5t{mi)wN0)JApu7=xX?Pu_2L;@VL)Q2g)c$zQ#hv}= zrcEyQdF7ueW*gVjYijtc1GG<;+y2KsqUwk)@`$$&-T$G=86}!>my18*$e8qbV;c}P 
zR1y5azsLlBQbL9mplGmHf7IafzFV~-il0`kBiQ#TpiDX>Bi91;mH1S=4rUT`)auD6 z;m6MuQ@=3U%%bqIoDt=TPCIcn8P&$sol0{&T>jNUD*=Px8;_C;s7)y`urpBML(3V8 z+kAC3IWShJvM5@A(dpSF`}qe`qbuN7U#jiCIp?$Osns^&-aZhOqlBpp7yTqbV|5Qh*o=?TV`W8t z?=X(MpN7aUzNEJDmVO!4RMY{o*z#QNyr^%jxx4XSHTo1cXw?VKsT@gvRMVNGYfd^I zCeUlB=kPM`HIrex{6)tI+|8Avz#VCbhBUuB=5m^odET|c%?Hhm!(7RWBAH)WhkCkx z^q9F_zA-T2-Z@^e33n#9(MWu#sSI^>)|!#>(y!F>+#%jUKVsS~s(AMwyxXtu>ZgJ< z$nxBGC-d#?2@<+>zSsUKG2~3SxU!(aER_A!M$1`%W~^;tIfG|lyp|^ zWsPJ_o?Ec@`6YzMFAwj-wR$>&Y zb2Qkn?FH4i;1Nz%xNiAD0q)(p`6Zn0LAe@h|5^rT*T9*@)R&!HU6$tdR|#2MOwH+! zKw;~ZVmFs`$yHqRm|xCI1L-4LIbWhq$IlSjI9F-7n6!UCc%f4W!4o8}ZvGl(Px%a}c_`KZ>{=t+Dli-*L?I;Bx|I_gw~O25TGf(ybw8cg zJ1@GZQl4vLkg4bj_=wNZ3{TrWsHFsx*agtsU6kVNo~x_ciJ%HU)=9ypjxwl>E#J4I zSq&kiC(}pJJ$_Y>cY~;&%y(WGK#fVfUZEVki3JkkoXCJxU%eAPRauy=nU{s0d_$na z_d#Io=ffm1M(&({)jtkZ-$G5EJG7|wG`UL`a;wwI&42xPL+AB>c-b$vi1DGLDRaG6 z|HcDV%WeQO!oT?$no!Q4v=++Qi&Ts>brnla(xV#+_Iu?<%`=iF{eZPjlY9_VC@b9Q zX>hRb+(YS%pU%GELfm^MS?)P^jBfV^*SP7vVnymMY9nO0l62$d?3T0w+h8^H5vr~- zBF*pocfty#siRh8ap{=dW3p%OPk^5Y8W3&+?{T`$;_s4_zrr4P)lc)CKs857n1}Y- z4UwRit@vd&orcnDxBl_o9$kA{SI1{jjw)Ky(R#^~ldb{CIF0K|uiBd*t0;4n$1J1F z0LFCSrLW83*JQNeGV1lU)nY>@c@;w4oH&{8I;PMZgd8!dsJt1ZDUxSS%2)_BMH?$|WK7v#MA}y(kHthBrhu>laPTPe1=V9{9x_)cwC&AjAS&$5}Z`32A+i3*M2?`zc z8!zHe7+9cILmkLaD>Yo9qNdgFT9H>mYhFwIJDDE-1H_wFV4#Uv`}T>%IQt6(E%X#>S7P+hWkmbAz}#b;)iC ze%sxc_XJ+?w|HYPu>L&^!R!B`0x|{qHvHs*PQYOZ_*y$y+DGi~%3gBbZ8=x4{86iU zd90FI@w=7y+dhXl$_N;WR<4S_#+YM9M-PM6CeIi69N%~p?E)6WPC)rm_XcNYXWNhG z?S%fJ1;Ai1H3FZv!}80*8y=ZH_|x0@Xk8%pbF0_xq&DrbD`ts4{$35Cs2#&+;5}S^ z<#2CZEXtsg8?M_qKcseI1M26xdd*4$AxP~Nu;Y4nri$O3R-NVMF}pN#J63IeJ{953 zhy?@nd1bzi^r(XN1V^tehDWCF1@z2Z4f(RJM^kd9O`CVKAHC1kIF8M&eqtLrVZLZv zKsLXZB;GvG)a=k*sYo`r03rFs?i&3Pm(IvNYLBx-R#ozF}^_y(q6n+HI;Px@*=Gv6t4 z92#JpYAAcZodjzsMecLRjGCH8fsvjEvdB?~I5aeq&zXJ&5gHnrK>y@(xP(|np9zt( z`t1S(E?#tHGPFIiKs`8OJ>-^}`bFN$ZDh3_ZyZH*H+(;Md+o$USW{i+*f1v}Jo70z zs~*?xYwMRS8dVdnMne@`*Ov7ru(<+uErM@|R_#Z-R2Jv4Yf4!pK}{rqO-EA0!0B1q 
z?Yg>*Ar>#-8||+ukl~5rR}q)33fG$$?v3hl6IFXSMGUkq{l+)OO}F5<-)v;4Rcn8@~6{Rz%W>5-&0UjTed4;oEcBGw!uvGe$Z8u)Ahn>joAAWF3(s(PWk>B==cLkROF?~t-jUvro=EA_8oIKV?)2^dtlsJe2G`WCFN)& z4yl_hKY0ACXn!uy(UEPeR~Z052%pZ*%}6&WWi*SS|3hI z+>Z1+LV{o2*X?XVlu0g%IN4GA!N8g%}GMe^CZ;w}aMshRmg7WT%?@(40IheF~!Meht( zM-#xyS0df@r<>Gf!w2a_7BeyJ4K0% zVV5y?`v}&q^q#gifwy_bNNJ5cwrCxfkre8$Yy*~A9r&RbF z4_hU=A78~ad)vs$*>bo9J!}m&ipV&5dhTPh5ccQR+3<=C&{nDyzdB#^ z_2HmO(yuxld7^}Hx}fNr)duu}VV_Q{Yys0z;Z$|$!wvkswZeB`6f0$ZP1TfcQLr(P z5*X`_-q@E(UW=VoDV@?C&2-K~LSCMt9F(sUl{!kxQk>=A~vc045i$@@Q+#O3%DRV+WfZ3vLdiv32)pyykk!J{KncP>>@PLyc{zz3l&$8 zOycy=9Ac$_4h0CG)g8fqRNQ|F!X0_wYv`p%IVQAGRL>tNp@xrQOZKf;8}C)F?+n$~ z*H=<8ia*z=4xJWN22HSxur>|#^?iS^@nY0QsOSu-4fbFR*}aI2z6i}MbrFx>OuDHB zK+A$2tj%s`A_t;IQ@MYv4kNM*3-&g16MR zzu5UYMH~^fD7LB<-qMD}@U{mEWsB0T*L=0bo!)osw-iE$LEXOE_2Jw}@zgOlWz=B; zye+hD5oANX>DKCTTqkGc7jnsg7_h&K(mpq=ZnVQZhiP~?(e7`sLO~L!^mu~ zXS_`I4&D${lu&hb@m@P~Ei!ALuWV8fdr3FINb%gI8!4k%*DjF4DLxe>QmGLxGThCz z%pPzngs=IT*>lX;m*u)Ec_;T;YZ7}krMGxipqc4W9jl4(H5-hR9~6GfHO_g&vV}?P z*|)o_jaqES#%;6)QD-V**nm^K*TV9bSskyxSL+Yvnxk1WH)AVF$%{Sk2!phrs7_%U zv@2Vz`lKG~Hr`hJ6)xLU_2a|w?_mw2`T?~!$>Yl{K{%228a5vanrRy}l@PVB*%4c1Chw$# z$ssbzt2c9YS(_j5)j&SWbMkV>Y)@Y z+u0t^0KYo@r>7l?r=2AU6w6;T)ED+1 z5S!UdWxSrMN z&-7d}e7-F$`Husm*Dey<8qk8KV0JKI8UFh}{H#P7!8aDoyRwPxV<|EEe!B%lMMZrD zIVF(G`;?! 
zm)-EyHsj~ja%siGf_wrAU0UTEFR6`d)D`&5MS|Wqj;PSx8Wzsxnwz_)bt+FDr9q1I z-EJcZQsQDt!-bJu@3U-JT~O<#JmzXt+f$afDjwV})Uu?#`gEpb!h7sdecTl<34yUw zC;O3E_a525Iunls52&B0z=}yW{`B9U&CiF%Zwnq!+EU}J3A)7g1T!~9mU@H~`Mgls z@ed=;%QvTeYsc>G$bhjQx+W#Hg;Pe+;w>lNAV;`||ea9Fh}2SGKt z4%BfI*CqzQp=`%Sn;TbvT!s^D((WKr;P4)B8k1qb$oV}@hp%yin+$7XV{HughbTkj zNPW7oDR8h5HWJTwl6B7k`N1{t3V)`{Gto*@)ju)yQPSh=5P%A;sOcz=X*6Vtisa?# zYb)@ze94*8-7;b^~P{3W*_CX-g9rlVZw8BKf`P{Q42!qkqViL02{{O zu!DiAsi~HgavT$?j?So$IP+s8+T^WkFl=7KA2M-P&ZE8{M;Z7d=1n@q<>5ly9gdlN z7}RO~dyfIpxfr`Dr1x!4r!9=w?LK}eHnZN@+37;gsy}AU%`qe553zUD+jLZFK%kou zlNi4hDa!N&ubr-YOs-EHa9#12C@Cr5jl1IF?;Wo)v&Q7=)iJtvfQ48WM1+MeycL^` zZhG>Tm`S+2bbPoT&ydH}@QdV{|E7SJ!x{(=Je-m0#!8c2$WN znBw9dDEPyfbY6YnN=7N8=}+ZXC)rkD$*kG>uVDWV4EE>etbm=fo9-l^rF>oO@P%^M|``^RiT`b`qv*&vH#p4J5w%D30{x7_}xKsO?oyFq~klMiji#&#s)zIwu zi!ac~NnZCS-*CPI-OhtdwE!5$a47cTz$mp}g=2;apx`llRU*lRS!aH+89U;|)bv}a zjUlc8i2e8E!%kp8K8fewzj%7CGrB|t02nFBF(WTWu0!rn1(|ZtG2f+QZFJE2+7rL( zh#qtegs-{iVfJlMuS`+c63;Zq)P$X9*T%41ke<`*spW!&ibFdI)>eh;@O{yrV*5W* zUO;DB?|2B$nDOjy0D-?^~(|wU2ys+NeKps0KYcQb=#FeZ4Le-5u_$7n40{ z!(OeHDEhfNfo{qDb}nWL(5$p~t#(w8Y+iLB9F|+2TS*HgIuW1$k5O%95dd$b54;sSCNn1d|;r35g=hQt5^8I*b0wIPLCPUu9`ExmfHtt zU#nT&-{!@5#y^5Z>3KKet1yEqndG#`nMuJ4j0%HH!MnZ9B}*)Db@EYH1g)wR*Xj0R zZDe~YT(-F4lVl3KX1*fV`I^%;t77fSR5*r$6`_^xDIW{yFdR4vi4~LmlE06|k>wfM z+)vL%p}Z?wUzo=^G`|gGa{JzM%`>b7nrW-1AJ;E2{Em!RH2mWX)0N(W^q+&}q#Z4L zAq&+0*E1q1OH93cFePC$IfBVn4u>|rd2I+tHPZ;bsi5H7aJ@#|&a>?*T{bDt<{#Po zC(_l5`nbwdBNm~)wrm00y{`*v^_}vR7VOKu)8O8)Z5AaOHdQC36Sk*E4KCb#fK$>w zp@0k;__SvsbZ;5lN?8%1S(Q|1=HOi=PwxnvwzVHywL z_4o)Pmp-^nX!shgvqUb44di65u4EJ0}L6I{H!Zw{`;e=6$-*{4R@n;+AS zu*v{Yv@7j2($F53$VSxIHB9jNZ^?sAN!6_vn} zmo6_!AV}I9BzUr~J>00I-TBpL+WBN8AvBR^X63ma_&sU$b&MG3m2)%&oQI}?LO_YG zsn~SQqwtf{uG1QYceE>*Jjb6K^(?4-t+@I&N_nS=W~t*_bdSuTsXQCHw~!0OMWhVo z6KAwWj;Ka4YK=hKooQuo*R(dv5gKJrdO#t256mR5^70NzG#m|n<2HS8w|4L6T`K%p zSfk}28d(}g@8$;w2g}^pkpvs8l-DnsZ!V7^o0^(f zI@HK3=sj1!n0Ek+zcC#xb(idRhdvTc#Eyb_GD_sDIbp&$V1@wwg{~TXH9+PQ1OX&j 
zX=?^oia88co!KgR8tXNJuZp_)z0d;d42fKH-e8V^c@1ywd!Lt+ymrQs*X851I{16@ z%rS))dCi}Mi)|$`IJe_umBPI32N?TGmcT5k{a2U3fX|hV3dfI2*M0!Q(fo(uoPFfv z(M9~)ek?j=!5L_M*W5C6URcb9#9(Z6oy-;T$PiT-qKv(FQs1do2_>mP7lXT&V=^Ek{xT(5hW30$@R((dNb z3t#jSAF4@#IV!{{V}{QGu)AyC`QYEc-|u4_(hV!sT*9cVZc0>T9eD(kwX?Le1R%{( zCxW7q3{|8+d*Q+cgZT5$rx8WA66CaGtjX)EpRc@QI0VyyCHqZNWktESkV9kjD)!iN zN1Tw$m=ngYRTQ1|cDvg9U}wkeQGV{)SJ5s^1WZ}vl4c}T?qqAHa|1yP4yxJTDiS9zyZgiO0|&~D92%j z6%)ryHV92G#+*V5#|Fi!Jr>S*8$Yss53BBT8gmh0v|m79%wg43>}%;n;G{||(^K|S z##+5LWo3Q=mDglhax&PPtj@n|moI$ggAUt^zR*FrdG`3wo1R>jXpN2QK;i?z7l)X`a9>Bg^9~0UJuJ>E0*=?eqNHK}PCXmv%OF7ndx;bZF#wl&tpOJI zzzATBN_c@v#^-{Z$0JM55_I4M%bb}lmB3!hXMN_JCb6-7ut6rW4c zYhiHg5ae#xCck(+mE^EqRCZF4W5zbVI|*jRx@GXiJlpd}F^!}@BofBCb>MM_eAPwA z2YINz`RcbJRfn9`D0qdbkFijzaf!3-xCfR???F!L2}wc33=# zvyy~FYl5j_>JhMrnwues!Na{TpKcqVs6mm09-7$;Nq|fRgA0R- zMc86NJ`rbC(yDPXC)*x)%4RNAFws)F+i8r?&Ls{!%mi@&CH0(IEJtz&tk6o(ZA5_= zNK@vXi+f|NAAs}2=l{Y>s}ekVb}$7IeOQR0>)UbDx-T@Yh)Fc7*|q(NayBE^4-bJC zm#>+QBxF}$8>++oMvs~N8V1k!T+N8;LU+D#wA8DRgK+24(W-m3x6F1N(bl!dHEJq1 zmKBu>5}u*KO&N~as7qeW5P9n&?I=t$RA}B^u(F#yw)0y>870IRyYbU4c7mKv2%iK7 z;Qnt4;Srbsq;Jc4@$Id|c26SrIzcf$c?vLq{*bAeyHVBv@PC*VwFg_*wQh{k09JZM z++*uUL#uUqyz@xDg!AGfmvL9~N98=8LnY_}wY>$GPyx>DKC$02&HwTb4w%E-kwZ0-Vbk>o`&|#}$^-`|}m#?MXyFb0@4gtOKp=w#QOAiobF-sup zT5JfUa+!bImpA4M%tb55V9>kb2}ao=n_qzPjJGbi%0+4ryUi=d4je)q(h3wnN=nHw zquAL5M%#fgw1h)}?@9oHvMX||@X~Ky0pIHU9v?dQ@XRyHkbu9am0C^G0>GU&@@n5J zrLZk;b8JmYrm%G*8 zaJn@Pke>XEJLNfYiRWeIE7xDNb#X}Wn{3zB7Nqp*>$>vCi@11c+wn(>ETbCI76wuG zoO64VlEyX1_a2OBCTbAlTd2Yu@lLE0yD<5L#eX>?|MZw>z?&05e+X zyf)ZsK(BX%VU|}VSDv#r-bIbst2I@=$mSeRfET%gdGyRngA*l7%#i_o7cN|Q2yVqA z$W-?JcbL+$JQz8u)C;I zVWSnt`-Zwef)KLp8&<5L4j=83E|n`aw+XO29A~%<-S!5KcmYd~U6gn$A@-Rdc;T1h zJ57C`Ykr1vF2E=IYGzc#XUZrGkIaE)N&_liS5qI9F26F?+OJk$VVOJLfGRIS1JQ?5 zs(#~VF&8!9)6gr(xtI-<9O{pmMEiZ6DxR)KoQ-~f`I5Y5B%WJ>vN@qeCMOsattQPK z_xOX5gtu+{fDd}=neVRLBYWKy?^)YxhFIqx?L!+Hgpr*YP3zo+Wm{oRGKsjrdO>!u%H(nbzmN`;%zRDozz*~+jB2jqr>qa)s(rt&*|K|4Xm2B5zW 
zu<9GGfIV#kH6{bMlWLx*1@`k8t72*PRLPnrT~S~MNUOf}YXhpRUPd14%KZG*GA5t$ zn8)X1!Hrc0akYxgZfB&flpLqB!VS%kA}k(LP05K5<$f=q%Q)2V#Mfcpr|Wch9-Ej{TvzG7qj6phzh{&ob@{Ts{lnhN+yQ;H0fj6_-()I-WQBB0F=2tA9Ab#tz& z*%TU<{GkO<>kZbecjj`@#aqiBd2@o~9MA8%6^-l*3XklQ-o0yL1%nSj2#WpG@PzAM zoF@P8jh(ZuLSw#e2;8dutJ8eWzcV zn}f0)5~SP3ur`^tO%*6Q5pZMUx>;Z{=woQa@ge8s(d~0T;iXauu$i-cKqr&?)NA$% zVzJe7sFMzLx3Huwsk0tODq+cykiif_U{4tA~`4hhR5~P@t(Qo<9&r+;Ub_IX)PCI z{t=tUyYQvquZ&e+dDOk~VWd)cem_37u|c z3l`ukr=hI;D4Vxtr~{+U{b6OvhMw|Pxxm#mN11c3qtAoOFR0n8h{br-pQ27O*;rp# zTb@no%IB!yJFmjkPT-S@&v$cy9xrt!ZVuV#ncX%L_ifi{sVLGjp=4d%ywHXoHRoe5mlo*XD6A*c2tt7esLTBFy*7$AnKjI8WVCS z5v%vDfxh#tAu%a1)uM_;gL(ZCFlT>A>5Rc+s2*nSIqzqg(xReu!AMij=u!}n(Fbm)r|r#jI~4$sxir~cyZ1Tm+VK$1<}S9+7DNE zIgK6anybB?j!S}9q^B(TA#nS5;stgrO|Tm-{*d&@b*YwuoSIF~Lk8AH$^Zz;LiC^# z08cMR@1}ILmN*|^rBQDatzFm4F+lr_-N^bv)<-p$?Jzlj)b$G$jzVjS(hd5)HrmihYd2!0x8*HM!O-{}MX zE^E(GjX7FNSNHicAdkBp;xHVk2P_l!pxX>U>0;cq@LP_?r_~)R)m`Z}bnnjr%S=46 z<^YhVfNByA=Lu(>$A3TwVE`-vDMf`D?gkZ5L1Id1YRAi@wy6bmOyWlBHx*r3e^1C_pz5UcJ@^cL_&BgR?q8MT|>! 
zT7#HQ*Hrk#Gri6AYYUm!jXx&+%H(3J?^QanHrfLK)asg%WL_n5pil1bir7e9V^NAN zJ@r|HvH7Q*EP;c5Q$(tJ_Y1H`3%D~Q|9NL9Q)md!BEHrOeXJ|<_*8!LYb3IoZvBmc zV2e`4sp~_7*;BgO!#wdChqu8nkyLZlp(v&f#g?(VP30Ej#(6w$gC5nI`MjJ*H3jp4 z&?PXy*Iz7Z9vZ~u0J&`}}?q9d~&sT|2 zAYsy%tCFSwm3_a7Ep&}BFA$GX)s-wIT#(y~9|noa4UQQNkBP(F?GXWxvQD{FEXK4YOH7v#d7nzQ?6Q^FsP6oai!>mdv40$3Z*msWtJT9Jr<2w}i} zHxIE~nFerPN9}rowlSU(Fjc4%=o;{wiL5gZ{U!{Q(dqgC5w@o-bi@M5B%`P;3``RgcR)Hx$g-#`bQut}4%*nFg0+C*|upSMVkRl03~d z$7j0bf%24JI0L1NS%@00m(fD zPCoZ#`%QL?|Heae7X{rcJVC_#Q=HG>0~un)}MkG$zkM9JCYNY5vrY*gdCyHv^y zvOoJsm{*=nMT%9lEw+@B74gyDanzxO(VaNQy;WxO8@tT5fAYO|;~D~k;ea^Ww6e3- z>CCFefZ_LWki1mJo&w6B1aekh-Z{Jk0_U07rTH5H@2G$Ue$3r2ULdAq_@TTM$4sDz zh=@#8b@hQvOqYc9k}Z}OnZfQa+Jdj*JLOjMlQp4H<-N&#Lb6+j{(n4i{*3gT0ppo+ z<4G=E>oE%3yKLoIFLeqIZw3t_!wiaEW+=wxw6B@W$d-)zG{~JSdl3u0k0jBp>eYp) zbR$3N?}$^(O1ub@l9G9jGg6_%R2e8AJx77KiFwxSLz+XDSL-CXn%eF&%uznW~-wH!E8@4)zWU z?mC@3{pQNOw{btvE1oMpvl2Gr$%>Cwa76v+$dksbH*Ru4?I3~oj?b^~Pl7ToglMIQ z0^OTM0d{*fa61P!pA58a7$yMSzlu+H-@b$`sk!F=BC=d#VzV1;F01kSa{TS|o?>c$ zIcCFFPf@RpLM;Ex5 z=gFuR$Tq@iO$GSqRzyCgZHUd`!UQ5uoC&4yAtf|#E4?h=1>A_a8b`72;75b;L0XP@ zA^|=EZiUt39%57ZpJj=^zp}Unveb{X$iORKmTuD%M;EUAuC+Y=NdmtL*12u`L1%7% z9*_db4q#3KI6c4-e#}9mI>99=EFz+_38<4D7L1CJy4o!zsH@LH1x85A&lc+qB*Fp1 zKotEJ7!e_E!o$O}A7`pn3NJ$U;Hw-LLVA>AfvGuDx%ZE#N{%C8ZHdN{=^#b1lX2{D zV+J$v?Ht7?a_R#{}iFXQ?kbm_NLRg`np z$?%*B8nR90VC0HSfuYW)Dc$AQKQ$umT9e9fGVD}ql56qNX+BUtn2{=^HvHsmH`?pj zwA*18dZ1eG7XI{m(aWT!# zcJsb*<{jf=x{KpW)h?fo!7$U|c3+mCwcz}+6jge*-#SZym{ul!1TNo9|0l-%yA}R4 zj}-{u*-G*atQ4wa4ai$PKs)As=)KWJ_Ao3vcN4eXVW+9TnZ)Yfxt{s)393%*RtVKi zNoSQ?pDzd1Hf6Jrlbluk#!A&QM&>bjZBqXJl)Sz#^-(g9)gSdCPyKWuc6UyahA;MK z4zpc-Id#p)#%z8xb9p{XJpG}QMon6!dG-Fbg>z;1^x+}mG5Sg2#D1v5!cs+%DzphP zE1g*E6$devCim@DPnj?88(#|V9hG%-bewS6+gkN@ov7mJ!~Q5xN=MSp%d2a0!6sVvXpkE4sAxNSnKD+w^JswM^a%beiEiVQmIKIy_w% zT2PugI=iK^gJTMgaF~MC_!TMKVINqBK9iwED*NL)Sf5qCaM5I(LB2=uUhzcA^0$n6 zbH-mc&^!pcrPFWC9Tkp^ds4#v8EICpuQsKtCKoGM{eBKp{}^h)<@(I-S#0>Js92i| z%WYTDzR?;f_LSb??lMqU7`W 
z0{;fN6IzcF*^QJbsKLw~w%YMyXEk+oB_S=V1t7WOlZ3#)FccC|t1M@fXw0*jXH|#u`W> zmE?lho#{8vAPJ8v`_~fWf+}655A~3Q_T43Y4yR9$DJNLlp>fDI4ul|Dgrw_@`I5J= z`98TI=I1S-ddTqVE1Q!wNH)vtlY8!`WKx%U-9dvA2OhwN>TlT69*adSJ-vPBv6vlt zTGBfEizfPWT>{NjU^DT=ygBmuqArBATcCLl6y%$g=-0ZFu2(#J>E!YF354{-v}pA^ z_zi`is=L%!176(7Mxxd)DgQT5H=?~nbaam1ZU_T>aE6A9B}RyM33zx=5`UxG=W^A0lCloGKMaaG&2oy-H;%paeJ7M9h(=J(}-S}KaWySooDm+Z>E z#>%nMP01ou#gMdv_K~GFNA<{q&rTp z(izQib|kBLk(ByXx)PzUtx>IPEK=tyX=Z(`)6hc_5NXC9V5b1%5UYYoixsogtc$3xtFn;$nwoSR zouJmXmJDMZxVVg!yuC)>u~X>d{XrXH zZw2V~U30W`3S5Tv)z`xXbu$QQ6u4oNm->{$um}6QE)&n%c@A5mLP|f*ZZIGT%xVLl z;BKLiDU4J?LQFz}z4JHuP5hH?9eXr^qdD4tMr&+m@m{~;prvr^;q_FHMDxzpbh`rMNrb6v{{isGUi6m59BZl?8aM&_d-iE8ho<34upADbiXHlZ0p-*FM>a5J! z2n9bI40oO~>dh6~DQ`z~8aN9f4XFFo6cVupQ{dv90qdqKyp3pY60h(rL$HB_aQDau zw*SM-FW((WV%jwBC9Qbvwr3Plr<^`)BP|ErHtMEW>DQJqeIK#8gmr12>r3Bka`4y% z>o{lMGRt^*#*+%C_LIp4d1s9XDy&mB{NNe0f1F5LNT4Q z)1Yx8duy)&tlt`S#@Z&7Z_mcsD{Mkjnht;KsIqf&P(|4s)+RLRBt@qPZ?8|@(w;}ZI^kamR%m<42E7)f+ zmuF8-=Rrt+c~sZ#vnUKW3buuRUfa$+oElZmD2Es)Yxr%rdFGC@nV@STICpt@9($N9 z=YeUV6N2sIqvX`MQKwxc$)d33UdN~M%>vh$wAojVSZ#-2ohP`7Zh1gYvdk;Z)jhgE z%96JPUfb?oNt6AzD|m$Tnmq8tuy4bqx4}nA7uFA*Ku`DsSzTT26&22|JTCmb3xt(A zSlc&yOPTWW7L*0@5RK7>W`atxDur)zw2bx5i}qE0@yqLc7=8Dp%qQqOBO}Gu7*Br) zExic>!_$+ip=_Suxgq7UGVb{pB_6n4WuSrrXFvH*4__r=Q0GwSEP0;; zUQtmI*W|GG)TBchWG3SuYpSbrA0i~_3G+nCNFUVOL_nB^SB?{MS9q^+UNa#QUvw?2 zg`Sqrnt&J90A5%SJ^#ZKF^+tPiQc`6B)D6JT-tu$X&@3#`UM6B8=a#-bgf!`g^P3S zs^8n&yAmQ`e$6|rzzP9}v01xID=*_y^_IJph_$_CoK`}Ad`^3a!>|7KW$gyr)31ss z-3xa$y;&c```0?d63!`U+(#4!MKo5dKNdO3?~Vmv&#+>~{IGfP$B2UatXp@pjZ5q`%RNr`ThOKdpC;cmNlCJX8f=>G+1C3>@1c_Zq}CLAnXES*X2@d91!VD_(O0Mtjiks2Vtt! 
zyMab8sva^fM^_{v1ZK^N(B{3=tN1<}VFQ*_?vu}8ZMXoVt>%~ocJqXwqT&@-(4J-F zjVkuK!|cfJ%)XoP;nROUeBrA<5~Hf!hl93WGCq&`Xn}mIXsi=>TSI_PWM+*;T%*>S zub+A$N5lQ_n`e8eV;Fvo)Aj?(q}TNFr%N=^4|C?T(WnmjM&D>mR_T?!4;iJW6N2Ig zsFTLK#O*5pDoOJ@k~IE2?wH!Y#4oUTG4C|-mNndt&yR!~rf7PPg@?oWdiAhcT6q;@jj1HLulpqv>I>$l>*_@a zb=nJug5P7nPC_Y1FRz1HeLR4Rha)2+IQP&?EjVbV!RRTEJp)8>`14kooI>-MhgbtAx?Oi>-wvWB@7JhTn*`F)k-Z)mK9&xg|B*-QaMAxT~50T-pa>&n4o%%23O~8C(KRQ zpO=)Cy`eGI@YU7lhHAVI zmv!vQXRM`9f=$h{*WVz9Tb3PxO9FDCB0(G{lHg~R8=m5{-#j~XMW|)-;zkKzV)vjj z)UcSvO_%*PH&|PDi^Ja3^*7QOrt&ht`} z*123VIas-VRhG4p!=&RJwY1YF>LSKQ_z>vU9_tNmgIJ={B%>p?h}iD{5#2mgU}e2y zTL?gmey!}^Lfuz!lR(BkR7TXINT!6C4XWeUCFw^<&7x}k^xb9(1g2h&$?|?Ko+u@& zurfjkjwsYSl2-M%YAl2vto5_$PYno zh3=b~y5x7H?2dv!mu0OR|c8l3xqUMb1Z6sW$eS< z)$#ee8%veDAe0|IAa_s+-Nw+F=+nsefW{MKe>E?6wTAV6z(woGSKQp(qsq{28Ict< zK9G6=X%^zzKyMzmnzWExxxC>fIFbys(_IroLoW&;Q}tf6G%X-E#aP@#Tzd6YBzlszLy*-=YTT zCB8*dSs0u6WB8BIXt;6m+iN$uZzOu@d^-Ntlr}=)IBmt0dINKHv5{P< zHc?}P5>M%xef7EwuEf$t@rC;!4Jtpd@yfJKcpBZUtZq;4u2(7R}GT|8Qh zMC~v>smFpr7p#^4SpMod9WX^L{M;>KK_N-0FA!wLL8i1k$=yLTxZ}LX zg(RrmUR;W=0Kr}mhr<~FX_5+Sqh5%Bfz{26D*!3`C68xM5PVr)*j>KsHal>^AiNv^ z3{W-HILUgUw>S*{Kz{;y=&DWbpbagz3(;EH>!S;=Dm4lz2gvW)I5O%*FYahiOKUEsGh)uT&W8t2->5aFy8hpeC5D4nB7LaBE=Y z?3H;3WujvOn^uy>Uz8Pa{)*{-@mPPnydU-n>OGs{7jQ?A_p>@AW9K&A>ztJ0S$h)Z zI#$5ooKWPD88^+m1F5K|!`Fy}_t*(Hdjvv^Q6KqgKAcWwbZbf2R} zkv#vpe_1-@@&&m#Ffd>U-V*k$2-VTu?Y*(HfU2>KM&FeZ`EVLHJG)HV;&5KRr{?Xm z{ojD@gxBsSu$}?Oj4?w?KKzDQy8ZB3ckw^80AOb!Q_HFh3W?N9D7ZL0s1P#-&egFn zd=w19lR9b|vaQTvdf9!y5;AoOpxbZYz@Zq2(*#T2!{(zvZPulInO()cS=%l`0(iOL zoQ(fmDE&R5Hh|9+Q;>fLEoU4z_YK5C>|!H&(fq9iCiBMGgGCMZyJAM&N*j)V395Zw z&AsFK_B)Qk-}Tedz9Sn-55Yxop+(VgeZHM{cC%-v#;qfonq<$^?(s7S{OD2gMLLoo zH;LH{%k%&4?~ovflnyiwce8{m_jkVEN2ef0h)bsBRog&C({{3yex+SKHlnKaVW8u-fWP^IDm?DU${$P$gjZg`#d>_vsDwZBrh8=&*HrzV2M;)bFZnh0sVFCX z2kx8%i3GOSCYLYdTbRBpJ{X>f@Ekpmeb*z`cyM-Rtk;FJxss-Mp*U%j^NsU{vbT0_ zR?^{{=PPB)Yhcp%Jdl>C2$eR(_-`uF~= zRF=w?r4re*g;aJT5<=FouSv3n?1qT!WZx-kmNB-mGqUefma$XBU_?wz_TPKzF5P=S 
zzx(~?Uawx1c`xUE&U2pUIn1!LgLV^?KI~-!9@**cR>?28g4WD4(kXPiR(l&H73fz} zC>~oKB1#l&PW(8CWgl0b*^5C-d&pd2ey}ymeKDfr-N=ac z8tEtz;oTqxsb&NPhce+<{IO8~<4Zr5V{(#ftvMuLPYYBX5`jSbTyQa6HuwO#=F<*B zrbDKPqd5%aaVDs_k6Kq>pY+zPTT*-hBE6%1UjV0O6z;QCR6XFGedAZ~#*d5j6|u+N zzRRjT*E=eSva1BNX~$ylLh8Lj&9r%>5jv!r_Wt@}`3&*s8JMl7p37p9_NYT*I;p9h zWbu_4n#DaTj2J9;OLr!*@l{3okOND{4pCJcQrK}|^ZlyW^iq9Eh%*ejcP|aMHL{S_ zQ)h5a1b0+7B{X}`8}qfNd1x;>!{S^^EmKKpBE;Ot&BSC&zL8Zn1&zC538yjpEX{Va z+skB7QzWf?g3C&dsPIr$Sgw)s*rN-CNA@I$42Gxw`?dFKs!|52BA)i6PvCjv8wtSw+C^N7qIfFQ_<_Q z@qMrAgMDo9MJX)W&PA@Y91M0z^3pHS#RQw`%o&P@3>5aNge&`xd;1&e`|73HaDIq~ z-h#;4(haKm@c46)1#!t>n4oOSxx(4*sL4GYw&k7UWMuyHeN)JBx~(aAUP4V!rEvf3 zST-D{G(5R~T(l5z_alR(lTN^IM6LqIuhhohAk>8Iua~b(Iq}Ebk7tpO2omr=B#6X zaM6PCqWp(Wt5Qj&m%kJq%~}-qh$LCtKd}kI*Te_H2b1FI4R7_@>A*Ri3Ir`*F^h4Qy;bB(}S!UVjn_xzJB3c zK`;g1`kJkny*UGyA5IB%Slys`6Q?v#6ZidUt|3Y5*N@*_A6XKPm3Y)|N5||j>7tkK z{Er*?*9w-(gSb{jd}C4zgzI@H@RLLLOhG1VMX(RC?npABk>!0>uyGN#h2NE>&nx49 z;{WT)R8lN;d1wTm&~AaKQ5JK*;j=4`lYne~Goph(WvgMEk=kV?|HcW0=!JBQopdFIr!_CArmNjMcAT8Nz7_5*IRZFU7@v zTM4sUKD3wB=eqFzS{vzvD*lo`f4N?+F~6X8bfoEY4y`ow=#JIi>m5PdQK;iU{wJ`lEJS(_7A zK95xh{ddkdnFdz$mh{I9kKrrlKtBSQ0~YZJy+6(lR$J&Ez}sJRBq@+GCk!6dM^Vh# zA5@x$dw8CgcfeYmv-p9`{(c5aHA&d~XY#PmbniL7UiBM44snsy=e`x+Ur37<3vs4B zQWAs<^{AQ`@h|4$lv#8i4T@Tx-`zZ~OEs3Z*>l{$>$V<|^X>+F1g z&82(Df`>08*O7#8%ZQDZou4xHIEt~Xdhr&b^d5W7)!r4VHFL?#q5dGH#bl*wS~boG z6q(i_*%F3v-ZA>Gm+KgDp9Ba%$I%`aW{W0rD4q5+BLTG5qqgfw@U>0vCP}E zslKSj-Lw@>RD5RPF0>zgP$8CXt&XD66~{L>l4oGr*_J)woJJ0p4$NEDX&ka!b7%lG zUI-$j%GSmMhX3B_iu>SmCa5`h4G%CijWJ^G(NQF^A6F^>!|7vgG|Cu*Vpc6UYl-#6 z{YU41{hM5={O%-PV;T?N6`NYF<-Nbfe;0w~PJR5Pnr8-1`>uz?c|dmDHCY!T(JU*Q z${ynp&}wTG<%lk^_Ijr1 zuV-b9i|E{C1F0*EpUtT2%jF{;)*R32X{(JKT@3Xo_^QL_kKoiDB2|2r4#T0Q1El8J zrgMb~0~+nk4dP3!jHnV5O>BFqusvrV9(c;P7VqCke&6_7l8PMn#jvdKK#^s0++88f zJ&EQ)=vSL{lhCiJU-m|~_AD0VBsICF^t)5gC!J8WU!A#hJ~tkH{iwPVIXwVnS+T0Iz|h` zNY};=nr+gpe18kCn|lYU7S?!DcPhrMizcU=Cpbu>ZN{RHQEto?;bgRgu(dBJm(wV>Az~Ih? 
zwfSC-NOIk)q~^xUP@cyn_Ob?6@v@tSf}i>o7kgs9ty(8rqkUx!>K+up?Yo_Q%-|oh z;{#eHl2A0*iM__lQ$F2|Oz55&iuz(JK*%oz_xRj>9@p?lKE zWP(FJVGKE0bJRpu9gjA#*DdfUJXA1ChI3AH!u;81b zA2wb%a4Aq42HO1qv##2KrCN$z|q$7_@|J>~|HP5QP){G|w;BA(~D<`2e|QzP1HKc_upBzpX! zvhv-#lU3VUuI3p~Rh=6qVs)8qcY2BMN1+lMbmHB0PrVB9FL}Py=o`q^u{yS=C{Y>9 z0L8=M{D}oLTN>|5@`E8hhfYGML?UZ>NwK&3^HDv{*OX(%-TUeN{>Y##Sy%eUWHhdU zkGVzon1{hvDgOIo>Jc8%BF<{)%H82fDJwp`z z#AA**lzfdJaFc6F7=vv0Z`LXKd@v1ho(AoYb5`ubG`ofFNLG%7Iu+q>p}~PKu93y1 zljGX%?1kKO>pA=+jU1@eZ%V^7^qN1eGpJ^3j0x7|+$b%C!y0{!kVbhr!#ooCFfU!! z3;eBLsQIV2>PURLJS)EQfMyDB#>Jc9L|5Dc`$MZ1k@9PH@@skd^Qb0~fYUAHcz?1E za8&G|4L45z?FfU1W-_k%jYaS2wls+}wq-=%O)jN5Et#b9O_Y+q0n2-jMY>pQ@ozUW zzb?=_!!}BiMn%K5*pwUIBw#E!k(Sw0J3FUgObZv)T{hE%1dQI%jHexV1=}WHu^O@9 ztXzP`^P7e!bM6fA=uXU-dC{f%D$uNo@4TlXwrnbCt)0q~7Dag-4SH$r9gd%v>>O>H z32bhM)%rrAF8AZ#LOyB=E5g@A$doo1&-)ne`v@++wlyvrq*Fen7I!=TZQ}V{L#KPjh+2(YK!SEitW+IU4 zK=*{aD;8tx$LCNHmz?P2DtA|*xk6sfchj(A=upNdG6RpZo01dDKP+@EC+a<(T*%g3 z*mSDaSG#3KWZfiv-M@fpD-9@?#KTlP*4|xhYkWBVQa9R*AA_gEPQBr}sJXS*4nD@S zdvVAAOz~td{s)8sw%t^pkU?R)SJUb!wp<7FhPeLuw%U>(4>GPV2L~ZOc6{)Vv?XrD zF=Y(x0avocJ{%Ek{qnflyxgG4H2*nT+S#w4@jbFxXQ?PI2clqVgIX7N99ZU=k8d=g z#Z6q&4)e&xJ@qFRkEcDR-{2kM%no=JTD`p+VIF8v8X`Wv_L-#Q;@n(!j^D8j@Y^(H`wrbKRPS zsL%&l^RpGe&g{sN-;xIkAXk9aRsT$p|9RAZ-QuK+gk#`*Uc8C|)aPE`Zv3l07Xlbp z?$OhSLrG$`@iU1@iFaeRnC~tes^!@us!&3C;|mR zkcysMB_2*YCiZ2gt%|{L=8=jZ|IL(r=s4rU$wXX;MmakJ?GD|!EOs63t&y4032#u`xw8GVKNftbuCnmeZ(e67thEF)d&eN|2Dj1Q(k$DT0{;51QnZ5m-Zhqs3 zP}`~+$nB)i2RcqhGD%K_T&YH|Kud!MJWZr5SzqWTSPZ;opZPEW^nkLs5zaqXVx*?M zX^4i!Zu+&;#5+7Z_IME=_Thj(TaLT*@)JK9~gaRr6G>oxFE(!I8y z{cC&wwgk;d1o-8GKrbGo-J{f$C;lBns5yT`13BKr#=!GbixZjnObM&KXW5DwxG+q6M`T{m&|6~gS5Qud{d#ms|k@?74onLCBiXAyke+l6Px6^{03QRmEXk9;Z=d9lo|-VNu*Y# zOiZD0b9+kPdAY{!r&bWKsaAwd)x)S%|Lqg~f^q!((v}&l1YbbEj`8v7wb$sBBb~0s zf2WG~kNMq!8|mFD6Mew-yxaA*5xlJy9Zb5B_jz0-YvOf?bMiK#J;Yh3+yr~3T462? 
z3Ez*2AAIL@(Kz zeZ4PHT^aiC$L{%U8Ujw+kwb`cyF_D5|9bo1r#HV*PK+XECL5Gv7Az5Q38TX3x1e8z zILqd>yf*MnY^yaKDyKdnt_6xm!#;4LCL29{E+mwzrSAp zysUnXVD~T4>7ijD+l61_>i9R3U6pZLeb9z;g2u%1Q_+DnqALMvsGqO@u!y!vtX`BQ zB{t29b0z+Q&ej6ml+%1aEk2aP@Y?kWsn-f)+{eq#@N@3;D={eVZF7X@-hCaHH4O3e zVef~t)#S3|;2SiLx;GifTJ$+eY0lKFTHw>ftUfTBX2!y3tEBizf|mDXP3CKSYojos zR@t?_tR98cvcs1=VvM=`hj0hxM&Sc~qYPHFa4l`4=zeJqZ3nhQd+#vN;Lzxa7dRt| z+<0RJz(CW}gd;8!q*C;M9C1Ey#9<5l_LgAnrbr28bcON{c+~aF#E7AZ$*fDq1*`pD zmr*0Py;sv+AZ^3};l(;|vo5Qua?PeNx9GZ={7a+|AOe^`Nt^{z$y}?>kJVK##3JzG z3L8o<#4^41WX%i+6&7~EXg!R8Z^pEY`1e8iZ_){S%g?Eh>9APW3{41s#Vr!DWaAi zeasd0+_I!&T1LFw=?RM@wkBOpYJ7BF8b77x!1C#Br?#YUI@J4k8P9+|^i|;~s!Yw4 zH*1FIbp<;}hUP&C{8Gc3Z$?;z|Np905z1$s=tr4^q-g5wj}?;#D`tiQT}pmp<_v&4 zo0I{D?HhqJN3O3TL1hhWFF64XiFPrQO4d;7cj&%V25tEZn$rA{NX*((+uWd2qW9(n zVLceDQ|)ns`n>YeQC7#2A*I|>O!B&7R&RnM0GZ1Tty?SC{J&x-<9lS>0uPH~mPlRd zq)ZgPW4TsGRMV9A3b}M1?Ikvd;v29W1LKA~0ByL>8Nx5!yec6kT=@O&8r?acqh0E| zd-c9Aq?0p}=P9prE?aR%Qo|@VW3T3!Ty8AU5lxTmY0@K zGJG)y?ntNSM2uY_?CK7cX?kamQ;dy+>cMq;XGrp&8PeZZ(}Dof+^yj?3Ii2%Z%5LJ zrgxx;`*CGuMF|#2PxK@Iu;45$Eq#i-o4s+3xy@(xQAZM9Uj5s58B#!kRg2YVLiA4M zKY`5(c1`sC5{~J^pQq+QI2{M5Qk7;BcjLn>9zLv1mxG*N z3Y`x#PyH6+VVyqHR?BUMVr6roxe9|;oP~v%vNP@mw2}()cM8-bC2rVSF-*8@#k(g7 z>nrr}WNJ3~a%~9ItEFQ676wkKy-lQ#wBiFT_SLlPH~A5`_o;txV9{ap9X!VhN`OV2$^luUk>MO z=yAoA48gflzLZ>C5mB{0i5)c9lu!EYg2|42sjS^FLM(R5&hzKm-h~7){^7L zMC8ljQq=$80^s3?nq!?^(V?Ex0TV43=-Nl>{0;Yxi${#(3%>s5!gwE(xs6NVNAoGd zzw0TpNN~TXW>|h9aHskexTLf#k9a^N>;%g^S*D;i|F5XTbwG=Iwh^?r zb3eI0yhW6k@LWr&5jc0mU8FI7gJ`Phs2QZl)nYk1bcv`HPLqP}`Vp7kyeCL#s{Twj z&d4+?4eb*twaxnEa$&teykqQTZnFsut&) zek^A$i^dU&UHhppzlXVx>u)87ey%k?_(Id$V$4tZVXarCp#s~dYc=`3y=ZW`#@vcZ zcZvFjvc%A3Wjv~YC#pu-!zEk>C1SE^_9k=C#WjY!9=AE$%gl({9Ci%$TzK*2x!d8g zbd(Y^f9tI1Yv%UGa>a@4rfnaM#*+$WIf+l7g#lcx`<%YbfZemVdL$qayd(O4efnLH z8upLR^}kPeIDuH?zEke$Kv?e!Dnr5C(RqK6l=AdX!z{tL2}dw$qdGV4D0}UqMo({x zfVchhM)-?nUdqR}QjyHL-Y(TC!kLMLj_6Zx%q{K^Kf6xskGW`(0lXNVlHbi70|fb2 zc>P-4q*Vwzf^g}6thJvWGw6kGQZl!HieXF$)pjAO(t+DIk30(3=0Da))g{)<9mu6{ 
zrMF|0W|Gi>2&pqS@G=~yE|*8Fur+!&IlP%MPgQKcg?Z!x6hl;)RjA!ZOY-+$ik>wu zAJ1qaHMA=$-`5J~*?6xshBJKc$jKPFwZo(|T(RzT&{j|Su4Ge`H_{<@f+mYL?Aer8 zJbjvuR;~-GJUOyPZ(5;l=B4P>#F;8SAAXx+H=E;OtSc(Ni5Y<3UOW=4S){nr#a}y> z|4LhLn-&qWc^d2O0u~T#4cxNFZJPi~Fx7rUqlYuV>^oh>h)RL}IAFcbDBmkw+PDnb z#RKj~aqG;lk5h48ccC+ZXkA|M#}9_91$DEf^i-d|voD*;P6%UhiH^`am0MFNe0!$YGzJD!cc z%p9xB!KtqRc@gm{-Xo#Jgl}~`duZoGSn&3twxTD{1X0E3aEgM+Q6H*Oo>n>vnKKka zvXGReKTAATJ%HeAMqp6`UPK|z*)_gf)a2D`hY<_PkFKhtBu69ZF%6kC#u6B<8l2S9 z;od9!Z0!S=&Rn4iFUjz$N#2%8Ac@AlD@!{ac24E>3ew=!Tj3k}w1o5V9xPj@{6F)? zzeNjhrY#7I%TdZ}00M`xotCDc0I`2>xFB79`#mlurk9`g^0K;)h1PuZCv%;CHUucy zT)t+{m%HOQtzol59}V+VnEe224iUF`yR3mt)E=Dl5E~nt%*>W@1^qz)q(bLJ4znCS z1l-B~;NAs#{YbGvmEN#^V4R>!ym4bz3iN41vz9gmN1HeQylJKilp@cTHz_UJ%<7kK zFvC8tk2$yp!qBpTJD^*Og(}n(oyYAf8s_d;Q}cZ)TBUrnsMlm8Gv*wFIYrrYO8IfG za7nBLL@K}GgQ3MHS*&HW)9p{CnsQYtDQQWK>%>9h8`@!UFQ$=^CCl5B^M{uBoZ;a3 z&pO*m*w8qGuQbbVk-U^-u_Yu4;`^R-NX;}ZSLd66m7`rX)9|b|if9b@q;I)D{&mf99%oN|8h|e&A z3fNJzrJHDfE7gJV!Js&MKn+{uyoJTR^OtN;yv8a&8dU#{d|Ric7GPP%~hQ56^E2B@^Pk& zAJ{EP(CHA@;JQHwB;gwBlDMwHkdliv8lkuGVtZ0mQjdIkmwT@0o`@}3LVKG;eo0^Fx6w=z-|fm762F@P zjZT|p=iGbOGM&x5W{`|faKazbtHZ}KIn@Q>)$WHk+axiyE}2KGm|VvJdF~MmX~qds zpKj+Niqn=jJ01&BYTC<$ob|!?A7K1zPLT0u1k?W;`272UKSiK$(F;F{AjF&{UHf(= zkT&q=#z`-%A-HZzvV~@;*~*{Io&mu{N=Iqu@FJGrbQ&bMSR?qz#ok^u$Sf%*r2XdN z^@lHsfwU9Vk+kQ+Gfs~a0_lqA9lI^0--E-Vk3-#Y)mqG!I?zj@-kxXw@9m`3MrWk9Wt72P?Z|}Lq z?rUOchKi*hofuOww}EKVZtp6m@uWjW4xCp|l5maB7WF0fr=C4e6c~fX%Z(IY#vkrb zGgg07&Sg?pYYRGfl&9cck@8fo@l1ZEmlUEE5o8LfgJ{oBzbK*~t)rI=kmb_J@!WgI zC&_cG2uNMvv>c}hWhZ?SSls5Nq+f|T|BN$)%UuTuX!9@cr-1LyQG1O%Nd4KaTS(qI zp^rsvS5JIaSf8%P58!Y(VW4)<(T)HUn>vypKqaFACNb?47MP|H4M^>+daVI-(>!A@ z5R5p_w9YQwwAYb_C_#Z0;#0K)ruX{N_Ebv45a1tQ6E~dN zsxG$7JOG%c89XQnlZJh5p23c$VR|jq){tE1Yp}uoA?hvXeCv*LT&V5Lz4p4f3?6$l z&8BBKHl<51X_B>8h)(%H_RX1@$f8mjw@Q>`;bFb}&knE>sd=Wv1wswl%-+oJGez>> z1>C)J3iHM&%L`(yL~$B; z3R&-L2v{PBg8H}A#*Y3uj0}Z$wIGM$E8fPr-!%`MA4X5o8J1#!0PHY^V$RUb*P@Rs z`>N*`wiR==KYu}hz#Bg6E*~B4nJrc|2Q&Q|8KXJoL|cr5^wvh>IP!XHLp&{`VUq7@ 
zd?v)>+9m4`GoUV%ly>lL6?5t@&E-_D>RQYY!B-7rvz@VZ>`b&T^%XrCSWOV1QvC_?(Q~<1n`c1furMl%j?3z)$TxQ{uHeXCS?KJMqBx5 zaCZ4K5E*aX+yN5-4x+(IRgB9xw9CkwA=_@~~a0zzWRk>C$YS zwSh4Sc2A65`ZKZa6Z5;%nLtj=l@Kch`U}FXE5w?X;Qt;@YDM~e6(Us}=NH|iR+al= z`DBVAn`!K5J@Yqxf#ML6`@Ka3iH;ERx30AkUPeUQt;}Qy1x~4fG@`1#;oJT$4Q?=? zmE~wE*7;=5s;Rzo*RIn5zqAyxbX1U{8g($);t=Q~Cz%}TNpsw8yyi==D@}{Rg0mij z$#+L-68G(s)qZHBdCPUD`X@2=&)4Yp z-0$@z0*Kf7JBYVCX~MOo$J~amuK%ed5;m_e+{yG`V{n)X~8yZH`Lsm7J5$5fb2sA8JU zbl)&Vu~`1V7X;{EK*T&|Q_mzt7R0(gmSm2+o$O^gqUgCz{fP6;)~bz0NYI5b(oavR zOfDS+d$LqfZ`F}HM4DPL)eIFJq+vcp8Bu+ipsO`cPf_%q!F|2wkEXllsLc9#PW&qk zhoeLNYRFq*%hR%eztLR9+RX1gxpvR`|LEXaGlMsekF(4f3$zii3J@4&h|6YSbsrv2 zSrCK^j|x(MSKRtmz#nlf1e($NI3?`9L4Tket9F>Zkgt|`=mm!6NfsF+xI!hJ!OtjX zPda+)=o=a~XfkxpZfZXa7v&7iij6JN;o7nWTC6%3dJD!2@*fm9jF|TRSi|4^^w^&+ z#VlB|w)wdpSeiX~)mH0tC5`WM-;+7RnZ($TL^B-QGEEIyIjutIH-r2+l~ct^5`DAl zQ;NtWNWA-Fd0g&@xUBYQ+Z}O|GCrc+fa1coRx!m$yBJGKL8$a6OS2`+Az~CEIlSrZ zF!zbAi6~sueT#+;{%vByd*P(QEj3WtSct z&x5}&|0#n7U}tz>x7-e4>NXtmd}09kGp71PDjt?*#DObsPa(UHK<_qYAlH1cq?d52 zz*Uyq?#9I?>9p?`U|Zx79(($|Jh{f)0pE>KA7@APkvV7l(GXL)AG5 z7Y&fROcf%~&a%Po33bUQnXMTle%K2D)vb>gdccN(SGVFmNS+};L$zRry9je(=~CGr zvAxB_lDmIejX85j-$Y>JIM*MwHR!EgA%xo~`s5yQ#e;%o3$n*ANW2oBYAdba$TTe} ziyy`zt|p$WI`0GWvZ8z(+4>&!@ktyOV8l3as|@=j_6E!KQM&D2r3_~|p8DCW**MNV zQA$U&9pYr~Gav3%MKF&oRxW|u#!Dj*kK2xYC?54xS1i9}CwJ6N*UW(h?{Iwf7{uf- zP2YEB#(QC>w?$>3a?QHzyMNcCM!P|0*K0nv@^DwQNL;r?F^m}sXyweVIce_pMx~9? 
zELDT&M$uvGA|ig-7QLk4G#mbac<0$4V0}fxtBG{}+1UGMx;7aLR(M-tRBwe;-GN)R zh?3~t4$$vf0UP3je8ETBHV5lXJ#iX{Ypm3HWYN)1| z6Az#$0;2O64Ihp(t#W-AlF!uqlrDZIe_IRZsYKJMwReY?66X#F0CUWKhQtl+Ei7O&gJ%GT{?Y^cLrV$O8`-g3Lr%9@BSW!Qq^rYx?+&o=DCAiiZNa5zpEy^D79k{Q|h6^$Q)+L-e z>aFascrK!8?@Br^A0NB_N1gx*=~0`exa$R&1>l`CO~EpdrctO}B)IrSqX+M?O)%O) z3Q9{$Gj(nlZu$aUJB*2wABgRa6BLQDfX5qhUt|PmOcTNdg=^%&6pz`3&)u=r?69l3 zK03r3*Ma<3))#)Z^=sf-!?VV0+2GF(sVFL1fEfdM^puPlTg*owuFM1=TNB0Ho=r&w zn9ig^|1y!&t#UD~&@94h2AAPB1N3|8uIn?Bpbalo4d(Ui415HNLGOhWn-WpU&y3^G za)&E~+1q&1wivS!hp7WzAzi}K(nmMI0g4dB<0SGkZ*+L?hYJnVe5(V3Oz7AIcn_qN zxR{poFsCcgnU`G;)cb*re?G=bxv#%J9lHPB5#AfQ{4=pjYo3yKldAXPD=E zjZ5stR8HHq7K3H7R@SL_x!ah-_>3cF`FEk~)F`@1nc8my@roI&UKo(-)tn{#t{nvC zU4ysY+1>KYlHeoVF8fzM7Ar23WUvTUIvlvJq zh3`i+@C6@BUwfACyNj-F+B)oiRu{DMBz*q6I)JB`l%#F0=2DeikN(!cZ_#%r>*}r? z@_;*+N)M-DVs6ookYdi=-w`+yHOc^Hb1CYZu$(*u{`VPD8@;W((R3y^0(d9%ud4*s z4^&`GfSFCH@6q9O^^{XD7!kx@Fpla3E=WwiKNRbW?(3!8$LtB}aOC!c4Yi*ew=tNG zOU%r;MoZnfabgsWK3vy2uSogFUf(vo`y=&iCLU6{R876LDeMIA;NPy?FD(qX#dB;R zvvlFR$#_QsN|V`tJj8?RoBzj0P-9KG)mKknPTn325XC0dhuFgWN@_4_k9o*~0-FZ4 zc_|4>jSLf7-d+}S{&(f#YW1pk%6q%z9<_?k)79)`DlrGnlI2;ZEA_tPw{|=pc52pG zHLlH`mz?7vJVBrN2bXuQbNY^GaW0?bUK^if9N(1;lsLl6%*f&QV({v2X{^u5je-B&fE%Q;<`C22F<&tWWSm`D*0s0xsj6 zis$#w0k5EReGb99T=6;m@j3Un7MOwPJb*&c++YMNsgGq_P`#DfKdQ`?Ihq|b@LeCq zAv;l^nSTvdW}m)gf4yU0_q_bkgw0d#ma}oUt$r5ULhH(DrHT=@)&iB?sDkXGi2fuZ3(+>zm^-k}8TS$MemQey?yU@Q>u{~^ zVEIXG(+k5Z>=|7WFGGA=G8%oC)6#Y;d6=STkIeX_v&y6!Lr?xtw_U@tdW&443xC zm``;qiHC8sL8~s&RZGr*gmS$97D)0U*#)$b_t|6!FGh{MW8Y2}I8O zuHLkNk0P}Boi;yrPo7GbUh_~Y0E60MfZW_N+~dw=8GGRHfCLW*(p2-_F6<&=JGN8= z9=CgmxsBDP*ubr;06?mhppm#^PyMoi+P4THrB!`pRwQ?rpN!dfeTP>ORu3%7*PdgU z(E6N;-cTeE+OHwY%AzGQ(M6e74eKhW(|{XOj11KJ1^=l2(c1G~9aS*0?wX3pvrccI zwA}v;TE@QNo@(e9f3ARm{KpTSQl4_epx#z((f=@2ab%aZRUt!-DSuc!kiuBTM~#rU zvcfii$k%48(EfHav`zF><3<8K(y*l>*=Fw3saq6070!zC?Q^1ATmamb zSRLqH6JnY)D(;J_W;1s=_u5^Y^q8{;a(Zo6M=}+GFW^~8{~phpn(_J5N?X0m$@-SY 
zZ8nnxs{LZGi94k@@AVq|>n8UlpHFeSM$yXavcpBtjSg{&ze0W%quXBz^#2O|KY91!YfCB8O*>Gar0;Mk=~>{#tbw|retaf7t=>)}nbYECgZr;|})+4?ID zABo6$I0^F_c&)b|mCyNIo0rZ;_?W`wif**8%SBu~eZeUny({zL{#2nTu=_IORcycX zEk38yMR~dcot~O1z6T7*OuiuWfR9vs+$K$9$JSo0;9?-KkE)uK$Q|}!mQ(AlFpof4 z!}yn~OXl2jhm`{nHee?| zXNDcaQ9GEt`j!#A!NzF6;5=amFXH!Ag-P2u7^?Fd4ATlbpKv3ZGv2r|bx;nER&_FI z_i1;e4dyOGM6opwuy?QAIK40!&oksyC0=_Du;sjWzXANw_N1Tw(YzTCsTuj7@#lY6 zz@-F7IlF4=^hO^bNArCoLt*$Re3`w(`!4YYn4@O}P(-in4>fSXMLL@vXb3OSub~So3fisnw&Faw`N{(vw0;}2cvfiSR^NB!3 ze=uvjr?%;wJM0xmA#IfXG6!BKYZdd@U8g;Td^6qB(UyWJ3IuGUsFWD5otJ=gu{<&| z!p|xDsA60Km}8t_mUGH1G)93-h%!E~@9|upiQCE;vq{6?-YneD9sZC)61cWa5-Zo@ z^)#`Lpis9$LT{$_b1HMqE+pgaVOzfIZjFwS5G8h!Xhn^jE%0yY0jD4Fo<^H{>7A8m zq*RZj-p>r0fNR;Z`zch*HTuX^Z9f)2U+I1D2Nyu3m!@eD#Ye&s^|4~ni>k)SVSqJe zfzdd!;EKNGnYC$4{q#~984i+SS(#k3W&!k@jZ0o9l9Szb)+(HZtK_X zZeoU&{De4tYxiG!@~g(zBwlvkeNi_r*1AWrDS@)WKa`q1@t~OhD1}G*Dqw;%UQD#- zc?q`oq~$gU%dXb}MDqI~`QMlO$DcOqf=54shCgf+SSUeG3{u@1e)}UktI5%+0kA(dtN218JS)4`+?Sl{5YyF90{JlJEO@U_58h4`C48BLrt>#>Q$6O57T;U+Ue~54UfQ8ajIh&nJy( z?q>8H{*RWUQaGG{sJ}U)`Jv?>sS)Uqv@sk4ba@d9c*ojQT_clBLs6@K=l)zwe&%fZv8qyACzM;MT-~Lqcsj^03)O6eJuvS zJeV67BlvZK<9%U>!|oItt5)#u$gSr(t0}PuEvmm@hpGQz3p(+ zOuM}~k2}N~Ji0w$ZtU9~^kNEWo@JcuytiK$njVjL2xWA!jx>gjw=1k@X;E#3`j-^* zl+#4$;2ZL9PvJ~H4$#T$Wx}uq$|Pd;7J_SPe$-Xz)xIK2VYb;Rj;W;eFrc zz+xH=I2Qbg8i!R5;ev}zQtIz7Pr?$x=oq1&W+#uYUAs2J6*=$Vu{f23Xo5)FsO;(H z4$lHR>=g|w7V`s*&CU^8IyzM3e9fjRy@d~JXqL|PER7XC*?c-Fl?3$4ypxadOwXUab8ex)Bq5u=HAh$U~`aTjPsI!gamxA zWVD%j81A8SUf$TToLgH}X=g#;3~C*`HN!Yy1cK}HYDDis_KY0T=0S&tH7AbA6)(_H zET?{BP+DtT+5J{W14}5&I(|~*LMhPs{2r?5Px>h!7qb=Q?l4q&rzdpIk$RmJ97ugO zQqsXwW%*JYF@|1d$0K_bpEb5s#XQw^&*ppR(N^^@g7p6m=LV{-9VE;ke@sY7v{l!Oi=rba+Vy&lT9gQ-DIuEn`|^HO5FSQwoZX08K&XbW#wV7$+Xz(DAmg$q8d$ zwmveB-ruU%$0m-xH3RdD@@0wP3?q+kY!iCB^&Mj9SE1|8n)=v!d_DR`6nGE1oui81 zZV+sQR_j59r^cT7M-#JJC&CC=bBhtkC*2q&E{8%ren)G~!~NB~{hfm*$(Vp9fittZ zLHQahWWZBmZR~D#StZbd>RUzFTL6^6L!$>PVKGyJSYsb|nQnS1QA_)?V)V{3wfXYX 
zVXk#DAMJUKgk&{HhJpxb9qy59HLz|2GZ{qb&ofRgh%Y6DFv3@wvvIHOp%9jHSzbVDQcw70? zA?K$^Z|0gq6pOJ5Igqp&N-U@{Qgku9vXP~_Yn(Q`MfG#1zf||$H6`U%>#I<0UeKnx z>O5eKEWE<9O@&*9rn{u$cwDDvs`bYzBw$o(shO$yzy!=;nEc~cEbPY#QMBY!SIf)F z*!qn&>}rNZVAms)ult8+%Kbtr-U=!)Sz{(pURa-^!P|b)&hTPiYx(NtyhKB>_401l z58R=S>LZ=_HA3g%amg5tIPF41b*T5o+4^B?=KzgQzS( z6&THlgKzci*ub{bbYQu_%`(|{)7$3>FgRX~{H85va!TeaHE5wiLX3b_-Kf5FR;IC- zMZCt*GBq&*J-LF^gs`L78G!OQDr?E^q|e#h;g?bQJRe*XZ+f|$mq#ueWGje`YwKfA zS073^L_kJ%2xB`{0Zn=O%gux(F*&aId(T78JZ{YS30j$fXEg z&M1>f8}i&?0D4%}&W8(PKq*{4x<3b;>Hwk=7a(ELR|MbAZ11`YEVrT2Hc7Im_Xu`(N>PW z-t@Uv>wS6b@gd(6Jj2!}#}PdJhrY~L%INJ9w=BXuxI$u{K1OC*z)~r6#eyrPgqa6w zkpq=ZCWu4rXS}Gn+hIySTwY4Hyu=qj;|4p2qBlY>C7#vIFP4$3+dqsu)JB%njMhRI z!p3Xweeq2oYCG00$wl6i%hTwqH*y=(mg7fgLIy-A5M@<8V>zy)X9%y)e;pHp z+u)@kyHX(!0p-ek{S`1Xj+ptdt>R8h8tr3PY+U^W{2B?=_YJ~Fc0^W&qG604x3`A* zN%eJ~Z2;vxs9k_LXL8qr+_uU&-~dPHww1fG2KuFjm&`tVRw6DOIWG4!n>wF#z;^3`6{M?ev%1X_Ct5Uiy%PUMuMj!D%gk4QsTe&p5pm)3eX4iR|K?`OS2>Z zRaPKYkI4-e^re;mfZDFdfk8p2w9HK8cslS6h-d_!+ z`q1mCy+2`^4!O=u?9X(w!*F4_TH~Rn(m^f7oMML8H$sL0@4Xf+3i=(RlJoEn`o3z@ zE_D&u!P)5k;r8s-RrUB&R{@a;E#`6RNn_ntZ*TgS>qUrrVpl;H8X+jPyb7|&m>e`} zBAapY_bwQ!&PE@*_!h5xXy^inl2n}=$G8^mgbRW$-S;r?P^AQ{M3_k@{B>n4IKM89 zdVdR!o_lKW7RQ-vspvjWHK|0R>pB^GKg|ZB@v&SUSYsShc*vk-+dKJ|mclC}Ue7-h z^C)4UKq&834rEU6%l)3r%7jb0V7RL((D{&k|t(jl?)^GJpEqxw}a3dO?BPw zh)&B#MR(p-AdhN!w?;i)@*u6aw*W4EwnAY9j@0OBG^5|m5L$OtDi|eW_8D*eM5t)cY7swz ze?yxPh8$F;=pU&7*kk}|ms(|BszhTVynNArwE!pmnzN&z-@5YNKwi0dw`9bgfiSOX zhnZSmH|eZje$GSkxPbgK$0|+CIB#@ctjMVT1f}2g4+4{;hcY(qDUU}F+rCu0%^BY6 zkL5*cn^tVAU<XT9_%*0u z#tKNyiXhey;FatTIKgW8i(^=THq-}Fs6FchmFYAF;76m|DbhTdZj@UMN0Lpl!(PVp zFCBQdi{t8U6K|+7o-;1PPC28r0pa*fRI1=&G_YHrdk7l^^JNLWQp$6(c^14A)u211FQx4L+9dzzLkdJzxJyk6`_5`8H%rcQ>@hMG%@`LJ%vq$aR@Utmo zfp61kpam#}r`uv?=8pw7#3bpJ*)L~|jLqV_A?#+0cZznqW)m@u`EXz1%w2Jz7bjU( z`aix=zYcW#nnd3N?eikT%>NrB1{N>CtM!@wWQ7j6yy>9qasXc09BhS9+OPVs}h|dYu>QLWkS@EZVu$D^WDl?QDlCdPr2Yy`CZgp&Hw~TIY>; zQr5rSyL#Z9IY8q!p-cLFP&jSpLKE;+v`YRIi|K(bRF@aRV@ff>>bXOX)Q9CnvdG)o 
zo7BoALFw28-p?2P3uc!*kgPEYyDLD(kC&8|I@(deI!n2y80V`&Uac~>S!NlXbsTW> zSsse}f3&>^RFv5gEsTiTAWG9jNdh)G=OBn6shcE7l$=2ck~4^sL6jzECFiK*BpJa* z$w4yM0s=~u{7%meGjs3Uf7bujdTaHJA1;{TJEu<7u3h_7{rs*#T4GRs8f2y9BReF) z0Q$(bpV}n>Or8AEXGaUpIMbl6RT14oP)vUdFUOZ_E8s}Ta<23i{ke5T2o z{S}R1+tX&~9R5NJP84Ox!p(!moqy@it!Hn8bG}#Vu|H`&=K_QwB72QGPECBwhtJi| z1tPL!li2YsI!b2jgn0n7^r_=pJILY12BwAub^a00XHs_0t+!O>IXTuEPm5f9E;A@* z@uUhIjI4%8DCj~0s8qGyiW=eACLXFc+^N;eKPM^~eF z1&q<-1NjBYwnz2g_j`a zM9Qun%%+0_^_<}PN!->kW;*)8D3}jPp@Ebtx82c3A8IwD)nhn4r7$N2;!EBc2aF7z zn2|d&VX4pQJyp{y4aWU$V?$mH2q=Ocshqaoxos>KZOR z&OblJ!EG)zzA8&Sh3!UqcEy>ZiK)W%6jmHHOp*i$Z@z%)B%Kq{7X$;SAEbPIGyQ9w zVpGKXMX?L07^GP$eVoRNup}4m7y6#JGDI?r^>ru~3gPldvkd++20k^uwM9$W@X&^a zx8b2HHq9kvRg;NPnH$PfD~ZjSnNs+g&lGCa@^b^8*->@xuqu7JbsXoq}Sw; z(n`IlVBDT6`<6nzqC&O9t1pjt_R4b;T`fmN;h`n^H7-mr=o*l=%O?VP+6IS$P=VhV zMJ0`cclZDElMsv}+idwh8UeZ#tzFv^^~^Z9!ywBx8U@|vMjz%i2n-Gkcy74+NDprf zr|l%Jb+5Ciztr%yZ=7kV8GUqGu=&t_TA+Kjepf%hC1u~5iz&?SqkEc@qcu_o%=Zc< zY6fH4&m5gjnd?y+NAhzA=T#F}S3m7UGtCV))LU-&yx+75n4D#JmIX;R|XfN^)uaUx&iqbOq;RuE!Eb-sa>o zhHVgL&A*QQ`8M1)#pk|*h{DTLWw`j_J7ut4TqK^#S8?fi`M|TvfOhu%-{0upulA>h zKy?ye7mpT`)R+LI@U(u$N6wUDSeU$!owlPfWK3;2a|Fn(7kyp zh-T9MWO*kw2zv8z?X<||WHm-7!Oz>_Yr|&dedJvJjrqg09cmvs&jE4%?v}d{ufT>9 z_eQ^8n=_38+P-sf`|@ml@3xL_a@JTzr6VBqS`W&!FIz^*^+nzsUC?WE-zeXjs9d6` z`37b+D?=EVPMG|*meZsr-P4NP!2oh_xkgVVwOAD6xZ!qva3_N`=vPZ#fW7)lK6gfj z<(`zveJ3h;c5L?c1w^|LPHOJQmP%u0Q zdNp{|65yCFv^Sl=w@)S0Yh1hpVK5@cdoQ(r`>bx)7*$^e{_)p&fH|Uv?32!AW@Pvq zP==fr+4=ykFO#!n&Q5sHjRs?-lAbDx+0vLhkl5Ojfw0B%*bmzX(qY9)jrK_@YHCu9 zg!ks?pN=KWu%pY^F8nxON;_9eeYZ9>FfbtTX2xMI_pqeKKtw>mMYC~cwm22sskjG* zfj@T}vpuSRs&S+`cfeCFFjg_uP!)Zm4xA>Ug8Dun@EH zrGBUNvLs9Q#eca7oIq2UCtAyRu4#G2{)22;Rrpx)or7+>Sj_iM3B4yAnkkfCGqEG6 zJ0U^!=msIgLoa><@qAA9_)hU)rOC(#>soSe+Qm77ntg?W_ed*AV=+0;iE9V>k`55m zoC#s-UWQ1?WSzqDVot^n>w~rL4Rj2iR^8+mHncDsQn}^I*28w^%HA5)*O_&%?^GzG zE32DkDMlY!BYU+;?*Wwz*>S?v>{IwA8A}Bj!Kjbny)UuX&8%mjd){|Sp07gG9eHW^ z2(VV21a`V(@=spj->>+`hd>5m;40u1lw&D4vXYNA-%-;!brOft`Tlr+Oh_A$=yH}U 
z{+z#iRuOV|RV6b$ZO2^2yuR#M8oA0LlxYmE+fD12jNi5&Kp-iuBN{STdo*%g$B~((A=R^Hbfz42a za3PV;orqzXvSZTk`PQ*`xT7wFm;0aY04(1wn8(g#JdMOljz>+pGV6TpVg0JVo%>>f z{P+%&OWA{aY|=cXoW#Y>ADL6{7d3NISzUxLQM7WCvW{v|R{dy+x@Xj?E~p$*q?4$! zXdqFcvS>QRze0(~xw79}}^;1cV=XmH3ZHrt;>{L+mcVAMB;D9kqLTE8>_P&amYqTPaWXb7U z$l$f*0Da0mCq1qm=@EUd+u`B>6LiPpwDi4LN1GV_$rbzy2mSFcIWho7AZrcgXu%2d z?wFD3p0E#kmH*hPKGx;TVtSHeXjlB6X1F?V*7qRlNF5Rx13J9-x9!L8+l>E^*fL}2 z+#-?LCt$b(q__Frl%aKHK6i3=Ngf$9bgqM`19y4Mqs?!2>%wW}n{pCWm(8N&Fihei zLrM@QRVbb@SIcTc8Jp84SK1EO8s$eTm*-2O%zQhJqL}iHZXU5n^~JecD)#w#dH&{5 zeCx9H3>CI!xei8g7YcX%;?*7Y>tKH7BhX)1YCY;B<#BLysKX#&ml`&O;)axjHnMWs zdq(0QjVj~An5fFfiJNC#DQNYF1=GW?$hjPUUkY$F7KeTzoGy<`@F6h92@XIfrA?Aa^dfMJPz(XMiBHBP54TLE`)iH6?i|g zfqh23_s?`WJD9l)u4!Jm9lfJs_(6Nh19Z3!1681tMhDLBiGF;Bn-8S)k)FqeTgNQK zGUSb=Z+?im8S31au=|%5;DCoU*YW<*h$1sfzS#FWU!z*5TF(P>c<(}-!mVOJ zCx}{#Ig#96VwsFSu|&K(<8H=x+2S?!QI&7A=ooU|QwnSxwA>?--2JgJUAtBO^v2#x zi}%CDPah!FcKxJKhezHZ0t|+j)@jkoLO$y>e5vfA^H~nOcx7{#OEuAXqBJbm<>Yy8 zPWuDFbzeg$Ud)KqsacfdOX#+o;T7cSu;1LAmE^<|DtY{8QX*j~+EvywmZkc&n?HcZ zzrubr^Fd6iakI{b==YWEN4&nZrpNrY%P7*wVm;+k0=X?T=R==MRRuH48rh3_&w14V zLm7QGN>vpzrN+K3k4}EXz%a|e%7ws3jOmL~LZF;iMmMGZ%RE_OD}~M*yi7!oAk-s(+YdO{X&(<;RxsR9jWcr_7}T#Ja{Q&3aRqg+$UJGrBe{B*|ikj`)7TBZ(8i5_Y^0jLgC zv3pYXk3}(|deN~$E`|qsaYKqSn#Lg3=aRNHEWK3n>DG;NIAC^@1bk#Gb}a6dt^-4C z;l8LWfXR|br#H%+g+VIISIi2IHxclWeqYklZvR7bgS z>shN1)wNPrCAL!+kfY?Fr!LR9ub!mU$HBad9Sj&cFG)kcp&g9-3t$}CfyZPb#}8@5 zpZ^OcKD|qXm*KNlTD|d#`J7(PYSUHzgI@#aqvH^ljKU0Au60RF5pTkIREQh5wTk*& zezKtE$#r0A$xM}&9IwKS#|TnQMJSN z*^1@kizEWv04AJUq-@`@yh*y6fRlQ_HWO`OcqondSZOI9AZhSg-JRHM$#C4zj4L91 zrhK}|bxk-vxQR|qc!kdP`dj5&x97-qat{4`lqG|8B#>W)|;LrI>T?jZmM-^LC%Tf^LiC zC4;PiG3dmcGGuNVaFv5uBLagvx|AW+aYN_kie;z}pp^y?@Fw5K7>IU2(V_3n++DVr zp+SkkRQ!$~_u_{1hH27vrq1m_+1Q*d_qL$9QI}XPaeGK>;KigszI}A=oPXMbP-hy9 zEnuuXNaw7hnnR%5<0-aEt3G6_k6qGSHh8MF!k(tH1w-?}P1?b(!|_sm4@FQ3n9e2b zzH0Xb2qcJ}(HGU1>i|Jr#F8j1)1uAM1)VuMI3HDM{B1JE&=;;^OA60zMS)c?t(DCdoYd za1YS$=OvULT9Q1MKN2kk{>yrzZU(x{`K)s~^^VP=2Aui0*UL#JhN`sqxL`{*oL~FO 
zf2}*;%}7h42)Add4Ka*lp{|I^6$+~xNXolWig%Vx$-t;?4`h^xR+r3(zFca!1{PDG?rxkX@h51RsQ)j|^84TOh5#q?Vj)j?0tNoG=*WgwQR6eT z+Vf4H=Iu5YwZsiQ_L~bxFZ>9wy*gkZ9J?qfXuZF^8F!p(F+|=-1v$hc%8+uQaIsa- zxFNN}mH_{*U+PqI)9t)bloHFL=n1h|lQhDQc; zu|Ea3d^v5sbD*HGgEDeig!bUu&No7%Ua|9spyb)wdB}-e1uvpjbYYtredwaHFV`e(-&AhhvxuI7@v{vH6(M1ki?J z1}Kj1>p+|opm=K4bgZ^ER6K{j4#Y44#vTLGZ(pJT5>l5b*c`2u(AzsPiIU5jy>-o= zdoeA*7b;{%_&XT@6f_tExAm_~!B!7g`t0wwYc~8qB`GMmaXJ2s2t{06CegSNHZ2bU-fxN z{P}+3CSd{zQLOeu;>sW;@^wPDE=Ot!0j*v35q)yv9en=MD%Uf4r(y35^9%@|4hy3E zp2HxMK7uta;7e~Dr3i0~3fmBTN^9-`+l2?>SIELWsHAo8NF`*w7F4x8-EqAL+9Rkr zXVP+Yzw}Ch=7)|QJ4O8C? zZC3P5;AjE>cDbk~s*)?{m5A26@J>eLYAdl!G(%9DS?pJeTwir$=KS#)_wWQ+pH0Hb zYO0Egiq~zA&1Ju9{Lj#NaT3T?7vH(o`Z~PW0w4?gN1_^Of5Y4wCgY%`^g6C(?%-5O zB^NN5`Z8hDp}I$>X*=13OKN=G{tjL>M|z8;X*<;_7kRr|WFTS+JBRvW7@g7N3AdRY zw%p85yOVulQZ;B@b^UR-ado5z^^rX$EmIkSzaBm9cpPDYw)yskASkL*k+MbAWh}sR zzAZ3c3z2*}_Tuah$IJ#drhtN=ijDl+&}U0xZdK3WNE2@!?oFOZ^_3_w>IDv?7(I*r zI|*&@9S+7!b{xw-x7@1OpqlS~^=A(4i3r*62@;8eX5_^0j^j#(;f&oo-F+_nc~y`^ zg<-OX?on-2jFyNBWfV~{io{1WU?!-;0ps||!xpmi5eshLk$jPT^7Cu%#%xYRbnw-T zjHGR^-ZI;tcA609p-XS=^S#iTSDlB<@0Q)GM*A+sbFJ!Y%4W?%uh(z9g6`QN-q2!W z&#pYs)j59oboFoOOyGnXV4(jc88H=MGT3#7i`;}DcZ`BAJ`Tql@9(_H6m${<-8NFe zlx1aLyj~K^d=bLa`;nZsGhIYr7G-aAq%x?|OgfW02+nR}2=C_I&=P1pIs&yG5es(l zC^x6RRg(f5vcOiVy{^!80Ca5Cv^~k=ocjST=<~Y%4Bt9^4Ku9(z(LdY9F{<3QY~t% zKBi=Eg3+3+hpOI8Y%k_+u$pw3T)FA{!_;HW@8Hf%PgG?qb5N7~DeyL40N<2Ux2&$N zPVy4wT6E@|-$5Umx^*L`0k@_I@yaLz6Tm>Y`lZOTv_EBSWz`KhxvP0+S|vW|4>rK~ zK{W?KDjukeY^!J^3rdr=r`{jrK5wKy20H!D1DlZIZOb;B$>@u_NK-1Iel&!8rDqiV zNF>xDpQs{Qc9`6aP)xX9awPl?g(1+a-qz$(E0f|;FjULpp@L0)V9a8_n|GAW{Ax~TxhgXnnJmoJY##O|Us_OCz85q`eO851Ux=XJ3_7yT0 z55IZTw}8fiJy@f0ZE{$}k5-Q;O$#NSx7*eW89HzyBqP^5<+j)inh#z&=$G*Y1%9xV zgIKzQOP+-9RF~(e0KTA6io(O9baC8Ym;Y~4pUXqM$T94l7T(<5rj(2RaV}?%x+l!> z8KlN`n*&|W+8xY+uLs>GGB3wj)u;iLH)tsFZn~)hwig85y8djQcIK<`<{&_lPsqY` zK&JY$gdWmj=nII2+d;3?9#3V^;Jt4pos;^SWInODm{nfrcolQPWw;iN+9+cVI#D+S z^@+oK*Tt3(LAvwFNhWOT#?j+Ts^e~hd2zEE9%!Q}oj&v5^659R5|8^c!||xY&DV;% 
z#5{cTd;oPh?nJ|8U&)IGs=IvsQtdcpQ4|0tSA$+rd}EI69HNKJilTx|V84d|-Aj1r z=Iyc4ps-0ZGb3*0_^3M7uH+~2W0~-pcoMXx6Z57CGHu<^P6@Vz%u8ej-F)bevrVF} z0kM7)XL?&6qC`pd)+NWXb~4!xQzp7cFFhHVzq*pVQzItt&oXs4LAl&N5f`bu>~ywh zKbqNE*;><#^nTw}9fT^SWl*_g@`z-=q*Ls$DWN+iNL1;4nX_LNy&coU*&NFv9(YhnhOr~cAuCq!`RkJQv(JyT+H+HB_NLrvh~6EMo4 zj4>VbiUVAEaMgQ%2e~@hvZcW09jUJ0J>h}RkWY3bXP>cneAYUpDFDbq-^Q72CN#u$1n4u=+;7eFnIXoM78x`E63J@?g=9yrWlQ*%OI`zkf+@}@fe?BeDbZz z)R(Hdx?hXfEc?++gIlPwipr_6p)qn*$#?EN?P&7fxot6@ak&b|iq^PWrW#wbG;Ny3 z(e0M4fSS#m49w}>=D~j!x@Ibe_M`}oKZ#wWAh{! z*JDj`)?w|UUK{8%(#&XRPGcYzwJUh_E%DCxwIYu*&a$3@)BzHPubT~HkD+Px390<@enf?0K!vr20p8M@w*+`Rhkz2z4rSvAd&F9szxu+FJD@twFVh+UGn4dj%#N2=?kOJbX)mx+yo5d!&1S}&p!4HofnX+fs59k#}TcHHZ!9R zwdapO*;uN$ApbpOV*+$LG+6lNLYl(6k?6p%&23O;20ZwrMbp*;r18e_+4F z$ijowAIim9P%trC9e-Xo$n(z=?0tJWMAsw82`A ziG|c__5ila8mqA858+<;lbs{%mtN~dm?Vd3ANA>xE$=d#iNtV~E1hS1SsD%z?E->i zAeXbQ(9AF;+nPf+L9WJ=(Zz6Rk|#WPQf+bdS*%-rpu-Vdcs0#J5_bRXw?MtLRvFa8 z+8vSZgF0*J@@hY~*j?n}1?H5w zz=B$33jC~D{=lBMz|KhS0b1?Wnd=mGYu+)>bW%J zuzpF@M|p@0GDV`>>hC82n}9IPEu|^gy(m}g&56qZ{I`FT3kH3@#vcg=U*FskRZVp{ zGe0Kl<1!qV{OFWseU%{^G*6QWYmQ#}LL0K677R7EfA_FZG11Kcl(rTCWilN8njGIF z#>R^bb5hz|W*&E%m!JQVIY?RR*Zgy-22g^1yZY2^tb_7)h{nsY;Nxh<&KJX$P-R`kg{Q6jvrR%f7jXCd_F9ZTq!HC}oBKSYvMJapsQSg(MR zQ57ZGRU1LIsdecAB`yB&2jJ=je{GIi!fBOfptFpA8X0B4t25$JtzQ&;f^L=<`?z9j z?Sq?TX*u9Sh`wN^R#mb?{yk1()yh{xHMX+F?MFNOTwP7qd8tmbX$UUGTgN9^-;wJ3 zaY^ky8L<$iJOoT`m=F|`TJlyeOln-em}s}@<`te(8xG~?IH%^h2bD|ze8DVra+hSlSiaq#`sDJx{VBq z)lHPt;bsQpah4nvOs78N6GJOZxXJ8PrDylt8_|!x zeM#L>tiW6qWMBwv28ME{e)sno7?^bdf46&E2kSX3Z{dh_PS30M*sQ zE}nOfQ;QlzY{+ayGXDm+%nbVT90qpLeCCO31yWC*iMMjpzgr>Bg@5=MS;xX~7bGl& zt*=DDKn!0cF-J$$N@*;MZ2v>It_A6%lM*|Bu^aa@b6B(-Nr_2Z)b(`^@m5JN+k2So z!gfsw?RsvnGmS;l3;JwtcrKD|WqubbJdV4JUd1aEMW}scic~qw7boUiEhj1F=M$tg z+`L)dIJhH{f$-ju4XbY~YqV&Y!`x?HCA(5YIwB9RvgOgGbt+GU-(8_$JE(zHCePAb z3TSBI&N7kMlmLC*Uz~2|(2MfuNP%+UT3$n!h%_KN{lw=OCe>s`%u} z3V!6;C63EjR1_Ep1NVZcX?p4dq0VNBfsXTOtrdxVH_QV-WiHjhk73pAR?y(@GLY9P 
zcjF2Yk0q)2IEeGugL%xejO;ApgUV%d*nH%QAbG?!;wz-ES$L*e;-@@_PXti2S}WVc zvcEKCzRGi{4oor{;fW}2S0?H#xB-SUbn}qYo4B&h zI3&7HxB$arKO-ml9rdj)YVfIUFZHXmM5pWAv&sb%q{hz~1)kd^I5s_Fv@a*&id0d_ zGNh<1^_3=gJ_g+$qT=Z3-^xu{PEvaa-Hz(>y_84mP~Y>#ROOcXmJ$GY3mO0@UjCK@ z13)ngR5eozHsmY+b``VXz-Hs#@+C%tvS!*yW^dvv5{jFkU;*zj9v36M^ui zaw>rHTeWK#y_^;Ku#O}U`Jo|K{5E+=YC2*2Xys?TTYQLG_eWpl>ZJVJ0m(nC4b)Ie zHc;YFSImy91bfS5@-2A!!R45={4%{9RL1_ZhhZ+l&r+PxBb@`^AE)=`1w6^2&W_6_ zzVt38`N$`KHh)MwG?o|Ch7~Carz@a)4aDsj?D~DW39$6(&wGx4JZ^ve%nNeu(2mM4 zz#zE}uABH{zpFr@&246wH8+u1#;9ob3q{C6A;^{I)?OslT1EK0+CJnNFn7l%loI$PJG0WpdWXBL>{(}-1h%0@x;Tulk@W_9O1@2 z^xxf-(qfYm!7TG(1~|HJC(oEI+zAzvbWlODv%H5%F_eDF$tjB*8s_BfF-a9wz_1a| z5h6s1@E!MNHluSC^tqz1^bx zLe{LsmQ)dz^ZWCMk}J?<)0R`Hst81%1;!Aj)rgdG*jpj-MoRkK?-aW?z$Guvu@-Ds zioCa^T#=&>0=R$yfJt`KH4+Z(#rjDXa3R#qMl(X%KP0kTHq)x`2NNBw0(o5 z9nY3Wa4vAAYa^2STT58J%9*7s_mPs6!qQpKzNCR3SE#BCyX26AMkc(3e$AlPf$7gz zWtiqvALukWR41`K562Xe?wiFpwlGY&)Ec^cKKZ5!v3htNm$A-M;Eh?VU@afRAbnf% zrh>bXikZP`NT;KsS+Sl)>@*qA6QZow06NFNveqltp7^*PTp)5peL>xEcYyB}KvT=u z#LLSkI^y?8^FL&V@>o?;*j;m~TfpQ+Y5u}t^NQo9%Qg>W7OIYb;c|NL4l<+=Gddo( z++v|8EhiE3WR>ROh5T<9*xQZZR7iwJ-`&0-OGhNxnI>Oh9kk15%{ONo*>lf(_M1cJ zHhKi{`IZ^U8;dc{rasK4mUQ1UWS*GV+V255Y%k!?qW$B_!B1v998#H4Oe0z&-ELO&B3rfRM#G%`8$z&-q*9-!Fk7*&y zEonzr9WyV5<7ii?L?qNKe{=LvR33e6*G=%+AS0&q9FZLSCtx zSq3L#H=VKVqNlKZyDnX&4lm8l3Y(i{|5%CCO(U|)tG*YX&3@y4&;OCM-^=2G^SRZ| zl09_r2pOcbrS@|$=ZCXkj%!4&54Z7?;4}p1Te5?pg`xfCh=@loYL;v|rB#}Df2CqK z8U6omlVN*tvW3oVam@A6RIsmtUPd|WUS;=-UQ|##cHawXt_myIRU0#b?TfD`uAzeO z>B(|lWF2Q3%v>Vwbo$mkT_y|O(4p;{HQ3Z^4dFqIh{57mhz@8Li~G>MEq6L0z%=q^LA0F4z#q2?fcnQEjR3&w%5>oP#nxg9 zD1T2w{$jJhAvTc(F6jkCFpuxIVEb7x2v&Pm`C{!M55Vq{i)(sa#A`^W5Q zuK_`JTw1=DqSZkmgUms2XQQ7dNw8fM84^=6QIjW{i0`yrA8%2Ij$?Kz(^=rUA<*hc za0J?pc4JTo5Hoa&YOfk2A(CetPj1M&d7-?(?@uY32S7%WBJQuR!ZFj)S$}WT(!h{e zS#mgSyGb2}XLYLIpLIwOVEOSJ)K5@1B7KtTSomLdkc-ZLlit>WC zp8`Emz@x@u^+EqkuKpuv{>PV;#J~)0;l8kq!p5B^|0C}Bb?G=i&W+h`ui)`V((sKiaK|V9!{;E!OWvq)wY=HxCh^&q1#B|vv=U0aNs++5E+LVJ?`-6FE 
zWAS9EyFB3mk6_6_^GZj>kd_}eFyp>*xYIG=Bjb9~hD-Z+97IR7aZPNKJz)<_5=$G7hSi1r-=R{J^a+|t(Zp~X7RqXq7BYhMnjVy z;f1()uifB|qqw{DM64i#URV|Jj`*#VHGcSNZpiDREz*Ja7P`%#%pdfSh=bTa%a&H= ztHZRjKRlE0J7|bkplF$pl_OFirrmwF=2;5hunf8}wuEvucFB324JnNJ%mt&0ZT;K^ z-akJKMwW&9tWp*gq+YXx-D-q*T;>zKImTTG4foCtAOLPo(q#Yw@QD}Lbd}kRY_tD_ zR{vooIDy2(PvJt0pOU&2ajc=C7ys)l;NZzCypAo%XI{_>v)3>)Y#4Y}!BX0U>8!V# z|HYG>m>nVS+*map%d*h5Xn<&r3@wk((Ya0;NpiNk1xBMJ^a|J7_Xl5$*`>!IuE52h zmraXZ%Z`Z~Vd{g-H_BO?`Y2>uzFm$Ha*WHUrds^c2HScxk~(VoUgu^Y+m<;kca1N{ zflBkBGwrTZZJQj=DgpdA|e6zG6WG_SAP#852IMV>xN~s^|w>2xb{1hUvDb-cCxm z-DM{&IuEiiyDpcR&|d9DLxnG~HE(}?DSYqo$t=ftj$!S^=J5k@lK;EYuBh>k)BfbL zlj#5OY5s#BQ|QHAw$qxbs(Q%-98{Ay{OzEV$j`1`uM^}c;c>Fz?#Vhi#8Jv%sNr(9 zX6)O1g6D`)WR;GD_>!{J;arl>9vNg2Qg0B_* zF5@HLC^EegNV!EyN_Sz*Qx?NlMQ&Q7>gf=cnZZD&O(wGvmG;-JFB^W)+AL;xa}$OK z#G5gv=f7;UTw1HY$B_*S^X4$mtUv+=Bwhu0yAQKcC3JsbmVbIjlPLVVD5|B8$**!3 z2vvLKvz7mAg99t+pX`$v6T8J?hJu<6TnaNKSdJx}w+$H<)vPoXM`G_NqIURAc~0*W z7j8ejoBleRcpp?mgrA&HAe;oR`l#QY%FkihY_=07-hK5t(f%wc%r0bSOsC3#x!{Ga z1$9!L&aDfM^x;0iR|~&Pe~^`1Wzc|g-S80zKSUaB~qjt=#fe>6x+U3g4I?4>Px4NQaPMoTYU>J2>&hX?0Fs~8E zIh7eF2w;;@j3$S(y^E#NU~lJj$o*X6wL8YOZQ^+HeEDCrXPag=hWohVu1NUNfCNv9 z5kQd?(XUu2A`5T@v(vb`z#j;ue<@b}6o=asvG0g)rK}+hcDU!ScNjQdiikDixx3Yb;Pf%QcE(=-{?`26{h}N7B`{G zkntIQf#-JUT0OUKspP@xDHq+7s}AwcjQOo|e~sNTruMgV5ixvqVlCptbDN+LIIJ%wu}Qce-2;GpMUhnq?K z*LGik@{))iONuBA_2m`F>6s`Uxgj!?6cS-u1P|o5Jd`pbPpLsJto0fqe3*zt87Z_1 zj9cCUT85%5-J>!+`fMyHu{?!Kf-SE`M9{&ThbmPL#x=gIL-fOYtvC{Fi5{kRHZOWr zlFe>Y$Eis%Gp2HhOuK9O3J@kB!sUP7UnnFyi10D@$pEK${9q{JyxB9_<+_t|z3}B7 z@VX^fK&d_Io<;c=zx$s8Z=e$+h(>Ad=KSN&1VodMql^94ecO;;s0 zS6HWtQVoO$2k%`hhOiR&!)6})E{5>!!j1>Ose^2nJVMs_H6 z-^@$3D{0`7vc+)P#p0ZI+=~Y4y>aN6H^$?#H7U&ELAT3(#*dIPgNc_iv`#AoGOR37IbA{)g8CUj~)|NyEw4$r{GB+;lq_CI>Dt z>cPhJ`I_!PKLz_}vd?Yr3;D|dZ0iH39L*CC_Nx~?#FvK9=hL3k@t8r6U}GD4;HF(r zIXd~%?2h_O8Md-Di(a21eEh=foD|;WewPRxqC3_y z4+IX=T<&#J!8q}b4QCpnT$Y#JM?S(+2MJeF=#vw+od4A?3ukAUWjkHrx!U)zk2g0M zv!yhvEv#BqCOBUVC(rBWfM@raEUqYh&G<(S<|cdtWfYUpi+=u<{?VW%c|{&`DcQlK 
zu+v~sIyW)5WN)xkDPDcb-XnCTpGJmr;0KDC0gt#WGaUyv6v5-|4R%kBFmE?Hhl`IM zAI(M5_Dt4QRNy!{-%aBq1k8d3Rx3Pt{BrHT&G}E^8cfYq#6gWf7tCxpa7CYMCw+qa z?jhK*<4T@`t|1)w4CTZ!X&QFQY<^{1$08RNJ}}QWY>FdSYw;O+NZBV9AoVri+PcYj znc>-oGH5Z_Axfse>e$KZkzT>klRa4y2GXSLJ`!u& zTEo0=k~s@@Z5K`*@B>lharPR*izgd8>8+LMS?QtS&l*G94H5dmryJ}Kb3cd54Gzc6 zTX`DK91Iw|E?71MvMn zpe?9h{zJ&(i-J05U4IT;+yU)A&^U!AHV6WZi3mQ%!&M@N%s3-mW z+u=SusZ5%7v`3d;9^=VNt4LC~b(X;qT+@||c(D~)3=_83R6ZWrVXDapa+`}6-}Js{ z@Xz&`Aeu9av!HcIadRj8jxwuq8IpTVv*?w@;(w8@gI_}JkFe;+HHXC-noEDrs77EJ z)tGJxW^Dd0e77y5Eu1BzZeOSMXtaTsFjWRJ?v&;x18D&ruMdGV{f>zg?BPEOnI$mX zYvXR`6uG;8hHLfdE@x3%8atf{3tqFiGGUJ!aO&<5b|`^sg@Y{Qqy)g4Dj1)PH9F2B z!;)UWHopfqJ1-mQ4JhTS;fWwfhgen-SG7B{PWlJF$E4~&e4Gst=Tr#q^l-v;wST~i zkJ@dq@@8APMNXRe$LxN_o&uZVVw$CqX z?D4)m-iIoOON=cr8n$^Z%={v+w@TO8fi=JU;H>i9ZIba}{=&Gj#L=Gjt_{6}u z3&P&!V4!}%|F`E(UJXF%Z^`EJKtm~+1_2>8DLGlJXvU!xsQMXYdP(pZ6jC1Ej&=b} zDcaTANPat(K(2MadqM%9fkIEymlksZi*&c)@5ZAvA7l#pVpG|f4#uGdu*3bYYS`f7 zk|@s)^tXN?-gCyQ*JZg9&=+o1nWTW&wZ`N|xT(AJVK^cl#8_uTJmmR!V1W^4s{H9w zQf#kp(IiN=D}V2^=7RX3L<*QM z5P)wkz7Lbuz7YtNovZGRr*z2GOEtObNGTvMUbmml{!QfiwCy*~m8p)LGt}xDI*7?6 zU~rgd4LBctRinM{&r+NZB!NbdpBgf~0*#ux8=#?o#yj(#=ujQG6;@opW6a~Bn@+y! zA-l$$vobj6SJondNK_cgM9yWmi6@lS45$CRJx=aPvq~Cr}848L+ye)?IFUA_tmmW4QqPK6hkS;@374>I$bmhmWbX`hIseYlwIQ*B1E!6!H3JUW=F$4{f+B0&5x1FIHxeGg z?0_hjGat#psAzMgr=Xz(m|8&7GszS5s}E?JYJSt%*-=CV13G{({VB$ahgin9oA0EDk5ZpRo5%C_hn3k1h0&^Ut*vt&$a`J67fBtg1U0R_tk>bp6 za%4KxDseYoI$f|iJYm<4WKx`(3yJ^YPeLhzE%BIT~NFGl&v%%Izy01-SI|Cda|q}TC53WIYER? 
z9xKH0=K~wW56HCsp3k~~<+CdANm}qg;n5%GzQ6qZ$OAP)m-EvDV+A%p%a^)FAH>dG zU;2HvC&Y(8n{8yvt?pWp!`ItzKlT7ue2IFVOzOgbb9sfY4DM%h6LtiQs*p(Dr0MY* z!%EbsxA}QbenD-(g`jBYHo*)k2wHyi+5FT`J#KAqFP&Ev58ZH*CqtLcKE}i$1PW~o zMY@ShGi=u$g#l4=NKlo=Gb3S@0JDs^92R>G?(bdx_jli6L5j>{FNN>TnyG|Bs@A5S zj|-8WN+J!SKHk?KhuNWCSwq_iF@7EzymVfObv6M-d+ zZOyr9!IFwn&_hX{7#=*AM0y`X7{-H4J%v!Q+Jm;+5{}u>MQSp#4FdxMi5q?1#nyxB z_>c)0xW9q$=K1{A5sdU00qAbpo%5}}5MCao`8nvRP?(qH2joVZ==9s?k4H5lWdJ<5 zEDv0$BWz_^Om9MoMSy8A{*s8$&uDnKN0AB1%e1{gW{o3itV9S}og*uG&yb8U>pRKH zX|17O;NF&ynnOy)hsHX!;MQs8Pw{B!a+wVIU6h07%kU@N;@)KaSwZh5Zd4heGD7xv z@nr3*3~&0{qqa%;>RLOSH+@&bcg8&YOo*%qM;LI_iLPA0Mt~4CoVn$YK}vx?rLTXL z(<-xqH6m&FX|5>@T31dHa#~J8>dKCBk*gG(ZJ|@7z zX(D$TYaoru$qNE4Ej&s;4OM`~AAt-o(VcA`M<#gM2BkEj#^~z~X>1k2<4=)Z^`@~3 zyUAS=g*mFBGUvM;GGoS-eq!o8n>U+zyzYEB`sl`Vs5?{s4~dgv|P6D%q7J-6&(2g(OR#f=sI`WIfY<3EoZ z7$*5D@{ra;kV+IA9~2@kU~TL*sE^WJzw~#$@Hfy0?l=4f`jlHeiLU|eF_0ewd#%6! z1jW;)4|H1}qvaP80%^jw;_xe6#Z%iFGy1j`r zS~xz-bbB0%)e317UFe?B!Dq0xWpBXB;YS~^v9AULuUQV-nBO1wI6pg80PQ^v0*$)m zZi3YuRl)}Iw_2cbuEWamdgX$Fw{!u`CoS@6Gb#DZ3Z_fuWSum&1}`dxiMQcS+>@E` z&m}Z=%!52k7a!qeRNZ8Ckxt22z0;faon)S~lgkzF|jAwqFHo>8)?9wMGBIUDNI7@~% zKqIZ$nq&}vAb*8LaFZ~r53=N*qhHm>BMH#o{c+QoYvqv~u$-{RW} z^D`R024122g%27GxDhgR<^mwzI$rsi2>Wwbi=?&=f32Ur=B<*p9_EA+%@1=*&$*p) z)hB{Cb}*G!Te)8KSq6Fl%p@P$WX&_;{K|B-gyHMTFu4xvtGw|&_Nc&KtJvOzbdsx; z%6CkH%L)AYLkeVy+PXxIyh$P{hTgV_o8<)eCY1FhdFGQ$Mg&ui*`~ivwPtqZaK?VVhd2Ce|@8ZAz!>YR%`@J!D;cNKPHCBxJZ^9Ix3?Hc?hO{>4dW#G< zISLWpKj$awRt=n-wGzH19t#xxirpQLkg8`(I?#~I^F6a*=p*>BuBK0ZAV_}jT@Fk2 zq~epMYA@Co)LQ8W7Dwxn(}oI$m)(ph`zkug$B8b?FrN;Rn09}J%0PIoC)82Qm3^su zH*duuh`f>|(_2lpoD=uaXnkWrqK~5W$$2DEi?8Hvj^koLY{>yHrSAGa?(^J+)gQH= zYfJDBQ2v(4ic1A9{RV;l2pa$5nHS8Y-wVMZyEg{#PdPcapphRQO}Fm!{jpl?^$`Iu z>aZnXFaAYD1F=ceDI3Ew!Gqh#RW%|S(I1^y;8QfM`%lv}iSmm94f5YnAy*+bknucOJpo#?|jzDB{ zA{BYd6#D*4##Coj0v3h~7BCzivPRVKm=J<1y(NUzhN)nb_<$4kqhZXHt!jmW>!&3b zp42LYL&JRB&60$f;n6ww=c);^X16PQ2EeMt%C#L`neJ6xNnC}?RhQC-E}{wNdy3>VvHWQ|59 
z47A+;wZ@C-p1_(ec3~TwIEXET4ZS$QyyF_>lK!TYO*4I`G*hNkImXQ8pK-G91sRmf+e{axk{N88rPLXhkV5M^=zY!L%0jDQu~_v zEgJz(up*ZSNpJl|$o?*l{15W#Kfa`d0bh9!H`25esHv;vgF}j6JzZ~jnICtis6(aU zsJ##}&bf-klRIq}i3~V{CKTBuQP3)&kp?%Y1saUg%fuv7Kq_BCj|)0>O-3&bJWlXs z!JPDQ6p(Vd;ep)I5%d<&5f~*$AL`3j;3)%oqT(4O&B=>#?f)?L=7CVRZTonMq7)HH z3L$%z$WBNY+4pViAv+=a8nUG9vWBsbecwf6-x)iV7>p&$m>A3N(tSU_@ArM*=egf+ zf0&eFKG$`f=W!nAc^yYV`Zam{I<-dz$IrL8;)M|zd0z~ArEa?IaxI_Mm#TjfOs;KA zx~*Jr`$pe`EZ$Z&O4zjrruMT5*WjJ^K&1!|{<*t!Rrm&f!l&Kp7vX(_lD9{CwWfLx zCi$SepH-{S(i_xO-N-u*PYps;OJB3G)y1!`O3qbMFkJCWulm6XMsbycBn9~2dB%-; zEMnooYm%ize{OZRy*i&U<=0|q|H@-h0iO*F!u0M#&zt97k~rp;H(|HQ{qQgR znKmdU1lO3{=sTA`A+((a8Cb7zk#?WMByho4fM2dM5KAs#MWA6}DbwCY!h<}z(1!WRYIo9gAt zVUA1pCJl08R}`*50HSudQ?i<^{Y{NEp$BJ1Iec}K@zG_I0j+QAuO|sjr)g?j^HGSZ zNBp{jQZ(a8o5n|&RK0#nCQ7}liD_PGVRmV-DQ8ur;Vo-f=k)b=$zXrA8|olX6o3s5 zjxGR4^I`afmTARtw``sNLwx?XZT0u30jlRCNRXK%1u)NWfAA|KIrqu#P5Ph0gNIBE zFCQ;AY&i(M-@~KtbPE1}3I6c1B7~CO+ykk+@M;V0ZwE8?oRIj@AA& z*vEIunS(`>)u`V+kp-j&P|F3X7ojY$#t!Z|X~IJ-Qv6zM0~h-*(cz_NiPK>q%Xt#A zBzOg`Te;E(s?PvW@Mi!~{lf|F!E+avB(03k+j8^tud`2k~O9O+V(-h~|>-#e`h`uJZB& zVNh~j99)IXVRquS&dH<7I&8&2aJXU-HV4+kDN}Rn*~1y#Fbgu#0q?c?a*E-qB_@ZN ztNyTH1e>H?zswVcS}DaEgGhP*X#@(}{X|G*&w#ZjaWp8WuAQ2G=s*;<$&!}G>`Pu? 
z*Y}&$EK+6-21BusVG^9wx%ol0SjcFM>DVkRrjhwyOVsny5)JN(jR8O+bz*THvzi6mb*lOa22p4269Ykxn8*6K> zKi)By0218quq*h31br}#wNUqF+^7JqrFEdfryv=C4>GA^!<|ci^gg^t`BwV-BPZ=^ z!3>fuUGTFkoiV4>)uFD391E+sQ=@~1T%I))&ABR8F<9uEK%^`U_zK&CPhPSa&bEX5 zWZMUuvsjW>@W}GX=cf6*3wwenBgkSzCmblwXp>6$Dib<|kGj#)o2QWe(7$r;^)SwD zV`}GDCi2O;69V(XeyZ$y<_Qkl8b`V-8~lM;*V$SXQo8GNHq+f&wIQwb#Y6CDH9J}d zTr#N-{;-%8m!7`hCml6%cpM6|M05$EM)nFzU#BD|Coj!z9?TNab8REy)Q+U3x@ST1 zcW&K#?gAvc?m^bn`S(uGMONmKK|{>Z0G-_q{vcZz4=e<-fixE^{`bRZvh#}?azv@( z*suM4eFZ=9QeHrSIA(Q^_~LB^>6Wj3n|Qg>||em>76sY|Ekvo!S5g1 z_o;IKZv|9a{yAW!uNNXT@|rvN|T^= zglgF~(h}hRb6ZncQE_ve5Z~Q-FvB2~`=;6H71%x#wR~|^4OrR5sd^Wb--JcE!R8$^ zRbD+t;}c5ypM+@|pG%_Dj!uKUj7C*s>Z+t4qb!9`t+8^c+u+-NjTsWi&CP{2#r)25 zjY21zN6JV|;aUGlvj)3_S~8*h-KhMz#T24K}9WK!?{rKnm$ zA8arueKrfaA__+o*)K*X&UfL|y!+xkqIYis{Lc#+O{s2OHlT%;G4rbWl&J3S{1NFs zU#CgPQ*^xvf}b$wKEga`foC9G_PihMJ$}>GIhFieyB7nm7If)eaWv+x&jmg=`lq;}DPTa!U$`rNCvR}?oO{|+%_5Id|n+-Z!6t$Y9`6e}n5Z-MH_ z1B3v5=xGv|i~o}v|Kmqv3ea4t4Hs~K8+TdN;jQhhglU042(AVoI1?gsph6V=5SU&* zisNy6NdHr7_JCqj93kXqF;JhOKv=K(_@#y{vmN&jS0JV^02D_zO9vGoK}BOu<96gX zOY-o6wE!)?(WC(aUdIPLPeDAuzvd)RmcD!e;Nk` zOVB0RanFa?XZfV(e~AE3j4P6k%4zjuHDFtwI|lo$0&DvnFU{tsY5wAohtclsP10JsoD>eDzVxQ5~$Qr!_?;B%yYY#xUy*LnMHuDdjE(FpEV@jDra2YR<@n2NZmXV zB?pBmLwzv{T(^$`%2&h0q%aq;r-VYeUG3+8Q~v5}6Jq0q5)Y!~8KID+{{<}I03rbg zAU&3}^yWSiNWs|hQP+}6Dbe6ZJkNG|on|gFw0EB&xUMMC^5fRwb3!4&(JVDo|1D@K zi~azO8_fXnn%?s}!@j)dL5hztt0l_ibfG8HpAD?DEm3OyA!7PNIJ_c{@tRHjwwmlz zc1~(l-dYBc^H++ly{;by-(iIEF~79Qy!*dEUp+`z?`_I@_B(=&^gBYNq(0x~3v&G? 
z=D|WUp_R31RzvjHotPoEuNBl;@pds#)$VnUcDnBAs}9!hMi0jstD<1tTKS7L3&N&9 zGs!PkE{(q}g@0p~Q2B{cU|Cu1snvy9&WIb0l!$}U0F(UrPy2FT3dnbGjSpY>-3{A1_6;Z}cGu*7 zl1pvC;?h?I2V;TP)WcxWQ9fB9^0W<-<59o>HI-Q@eekmf=|>Ifgie=W<9bKxS4gG2 z#ew|zY_VDWEH~o8VSY_jY%va(x~GmS0NPSoQr+Cb@kQ$L6R6AGiL?RAu2Xl#6|nuYgeCvid$$BK4j0P>+f8xvS1@{Im8?2KWK_mN-STuuX6U@#dot z017Sy;9V@Db(AFj-`DnO=HHv5%lI79aVSSVSpYSj(H&!mR-EU7QSgV15R>0E;J7G1 znea--UZf8`xV@kKIu`)wD)6T+uEpnIri1%9i}Xi%gGVqaXl3ibjJ3KiJz+9XW}W{S z79{&hSVtk^N$mB;ftX_VlbB*`qgel#ds<|9J(J05npnRNGd*L>uXNp!KVraMR9;|Y zy)SuRbX%0ZGqLe$*zYsglHek+fd6gH=u=fO_;v3A!ax?MPlR11*F)%O2FR5)s0GA^%xnu}cEz zX1@JDJHc!(N}u3nWg(S0<=&5vk9oM#Sk~w~Bt8Yhk6oJcb1tuaEM#n!`Uu=fXb6*- z^5aDxGg-q8Naz>qSEPy-%_zON<{^XyF$Z?FW_M<$ubO>w8+}#O$)YgACZxMY6l#@Wt+n~sJsRj5Y)Xao zHCD5G`?iw);5_HjEbqVh79fY9k1Q7;mns>JG~xU)p5&LW2y}jc6{nK{pFCBFU6<1h z4kB$5pxgEJdG<|t*$ma&bis8(gn_92O75Y(1?J%2ZLb+L2hib;l z%K8PTR6l-3&V_Cvzn0~xz6IMCgXTdzl<(ZTlKh|Xrf=gu;%oH=zdHhD|1}4dW7vHL(hzW6($j=3BL&YUG|*UKIRn#}-4S zJ-+85RZC(K)K)<%XtsA-BnvA4p^|i3c#%Qmo!eVlwTcX_AFWhhn{sAYdH@%w#uDbY zqwmbCyR}sC&cYgPZeSty_4|^CrX<_8Lt1_Q28P#8;+sypdu+F)&Y z#?Qx^IYnuNiD?%+43~pGA2vB^^*%3f_ER)9c`K+Rikh~v#>GEibNeomjyGnh1#Wu1rXqVt4M zT$_29fx4}Cb5z_q}h@1kbM`hrEW@{8jPKC zCFFFRS%}=2QKY>1mkLpqmLc;+_lmI3dLY6-7FZO#OIFU$U+lct%TeY^z9+2Fw_>zg zbFg?XtYwn@3)F{)E1hm$-6LI1Ds%KjEUDmGnhn&RX*;YraXo?>y=0N{?d7aeU@LE@ z*kK!EqlLc1K(HiJJ7+7cYp0mZEdw4?^Kyjv?;8yZug}gayTy>I8lE>$$?>~Eifv>7 z{5?FsA^C3V*jCL}n z-#n$dvj$QX3j7xz1dFPVAt5nlktYoFIYs6VgMSCn5sGK&lbdZklj@G<&zK(+@(WSC z$Nm}U26w4J+^vvDs)HT^JtY;-5L1bS^sR;?`5?cm>%$Y2Zf%?w+&duH1lki1dN5oi zG9XIr`u+mQcgYfZ#JmSXMb|;dkH4;f(0R?3g`;n#=}Oks(HY*-9CtJrl#XYVB|J16 zJUpxU4w{$BZZxRSSFvmvQ5c2BM6@4TYN`QC7$j_oM-fRfy4`+lkm4eA>p zIYK!QYkWZd^d7ieZ9SvkGg7mcIluk@{6`s^f`d)Dh1)KJ%7%s$S=3-nC2vmEtzG_n z8GAriBj2~SpBOXtbKcr$IB!^GQ!o4dx}zp&gDC zH@~KGUzRnR+S_iOUtIgvo#8^mCiA1;v)|c?*Hh#(mcTqPIbFl~To;%J_Fk&QfBW*k zuk8O^ca=o2D;he7eTTuCU;o5{(6Y7*F7x)k8F=Sue7{Xn*;fVNQxO z{-W|jcnR<{27xrg}Gf?;u%4F?uHwcF6V_g@{emy`(2sQ09dHHgM2r!n9TvOc|H@6m=8#UYM 
z$&yZW_YPY?xsrV@4OtW(BoF*9v7!YuzAH;mIeB38KUufLV^RT2OrX=V=}XsmSP(6R+;Z1t_aEsBvy zI9+AHc_4yo0(BCL>3q9bHem2^{ML#x0l%(7$atk9Z{HK%(}?PCdEv=5c%OKUL<@NC z@Zd7*jA~4iz5SQjP1!5tQI&~6r|E}HA6!dh-uI!G9#&NT%u=NkRj^N|dT`I$)3rsL zxRLW&vk5uk_xb9Ql2fhvd0)`@->Beon3;M$ix#qc4iW0zFYIv%fF?CyXwAl5!E}XO z(S)a?3R?dYDhuXu3Kal`R?1e@da$3YV{;U&4$la8Hh_q0r`S^8UK6mwist7B>WrOO zpIE{qSF=b%8A1;ir&Msmp2l@Z3_B;mV&5nB9t;AJNj;OdSufgVN!@<9T+*;D;syVOjsyy%GvFI1GCX7a> zdNC)jDf`*D13%~dI9{S5H_gW{uxk`jHBqH;o^aA1G6|O2R&bJrPs8@*_>e$Pd5Zbw z2w1!Q&8Kae#wXt<(4*bbQ3vn~8tz{k53JyH1_z0T$wwup@PnR7^|yO&iBlBEeOGKh zDJnkdPNdE}Y!a!TJoI!~_(5m-(hyGU_Dk{ zcaY511@65HRO@5z_tJ4{gr~ z2-g+kGJ4*U;$!MAM?6x|_{s(>4htvIM)wk;w{dyZ@;K_Z&BOR3ChnNYuS?{!($zlX(>HA3L+Cvpp&A^RDcpm_IdqqfQSBA5gx}v-|4_B(bLIt$ zn_R|qI(bKXHv8sSalpo}fP+vY3Jf08f`jgs0`D%=I61QtWBiR+EQBn0>yQ^=01P4> z_HC1FfN_IOBo9~3zE?c*XtZ!?#FqyjNT2FJ4xRe!(Rwz1Jh!Hp@CY?1eg_0;mpTXp zdKTpCF-ON_uDJ_eZ4f=`kI`=k(SPrP3E*|DRV6& z1w;my{5N^7;^iZ+1|@rlT*)}m>DFx7VEZ9o04)UXq8f;#>Hyuw0(NVKhCj`gE^CCUL)6$;Rh1>4W>dxhUfgV2w>8*9voi z`6wtOT1C1k6~z6tzi>B=xZ1+T(ZQr#{j~y!Cc5yf$;S7Od)&xdV%1nNi&9}Cdc?p^ zn>f&TfNG3F%ay1>vBO6NaspQdf&FaCNAKXdr2ycdFo4n7!n(4pzNd0v49)Xv?EBF(Bm1ve0DFJy=j`hgnxbf1$HZx-)$z^IBgmuu z6}>y9E-jjPZUtT=LNf>A?D!~ygIO0^R-?(@Hzk|A+v9;MpKaQPgu*6$@7_-9s@on5 z^R~+f40+|L)G8ENuG#|vq^fr$s!fBCj{6L()yi4qH2r@%*Wkdvkn_)62>zB_!_X#R zn&R{;&u~(5Rhxn zn?6WEYz<1@qKTWBMANWOw6=8Ol>~_3S2zZ_h&z5oAI*D#1URtQbXJa8EEv~b!|{QJ z3qR{}DXNPXp>abS)0lA)r?{`QHCwA}XJ_}eMvThr{yiC0vK3cPU|>9Nh97nWCcV(Uh9~crZcm~y ze(MuutMzJZM65GxR;TsIBaf5!3ay8f0LjR!^)lvUcx-BjoXjazkKu-}ThsUO2#Fs! 
ze%p0t23C+z1E&%(&&P6OhTqul?mLy{Oj6fUa1=u2xoS(Ydd<>~mBq^@HynsI2CP)L z&?EP}U&GB&GbQHYlm2=zx5Go;S{~NTRh^v@SrzVkRGfwxW*^{{8wW=;f#JyluN!^7 z(#@&_eoPoP9>H;l8RfRE3!Oc!)|P;eiKDWoPXYFN>=1*R~-etNsI_ykpr^qbmF9~8%J zZ0qUi9g!%U0h>)rtvlMR@pFqm?^{9eUGRu^|pf4)~V?`rT5TtfQU|ePb0MJsvVk>rnU~?sf6Cq zoVHDx_kFkK^K7B6oGQ4p$+j0)1;0iQEv`PDeI4>Sqa4^Me(lY1wm5a&RPyJUX66Eq zE0C_Plgg~9>4*o8!AggKCv=Obn6}B&5V5P0mj%tONV3JB<}Ee&T7uxR3&?%$2njFw z24z>7NGs~S4F7adSm6xmsn;Q}t2?0mL~z2uosB0>-%|D4Fk$)5+N9{-)6%{p7uN?t`x^d)TgI^rpdWWYstVA6j2 zIyklZnW~yYVHoo{-yn}hAwjRQLUxC7iej71Lb{}?yM(seEt=m;ldn`=IiNOD#`F6c zf8lXJ^UH6J2)1?19rM1!yx^HDBgdB7q?L=|j)!}$RF-OzH7|ZuEq;T>U$7AVfch45 zc!5RFDE`|5X<%SSi?YncZ{A^*Wrpo}j1&7IC$n8Dh^3+M&@bI|;G8~Hz8L5!_Ri|x z!l{4ZtbOLEDy0$vp4XMZ!cnC&F%0B_H@7SzQF(T?Z=JSYopmj{VL5oR^KRxPMWo z@mew`!*BPAPH|ApY#hrrf& z>qO%>I(piyhi0aXdd!KHWOc0svaH9#)kx?>;cH1!8g7I)hlCbQ7q)Z~T~ioj`XSs? ztyjR-!J+jt3;wESNR3;y-lin*yqP%whS*-?XxCZGbvpG?_Z#@(co-4=yQ|n}p-hu& zAfwWK(3l{f{h<X*U`U(68S!VN@o~P zN})K>Ek}7=c_JXt=u6d?l-TteEN|N!A?bKZO13$}Kg48iI}nj`H=j{Dh~Z->xBI7^ zv;dm8t-xwz=H!uuJZkRP1>*}`IdKveceWMv(3YCXc{xW~y!{vNbr$G*rN$tB@pYe+S=;U{aF)QWqN=^o_1)+B4eo z;Jb~Bj#2YvS(Jqw`h(yBvntLiGnT|*@0u-CH8-Cu6rzhN62T}&hU@On9>&fo3@<`d zDExuVaAhw|RT@`7e0=;^Vq#*eZdsA#EBKh1(#Ge(aJE+o98TMm*gI8C^q z?*545E(iMR*0S(xElpBx^ew{Fnap&v;X)rCVY(X>zjzZ_Z>ylGOZVQ+QM@3y0$-|b zS*6BD7Z&Si=?>;7FwIQHFUU^&vR@&+=biU3f9lOr4M`Rz1nrbQAmCLw%ctd7yelcE zQW}obnI#*)%PtU0@85=Z?bnr^Yu>+Y@BLF!WCuyHswHwk-#-)Tz@XP(3*O~Wf?5kt ztAbg-@-x+a?T(q|ZJct9TFWdil&rrYg4DFnwImI@xh!HwqBk>Hjr5%s1Oew+1&Ye7e^Gy{3ltk-d5g{Q8tvqIXeRn+1|}O8uyJ{PoZ$Oy)zZmm z4Jw-rMdm-N8P2m}SQDkdN_#it`xOHfFq$B5)*mlp*B4j@5N(_q5G!vQt_aNo3*Zda zk+pE}cGu1c8gqmSM}Q~+VO>_%OK)X(ztp#mIZ7-lHq)_-I53N>7Y@(LyJ2%5kxUYh zbyg$i#tbvKW7+*${Mj@CUIc5I@2_>&n-S6j7Z&U*tOULxHD^jLP8h0|PD>nmKFo-_ z2HhMqy_VTCAO0pe*pS_laZIS);6oXYLx~@oaa_;cX*PT%JC#x&F80fC>bvPrhYKMp zjx#~FVwYY??{Csnn!!4uKtgQ0vBW25QRKa(3M$jWhT$!`@3LKEP^(P)3moc0UtUeq zcJ)tkgG|KSOIKG{vG+@iEDM-j(e1XCl@$aKV*D$jUj23V``mrbyVC!$2n8a$*Ff0a 
zC*qcn6x(Xcy7s%}>^1AM{w_G2x50`YnHpe7Vbs#^BzP3c!+l|H?zq%s)8js3$LCB{t(Qc1t4lwN<*3{jt5Y;FxFb45Ex3-N ziBmJWEj2e%F(@mmy8qpGQs}*JY)h<#C4KPk$8d}?);WLTl_hA-I@CP2!Wu%Mr-RwD z(WkX`#eaa00;neAjk4o*u>gI;pf@&A07Yw0y_4QJUw1mJIyZe<4veRQY6(lcN*D4Y>JeOkkHGLj&vZQZWnJqq3<+- zV=NyN#DoZ?_;4UUxTr#!O;9aji(e)lt4JIznO?>xO=1ejDNN^iIj<#D#8Ng%__OCF zHy7^JX@kgQ>~%UOisObr*Mao3)9Fs5C*Q_W1>*O632?lvRRnvxC$PES`!?wwKpr{y zE|>1@(YjL!&C>7T8MiNIJ1@X47HZ741Q{|>Y^tHE8*eIPmRWw|G%6^klhYD@|CK2l zy^cir8V1=Wcdl;vZ1199P1^JLJD@gAcQ%QQZrv(g)u+0(F;=_iPByWB6o1!40&o({ zFYy2gk;(8>CA#k);Zx*!=^AEy2dV3ynf{pKI!Yrm|5HNTE)%eL-y*F|!`~>mEw#}X z%?k7Op$g%0OfJg*qF=OK`U>7C^<>QnyE*Hg2ZwXoP&wGxDBP3q6?N9_%Bo^LdRQ%4 zWVN~2dR83Eth`|BVgRZgF}DpJov&|D_s`HunsvBk9F>{xlkhA?Zrnxix)aoXuoU6sfuc%U}> zFH3kCAc6%fk@u!tN^&M?SZn*WzB#V5t;Z8&BS@=PD&DP}Fo$vzs*$`j(+1-aS-HmPu;^(gg}deW!HBY;=rJi2at_D?X1 z7eHx-Yzt#q@cLebFGc?%Fs@Bd^PQ_`@2f`2Uof@eTEu2>!o0KX+Wro_af&q!4M+C? zg4H10QPDr2Dr%~)KP#<_nAijRt=rmn9ENKwS~D2U zNT?sW>C)Piv2S{(AGZyndSq>tqj=7k!?S7|eve)T>Cx(%=y63636Tj>gkp0Bv%VXT z(z{1Zfw6^Y>R-&XNrz>FS1bHR=sqmq!$diFFI}d8#4KXe!|+q`#kAt%Rr0q=yG`I_{h#rae=&@!6qMffSU^vOT)?venac{rhG3&pS&2og|8|v-tT~km{@OuP1mUv_fK-Y8C z{oSk@b!S>p*m$B{yhN<)U9-V=SuBUdXitC{<`GGZVaKbyB#FIg@Lx*`oJ(^LVaF|5 zy}_FYlBC67osJXLdNEnmR&aJd2@Ta{2z{qrIx4Li>|49UMsBN=pIe+4cTI*bUc_I4W*TZb2nbFZdJKjSQ}o+ZI}@)k1&G)@ z;-r|eSt;4QShXUWcDni1^%MV@1vc?8(^$j-d32ZWgMaF z#MK<#|CWPjOLx}7URL`<6}#j*^_)uy&6I;WdFs@wcfnSuPOA zYwyY`2YN24yu5K9$5~YlNpk)FA*q91>IjfIYUNqc2+=J93=kSGtFN!;X-K|vC!qWq z<3zkK|NEt`O^w#08j!eo5OWK9UoUYM330%VKkz&(@I2W3nURiS+HAeowR4~p4?p{$ z1_dZ9O4ALP+MIL$EUU4>DyNQsM%)CQiIdtP@0G{Q@oMe~YT}IVmoj41#9>zFN9+$z z?3q{%V1+#E^4r525tH^mU4}lY%7qnh5KF)UNc2*mSlP@aj|Q-r|?J zA>v~{y|CchwX@W9mY1(cia&YRY|C0GG~;T=FRmi0E0Vtyz#c(?#34>0GZIu3$^WiK z-H|!gUyrQ2%7>DXvB%>q32>TVcW$I%PZ?lwT(_h|GXp1a)yU|K16*d@5dkm{o<-J^ zLypyGx`seCLf*3nu2;-kgLih(iAF*P9DtPVsXl2){V3YS1#qd`2L}g=OlD$xk&g%8 zbbap#eXPlANW9)&t!up7(O< zcVb-W2R=hLQ!_}!E+o*%7V#SG(Q%^U<+*Fpv3D=nxDOF53s>q5_VHgx(De|{pKjqI 
zK-Uap?66RCxepcN9QR$Xb88i5PMNjtIcKCUJ|(zbkSmb?ts6%^epiTx{JgPs4s#q)4RLQj{B>so5FH`aoLLWow_AHG&7*3krr%|3WXF15YeZb zSRZ@CF&V znut%&X07XD0r=~ra@i!r`b8i;x2qOI_26!|hxSe@Lp$uG!t^89wlS+-iD^Scr5I-| z$jN)sABlZ4Op+R`7L9n{PG}a#ibQXt)Vw2u2bi~Ibgqngk83sOZ$vNMxF=N_Ljj_A^4m8aGEl4K|fmI90y&x%OkM|GV3_9)y=U^ew zLdO!k4r=hFI$eZ~x2J8tYVyXPtuwGW31?eEzN)?g<{#GBd>}ImZa+HToe0doRww)9 zc?r;f4AcG+mTRBAZo@OIuHm7{|vkY7nN+-Kb?+Dl_OEnUTJt z;?kcfYxZM$8M+q5@>OXGA_Z}tW~HxeEkLKU2pFB@@vIEgxaFlf_b%MEucI#*6qvxa zV^7#&aS}q|jhWTrGKA$5HlQ2dGE2307DDq%Dc1)?fltuA+hW>~lQhyFL^UWxQU+jM zOONi&Rxg6AV+gnvFW;3eC+{=Q=2M|br=lve>+^axHa6`C2X17YwJp8A68Wae`uLI6_WS!<#dGTCMk0IF6px^TP!@0ku0OXywBZHmjY>{9LG-J5MJmmT8IuZ=DDOXM z)I%o{*X0&$gR86AIdQbqT~d94cL2KTr!u?dp>@NrMoO`Z)2q-1e&V#@`avjnRq5db zr5~6n$tuM0AZ$~8kJglr4{BR|0Of3X_2_m;XYQJKF&w@*zp!v>N&gI9C|!D>XM@_7 zz8d%qEH>k~cNcU?ShRJ3YOh1}me15@JCj&F_(2e04>B{`?VM;zk!VbgDN4?$WxG`Y zau^pQFP>BF`@(%X?K@`#zd2{T+BT(>XwO62$ir-waHycD&vU5@0WuVCep_q{o4;sU zC~h?&!LKLS%dx^g$)?u0&69qN!@ZYef>kXDsH(%| zAT2og6s9tG#`E&A@RgC%#5t$B%NKBP!pSC?1F@>DfnoNo>kQp1>oL348gZ{n<8(#@ z$Rqg*tRaD3hFELCj5=@#-!J9M0v7F+2|^aEiW@=dnLjb#KZVf?0j~gkR_HO}`5U~3 zIP`{%v8ohJY@s|BT5ou{25E5U>FYO6g5;zR=*Lwv!)-uwd42~p*Ox*cKz)Vrtju^z znDujk#*~+4S4#B(P)m|P|Llnsh&Gwu9?_Gix4I87KMTxT`LsT6HxzUtEhjJlWR)V4 za4_5*GjURx5woKA{96ZOZp;dSK{9-_Gcz-u>*v|o(!yeSe|dS?VTKyQ>c>Ld;Xa+N zIO}8OBkof*RJO29WoDnKwj$kMYX!$@)f>TG4eH5leXZhs>#>q*YrhCgs1~NSsn&cU z1%9MbqFKhp*jMn3bv1s+57ouR#lQRd`s9;#Ir;>*1rVZK;ScoRxPP#I?4#T9D0CYq z!2pWUn+|yEN0Rd>mqa5og4bOTla~J!>(rztS zjP8~TUV@IPWMqDbdn;xNo`gVnY9`fM3oK< zq`X>oq8yGL0Jeg#a&7P+IYGfJfH-!#B_sbddvP*GjZGK(SpCvCXk3*+OZ{Mrl9mM- ze^Gn96jHA;T%22a*oBtJ)p|M2`n7Nwh@_u3vxuZvwcgk@%brsGlbT@CuA17~v)}AS zhrNCYb&RNAm+3oQ)a_0oHYc$s8to7~MSXpJzXvb_hbXKr>$1yv>-3IbNgq>ut;Mr<*mI|}=*571jtpO+rf;Sq8aw{`e!I*{C8X@0VV9eSn9THjkTMv$(pYzS8u3Zv?{MJW}`G zbi5)H3a*c5>)bVa08rW9L$GHSl;TeQ{p``7W?9?#dB5dTw$9XB(sBK%Y|ABe7BGbK ziRWc2si`8Qo>|wFrt8c)@{o(Z^CKAOF;D6PY+rLD&x+R%!S`7#!l^F=`giX1DbfjD zq3>L;ZNb2Y7QOcdkPusY27WO3cp%TZ&Bt)DxgVLkbKVVm3K)Y0PlqE!QVOu47Jt9Q 
z1!9Up>E!J9!9i%uuTk(z6FzKChNt+YwRBX^Xa>!Zw<8Yv+hYj{31G0L{wd)OpbRSY z=_hh{mpKthW}sB0eW6Xvo-gbUM;>KCFIx=7ve|<_U1SK>`KY#F2=M?MF&fMy+`{qG zcWQ$PM<`hsjroz%Vj!KV=SH*~s~;0_$551TnLyli9yyBu0R0O9LnZFc-0h<)kIJrH z^u?Wo>`X&F0$u~KR8J}Q#j~6ztqkf?x!IqRw6k8@eI>6f31(~p zlPO)m3w@(JgE9XQ397zCFdf7}^knywQve;IuP)&Zjak3gy&7jDIOg`fnUVAFkPila zrZv&!A46l5HhWzfoiYR4pxWG}LKX~y+GI~>24a_kUcygn!e6f)c@bRvJpVLPta}`a zTm>W|`%h`;x{>wT@8frQ4)tG_uI9sp4H|tl=V)3k1(4RT~h9`suufYT1r3{^kpLTZLDvsZ$e%IvP z)1Z}^kM1id6>#ycza#8Xm?LGxb~)ji>E_BrgVxlPTjiJZ<%y$)o2E+*pEbt&E>krI z+N>GJD-0Jj|HK=5@wNlfULRDltd*rL!O`ohZA(b08(SIY6D+p>!#K7tzpE@I9X zAt;E`CPt_q^+TVkv#(w-Bgl&2Wdw3k)_-Jq0kijA{c|+vIGB^Ny-}1_5A+;!yjl#1 z5lR{VMaJ99In&Jl@X#U+!9$zsN#+Kv`DpflzdQ9Ew^SfLDug>$eM!p0daVvgE{dMT zpg03VvD-jxb=HjY9FusAqAgC_@d15HA$6=99TY(cNQtJomKY_e>wgSYQ5j%I7`nBntojdch;%0`gktcth$~?17iYL$?-ju$8(E2- zA2cb8WRSfLK$!SY>mqn|{YB3{Mco{18)hJ-%NM}Y05?Q0;$OulTzBIChk_WG9T1U@ z-@VDvA$$#RGgZiAvCowcuK(RZrMn5PTwFvX4NdXY)YOavaFy*5D~w8Y85DK;(P;#R z?efhX9FTevE(-@ORyfd{ya_TC%QSXG@O=h|rV_1IPd08}#DB=_xKrA4lF75XJKwER zQ!#CSuhIK!<;thW?afH-(Z=Zk*6B;0^X<*v+@<+yIyR@?!^I+wf;uwI`)^<+JFn7P|4(F1Y1RguFa_J|M?-2j=SV$ z{cGSZju2D~1fauv#uzNfzSvJyc~fR2z~eF?69d$G6@p2X`!@G#1mAPzgF#y0V9eFK zU`qj*pNz+d;|Ao>EBa1kmQI;6=-}KBaMR(^{iwiQTE6=w+CNlrROwU8=#-EC`hgI@|wLUGa2sHPkY( z*|P6x>V0k=w6J&;gCWk^sw$0$B|$y}4IDfp!?@{Ya$&+^FePUe7pGFqVtM_`J zD|Ad`o&dcz{$B~xdhwZ=Fg2)TrgQBB{dj*vcAb}o88&StBt}L06p7yst<&EMFsEYVg-o0;%|<-1ZUj=JBAw{JR|9+=7j(Bl-dC7iT2{L z245O0esSfzvH}f}8UiE|xmDfV>;ohH@tHuR-eM?SEWB*SHsf*8pD#)B4alj0rUqkT zv>(Y^7E}DtdsqL$ageN-+?n~3)ir1S*7N}rS1gy@3O{6oKQMSr`%#!h@O0f4w&*fO z?LtX=$pLjCKDOeT@(EjY-j^M{foqcjPs{pPRgy~alj`{zZ|7SQG_YNi2+cG;qDeha z{hVd|&`MhaSPA>j-EC=B?lR87h*RJbC|vUQ`JK=Z^~)vl!vdjGmjIt+^N#Lga`67a z8x#3gzWKNL>A!dG{t72aULXPAH;eT6#Iw?!F6MQ590F1~%xdg!kb4SXQADqCPF(c$ zj$fBtqPs5t{qOJsA|CnR+S3Hbm(wm7YMa2dWSX{O-V98*J4+sB>fp*blWJ3lX9irS z**#qe_Nd9X9ul~Fxl*cq&VqX9i>1QMjSgN5Ula<>)XvRgvYl{rA$G)E-4fDvR3NAu zy)vn5Bm8ZmkXr{?GVj15`E_5$`IqbUpVzC?IP;T$)v{34(LOO>jlb7n?ohjH>szc# 
zB&}TVJ|awPrhsrg3YjKe(&&01dW834zvMjBV_iB81h*A=0B6 zd9=9gR@<27cC?na2xbDjOs?am?1gE+RhB_qK<>1cc7FLcC8Ubi}0TXOCD zJxIbb8E66o2+L=$*n8q1%v^Wd^Q}MSjNLMZq9i>(A%myepf-jS?CaIqKdH=ZN=yZGM(iO~sfjU_S4Go)21zY zjgkJ|B}>Dt5_GTE{@t>^lpEjBG%U87Y#bPN!R^6AZ24lv;V8;@UiK?4j>wc!!{$Y) zB{u41E2?>crNdWA&S5HxoHjahp)=Z_08ESFfgGxz+3~XI-=2oQ5F9rYM1m#E%bNd> zvFiY*x_$owu<p~Y_Mh9|ubjNQT;9)ZU^D9j%G z*EV817Rczh1OC4BpRp;>Ha(_HkO()}x$3sRb`!wEmTCT&q7qoDW_)jU&egNY;3e-6QBF_be8xoRAO7ijF%^FXf%lm1tfvP^) zhO%FSDou|U5_KW|qduTa7B#nZRz|Sub?U_A@M33qhO!^~5>HV<`W@En;muTy4@$VM zvA*326a}+5&zb#XuF2sH&wN~89Bg55%uMBc654AQr08&IVsUr2+wJrk0jW%-aCH=N zbV(qY9JgPvw9deLq(kE6uFturaK}jG^s_jOdAwYR`@F01&do?l*Sidj0B*EkI+m}^ z1xw0D#9UG;`0=-(=HFP|jP#zl;!IUFX ztQ8bjVxZJyAnr2vX_p`rBm~zOf>7Tc$$N=d|G794@%75ja6Hqy<=$~N6F(M{l8HZ@ zp86DJr^k{)x*$T79c*Ppip}*9xiLcj`WA+ZaJkF*vmt8AWh6xW3(qrV6^HF8y@qhc zW&J2)rht-)ommdv_esm&zK$@r;UQF=q7>64_Y3Yxj4rQv?&)K3Cgd3A$%sMox)X|u z6}Iv`Afn+YAm$@OV9VP=8NV^{-_nL*EnxS0^kmwEUu6;dy>0eEhzof3?)&@bCRmnq`%2rb%` zzVfU(42EtwQ+Y!rC+F{cW(e)Ge(cdhmpOVgZ`y6#<(-c9)XG$dYm{eW{9xNL?e_6q zA6L6;9#0onGoZ!1T{3PEdIDm5@5JyF$!{gLup|F-jQe`go~DOJOJqsCE9 z`0r@>D~o>wfj7wQy*5#g-1VV(uX5Chidl$(jr&VJRfhz!6b6C z-8DIRs&f8IG{jU~Dl{$6)U05*(-3{LT?E95*B*&7Pnbyk;|0GQs>GgzU@U_7f1}Kw z+1ygl9kS=!eA0XX!gxE36pDrCE!&+A5JZZ}@p+)KK zdRI?o^(ZeHsQz$2bq5^A)%$@;gb8yT@cDyEWFtksHf4*=HYFA7&BnkZzfT)xULbDp zaRtw$r>`KDPSkwctU_XYX&5P$WPYK<*Wa0;dwg7jnXektJY?37 zcbtsh1x5J1-F2As;}^6FOIXyqpFQ<)Z4?Z6Kwh5+`(6>#a=gEdxEaW7ErPqR@%y%Z z34_ZHBQ?ysvXQf=;J>rmudn=WJGk%JyH zKN(_JPAVdeRW8fWV#Zvv!R)U|RXfNd9B9ARQmRFWx<@;y$M!{V)^*e#Q^>7E*X;+gSi|_Ih;GIhP{fb`>gWAn<45RzU<}W zN^nY;fYY4>#3ar~@8bMJ4-EPs9hjCuG9j|M6G?((4{HwmZep|z(WY|P<4GZymm?}f zJcGe-7Jn91WNW2|_v*Xg?Ur7qZ+vrO$C4lQhr>_I!7ndczHQqZ`-0I4At`+v$SU78 ze%k*g2{~6Ve5X=m`eyej7!DSrZM*jm0%j2V`?y96w^+nbrV48{nd4ffbZOq!<<)I; z3O@?jLo?hKmC=sRa1JT2Nzc9=+mzL(t)hxsc=LWYT9_8v*YzI{<}c_J7D0Cn;0urq z+jm(|!RkVh*zvDS{*!-Noi}_(PH5L9J(?f*GX3T8T^^y<$dheZC31_S7Br*GFF6KDw%5u>7#9&kE zuCc6y`&`)PQ*xJt)IVP)+OwgY2y?n;Z)g%Cesla#s^toWkEEhDxxW7()=x?gLbsZr 
zgt=)JG@}oiLMoF_3PV*-1Lt}QT)wmv=HpLaCZwmOMS4J}x9-a~9G&dm!*-ASb(=v% zA~uWe7l-|UI4m|=>B*@hHMe%EQxogHWsYdYr1#uT+1EWjcgc;;KO_P}sMEV&)W}M& zYQtl<$tqnwyip(%eO~49<%Es{u#&Ek>ao30IYNls=JM+5FHis7h=j6$W#+fnwkuTk zcC~ZOw4g$tL?&U8>4M=9XK01VwnEy46P-U_!Jdg%SCgB@uE@DYI~m&9*%_zM;Rb$r zBE3}EIh+BV8e@uPN8`TYu@rSYN}ZcoGybTasR8RC&5lpEX=l=4;zOIBYa;uNVV`v6 zb#c-XZrUBG%Y`KTN=tk%Deuawsakt2@#-haf=t=Nje8(dl~AD0qaUf~{*z1*g9@;D zH3lAp9qB=#jUJir@Z^cVbJ3$0nt-*SxGXufP+xZkx-DcBSF+YUV9x6BgO8#kCNUG= z(-6da7E2vV8hIysl;XMxij^qmqM^lG>D+RtB~O4MDR~MMl2hM1S3rp9ESWIMTaII(qC7?#pq1z+|i-OxhGh&%5HvAB##j-KjPzLOfJ8#W$^CJ-H42r zvt&W}ITT?$znwZ4%^!jc(@P*#X;JO#>3Qr)(0qb!XlU?X>EaWG#u^1=dgM!+rvumb zbNkD#?%5)x(`(bv63SZYVO(FSO0zMTHs$MZ?0e(@X`E8*!5039he(?WiH$!$(H!s7 z=XlBb%+cF$H=55JVt3jlO3wTs^<4A**D*^vwwPVr=B462t22`aXUwDSree$?klS{x zO3*T@2xljG_GSh!q<`LO2{$Ta2ILr_IebZp;bp~!}qclWKqAIHd8l62& zui_gDe^qbw*!?ZzI|C{G&X$-`jGryBxLPiuTgKCeW62*m1f@YcZQ75NoMej!yT?1v zE2!ACm)97PEu9WYy}^|1*5qKIwiMeeE~$K5@XN8v(AN7n!sLpRgAEbfJ z+ zl4I;rE8i=65Xb!>j&`@7So~sI{vntNmt(n-5_5*9ZzfFF3|q;`Nn@j|FBQs|H(Vx; zDcWx)OMVdrtU?C#SAsw&(;4ApI&z)T%#O5V+EY!@g(V}~#Ut&|j|azkB0=GDla+iu zOYHEzniRF!3_g_G9#(X*5zM!j=U-RGZ;Le-?ULXi=@3Dm$J>n#X|m7@2lk^dhA)RQ zU@u#{F5;5+vai@(EAP{|*ueLS`pafEiJg^qKehZtSAQzN*H2~hNN7~Tuk)Ig)_wa9 z$&)Vct%%p;4DS$sgNX2uhh#BIS~Xq{^ zz{+S+o~N0aUOs&2iJc{0f~sz{c6i)giLB36oxc9CTTIFQ@;4IhU2gUi^1E5{M{JfT z#)O&QU++Z95^EH%Bw*nufC~is@Cni%B>EYKeE*byoQta;sg~h>5Zf!mC>kMe_ySap z;U4wcoG-{^`zsm^4nV+cgif8Uv+bkv^51_6#dudzFt`PpHg=XF6E0rNx+h{apC8q` zTOdC2=&swl8E$YL@3oS7LkqZ%W$jVU4i=v6d@JqX9~N#Z0}H=+xLJ?qX-uFPzPfIhI#dyU7bhJ2LJ_95V^A1;>Dh3d zgS0;%f__I8Ix375Jzt#345IWECM$9!S^W?+a2lPLtH5o8PbJoi(@ouH@Yy+img=ru z`d^gW&E(1pIEbp+u8LM$Q=D1s#1Mo(ZNJZ2l3~R?nWAUu?W*zmhUon-R*Fd~gzl6- zl1v|hWEvfY#3Rks)dPRh>2QQ!gwI@6oweSs(IZFNySux!cTzY);dpSLilUs%qpKbg z9Amp2iRCiBOjU~gg9nfpUR)?TPSB(gt8?!W<`uYLV|@=H{d_0J+y*oF-rB}`Tc5mF zM)vdWcup$j#gAS5H*`KW?r%x;l54(+SRVhxH{rm~Xe4d@Vpx7LeZSOC5Y~T>4-2Th zV;S-AH&v%%0}FM?Z8bf5?Nu-Ub5TJkCQ~|DilfiQ(-ZX7YWe;71eH$8o*deHa%fFZ 
zP;kqgWZYoUeTyY}MEo7S-5!>=9m$49`sA~;Cx@!@^716%EWGr|roe~2X_WxDhJq7g zGiWAr@1f65&e;sNj6RB2*l1Y`3ZnJAdGjVfsqj2BavmhNfE!1CM0@B5Qv69L!f6*y zV{A#PuF}qgbH2j$@iS|^B1Q!<>xZ2`kz{($W><~LqV|BOALU(rtKB2!J=Su49=%#5 z3#+PeI2ww;4RN`wdWDUSH#B=+(ZHh(lOUvdQ2WmC521ipSq9RUUGb6>_$l*<^K~29 z39LLEXrfp4u~P=RMCTo-wpF`1JtXt!eBs1Z#T8+QHm$9!JZj_Jbt3F=;!Ze0==n3C zktuFJaGYz@E&b~VU!U}d^%7ic1|3|=CGVGw)M}ZiWxDwU7ois(N5N8J;pcLhMsAu` z#)2-tHz7x&IXCyqbhV}B#$SPRGAD;ZAnI_Tk+WucBLNdR=71=q=)XXczOeRySK#|; z0utsou{KeR<;18{WC@eDJb)}nr#$`0IVIA%LvFWsZ42P6X+y~mPLgDbd@Nn82K75;`G zHOZs*oFAI_Au~i|2O~;;S#3K~3N+q5DWOFAF4Fs&sANh_4zb?NqU)sT%sMznh2|IA z`h_@%a)+&BgGh+#qD9H}=$^QuJ=+9cXQo@te6q;{e*r6VJ8C>|Fb~&@1X0Kv`P^?2=`D5G2`{!%f4-$XPtC{gkZLS~j@N zn%CQPd;fcf55%ytaofX*ZyI|zKU{rwGHhO&PZBDoLED&8>>MI3kO{rtjLnZfCLjkV zXQHaEW!#VLy%sH6ByaeX_}I=d)~?m_=?j$^YhWn6Aa%}|bjx}6wUcYQm3u?)bCxt0~ zKO}Mbw<_o_>cfi#tx8z3IACI8B8`3PW&K7wFrrLiFMW&@1{c+$xLfI@BwTU`t&`R) zZRPsQCD%6Gh9jFQfk;2;y&opM7D zKiioYMKbSqMT*B$G8`>#=i+j|9MBKR0Z9DW^C|?yWxUYfMB6(9bdqWJpTdV1ds)4dmC48 z-EV%XVxYI<$k&Y1*!-FLi!PXDvw@WD*5(Q{&SM9JHbtQ=($d|Zk3p5@h>h^4tg|my zPkhYvgkd{CDhPu~Md(b(FwUPwMw{o%RTU@q%g|^vAjatEy>`bA*`fvjmB9`QZRUAP z{K^jd;x39}mJ~iR!jjy)f)mV^g&PhYs zqqx{Ldi8a#PkBLr?qe?lBUY(84}P-9-CDJrpIn&CHLC3jNEZc$33NwjPE$=$em6uvNsP8d2>bA~T_8nZPq}6(xVG({#x#jY z0Q{O%v4m(s;N+0+8{Ru{@g%Z4nwBM~aGa^dS}--h=JSH*Q?;sMPX|2# zu)9pet=e*~j{90z2C8i_-*Y}Ey>)(Se(gq(Zc%oEB(==?9qAY)5X+R)jF!JdUN*F& zBnzGB*T<(@lf>}5J^V%U1s^c$Za;D4HGi^LGz3b=*+XZ|W6r5ISxNOZ&ZOJE6lR{z z=6lU_=xlH%qqI0ak8?6XC00wtdITpS@P^lZ6@BhXzIZTF7`_+}zARbnz=1!N*$GHh zfl=8ucd3apsao(_{o+cND@lTC>?lJS^yN;uPqe=p*T+wM8`HeuiFCog*A_Luz`>y{ zTLVKq3DDx7i5CI53`Us4q3YZSXj|2)t%7z;ZSvU~Z-JN~H3_(YoWad+7MX_+dc6YE zQ)`yEZ+>pFLpi4`51=88u{%3)-zNUkw>WN<=Hxi!4OTzidN}U<`j@I=tdE!Q;)KJ; zcd{L|w(rPIXuG*Ih6{;3CN*eCHYqs?Ijb7UTzK-Tp^>`D=QhQ9rGJsp9_jX36#euB zWyTtpk#=4MQ>z=Cd*yg$VPPR-^(<67XGbco*c);|PBJVHLf3tml|lLsLU$7>FIt|+ zavuLo+oE(1S}vnL4(RW4^ed|&QS?w-(}yymUDUES6}Qk(o36Du$GxPE)rpSm5m6|h 
z-Yy#I08a;(a&gsV>jFF*6-%yO+w;KHd(;C)!MP6UIRFti8B#rX+1KYA`tihWb2dwT8>$D4(n%Q=o`soZfP8<)&99rwGQ?jtd?OtP?p?Yh@H=;waAq&HXLM`-|cK0ASWI9w#0 zDl;f-SX#*v{ew9SjRm=+7~pr1Eo1#w^0{PAP9hIKiA|AGRRKggQ3yT{Xqsd<#09!$ zP!HfNa6TOKC%-@_y)x`6P0#U29*>4@3)z^XXHFask0tA%^w=QiM1gdRD zn-n9HKiaRE;#;gKS$r1Pz6^hAJ1ZvhP_Zm_bXI*u7kz7VIQ$o<))ayS8>1OXBYLwL z?4fNaWscakWi^HS*uTk0*OE!vJI|kBhcQ-J3YvsSJ>o!lJr-P7()^b6hTZ|f{fEPX zeK_78cW@Ndf2&q;gMv+bx`;GHoq<}3?cV>4C*U7`>E+G8!(Y|dcTjV;zYvet1L)#b*qu;c6wH;dJ=|9cirMG@=-tyH zYFA}%RE0h~uNtV{J?D6peRd`c)RpdHgm)hg;6_YBt1fs_1Zsv4ztT0S zsi`?tSX_K3wX**UWS#}2Dt*v6UDL)EFt~>J>!r~x36{5mlsg&e<=#tb_3WAag;+_5 zy9|?!l0YIfw%fZ(f?k{ZY0;P}X$+gI_u3^{z#-wSb-*@s<7cJas$Gn*9p77%t+~5? zvN|4GNJNd36Y%d>-pD>H%0dkVELbS1U{37am2d^@lzlW+D&+Jl$jRiES^e%`-ufd* z{YFYd8Hm%^NN}%@pj%s87lZl(~rBcD0=`k7t(Q0GQWgeMZSo-YjvIKX!5oDdsSRY_Z9f+9$~K-+51@k`^8npo?fhfLjcIT9 zVyN5i?8+7j06de`YPW_4yI<(HgZ9Rb1*oQW6lD201q2N!a`wg(+xu4*?HJMH9L_k{2tHXQ)ychuO!dQ8zAJbZm}tw1#WztsFbJYW;S}Grr+kcAw~Gb6)65%?q5f%yk6VDR6|H>)9`a zbtf}guNt(NqoRK~D&XZJ+AmReX6|-8QPHBsWNehIO=I#vl!8X5(l9-5KHiQ=E@<)g z+W2}FwnTa{Vr=|y_c))yKp;tXV;8+!ndPX)0@(TS^m-v<8k=2Xv)j9?oY|4a1}Z>D zvNj3`_x~kw{zri}ALPqlH*&>-Snhe4z3$gDcWR^|YX>d+YJEp%XJN$`w2HB1fG;3t ziH-re`2xlqf#@O(MM~6(0S@SvW9!OVTGEm{%iHWz<{nn`4?MrB-%`Nfav#@5FO(`Y zF|*wH{oYrsunTR45PT?g={5=t?%YC}H$UK@hn3@=bF`(!$suF7uSg2x;t~KJ>zh+7 z%Ot5E&VJgWWtFeaj5}$ApXz0-K5G*)9kMQhJ&d}?-*WVq9b*nT%rLEFA;Q@O#8 zMf^c2@{xX&#D$K~X-b#g6T>V?PEv5BTa`J*Kgi6xnbvMsbZ2ny-#aW{X3LepVq~q%Irq;(0Ln zJ6`4H9f>0H z!L**pB{eU}{n0Na>>wX}0g1QnJVV*$pA7nSFzE8~b!D_9wNpl}^f4)D6lU@Mic?UInyv| zPxDm+W4DCBRa%yN>8^bS=IKQolMA`|?*c~)+h<4?0r-^{NN*GSK zW|%idDZWfmTKb`)*OA;83-Nc{3MzYvTsEuPx6GMsAq5p{ZEMr2 zgT7#rlCOWBW6YN^7hg;XF_t#m2ZqSd8}W3Ha4o;^^%EIXV4A*c@;CaopYeOihuF1k#{^&OB@<0L^|zCcTUCIHS?97m()tH8NI zQunDBPz0U~QyW{mPtBAgmTI1kPZ5E5JzE@?s#{g0Y3}|v60j}{8`r~>J75elJ@^O% z^IuCD*4#G;1%Tp7*E+9PdGlx23@;b@q6&sT$=ona(FQ|%eVkTGsZE7`!cJey?bzhv zU6Lg=cR)zh_x958{@W!_m}2bR8mexoYtMKCm?PvQ@_;EDlE+YzF7F}ZCi{a{{g`ZM 
zSNv8HW4itMC7grp(T}|Wth`UmDvgbItQ63ah;L!IPn}C?e?F&e&2kVY1}OAfy8KChc_ovFy5OOmJJEcGb8`o*GxDM7pZ&+f0?hiK=DFZ*Gyh9FUt zB$>IU;oW8!}3K_U4g>a0Qn4^rLSGv#Gg*CtB@Wf zz<@TRP1@H2cXJ^H+#P`sSu7EsCimVB0k|%J{nUbnNQ(L?-c(G+P_ayf(xn|D>CLl_ zfr<^~mgnM>gnWiQj%rO&(10B=#f$n6O;JV{@cMOm_{7VQ?f40;6nn20N+d$C{!9A8Ii2XnmFJ2dL>Lg zZnmPQ*N8#AnX_oL)xpDKQ?fp5qI9gEuwPDf-TvXLO7XdLR7G%4BY#~My62g=pY#=C z5^`#0DKE%*XxSxJ-4nCLbl0-)oQ=&rK+ai>ZS6;eNatQ}PSM88krq2dPkLLJ2dxxR zPC3|OtyUG3dq_wLldH-A&s-Biuy*I2&;0mkll@RzD7igD3&FbFI4kynf>&8P1$fw= zrw4h7zQ69b54_yOFw~Ky_Dr(Tim9=Nus#45-kKt2UH8q-L~TR3kxYd^J4kqI3BZg@u97J z+0;lD*IM~1;8t!G0sgM!hlce72M%~NYvEUlj&yGR1vE?)7&Ei7nsOR(+0+twy|NuB z&0{4H;MD74hSS>9fQ>Qfz>Kvlt`J7fTlW^Pd^UBDAG8L4UZP***mH|6p*!iZ7|yf0 zwpKBb+8{5)y349*=~M2G5%3k?X#MwIpfxt`uD@~H>tW3r8Y<+&AbP0JKbIIR+#J`an`2GEF$_z=MlrWfr?zEkcy2@PvWq4>VOgZ}yFC9pwHs>n}8KBpdyD+8F8#gf&r)W?OR zbLZmT=YNnZ%Dp5)y3lq=lHF(EF$gr?syqMH)ye+JygrFSxMt5F%%sCppQh4{L2oNh z;E;bQ_Wkk!IywW-<(ZE-Go3|ctbX*!*73S5CIcOvl_Xgv)xW1Pj+8YE`%EKaVz%#)m=A}qU)JKK5n*F@x;9%jaGNhxT5 z{4049`9f);sW@q@?}}il=)8|tMb_=8cQm0fG1Iv(KHZLc|41c<4td|a4wx2PY~iu* zT>ie0FE$JL#&IHP@ufFO%^{ad78PI@`XAm;7vejDY z<(cySM&$Gh{Vui$0Noi81G5=pqO)ilY*JdRpOn!+x~T?tzi#ySS%UWWH4xTIr<)c5 zmfdr>j;D0p98vKlZvDb}IrYps!a80cqCqlnWW{>g-d_vlG6a(_@6Su(8|VjY`#~d0 zgG@TCr(SIE*r$}_y)WRa24iFG^nR<54LGr7t4gyaM>n!7E4)`_>K*rR62*6^Dhnm2 zVcLkkb=tzRrDsQ7p=C(nBz0OU*^r24t*=KRc~;JZsK51HF9R2=#6f+{ZYBIQz4Jt6 zx+U-J8%B8pW}fr*!F2jtYxsmb_rG~Y`c|c$+`PiROI`u|WT0k+HaT^uCeaq!@~T#E zF~tum{_h7gnhMgX$ac~~n$fe+NyTqB+?iu8yOvAfPJDilh`5V-R3J%uggf3I6pr>K6jr-m!aX7B@lLx>%nDw>E=RS$@*Mf z(!i3Et1#wau4MC#GamU57fqU(R-ZFN#9z26urtLa4S<@wBcJwc(8q;v*WOE8z2XNf$$+QTEVzgp_= zT=~8NL<=G)$;T`eUzSsox-JV0f#LFnxm&XbLSi9Dr3}l6;f&0U(5hvC#desqy|LNL z)dfH?9-GgI?tQR=`3i~u?JX=lMbbqfeTJv~UvFAzpkZ>o;&IQ|&@iymUkCp9` z>M^Y^-LW4z9YI>s~-*7K}w;%SDK6@#?%K`{=m#fa> zWrM;NFT^)}SIma8^C~M%vscziw>GBxMi;toY_&RjOr*}PGwBNF-EQPL6VcFTGg#$+ zii|!ktCui>+qx4y^0BLKoaf9Te@}$uSgtzmJmFbuFr-Y=@f9tL41LR0XaM@}8Mn8B z(7+f{sKmSvPI)6c2cst(xjOfLw=RAU5i@r+H3e_klo8@%WAuchk5(r)z8@9;`048| 
zMc1BOUERbI5^6G4qs&2fd%mq28*M`FOsW15ta9*0fnNW4av8$;ZV6 zlDzOrW!6e14BevJDNAX$=FboG6%CY(;>@K6Cvg4qGO<{xJdk?}gQKI|N2R>V48Z~K z7*BQjt^nWP^<8TXeGm`rJbT_|47&pW)l2iUvQ9s-7GaWRls(FO_J_)D7{h)#x#|{a zovIy`+qZg0@Q zW;|D&Be>}et~!ZZ0ui2DRsIZ;**#t;2MM@*6(b3N*G>oIW+%D5y-S~%bo8C_8Nlh6 zdWd`bLw4_+*pG*?Antjl8^9T2l#)($s>A15?Q2RZl@E^$gYpz!-anF;b=s-i=UkJ zZSK)1?z&}Cyo+NtC}sV`6}^U%KZkckTIVfYVv@Dlp8Qhl>8Cny z-71!8U1wXZj*}Gb8e0+U`&*m)F0+*38=x@T9rUHg-)a6c!u zMF)cls|;0UrlyU0aM$|L%fPyePs`rCd4q>oGhWpKVobcT6-ckLDh=YKan{}Wv$zvC zOD2bJ!*qKd1eEEH)g_dlH=coGHDKg|$$XD*;WZKQAg9W@Z_L#guCC0}sCv5-cP->t z@8T_OFusoC!I&#@<(F1(7QAc}Az2bwnEZ??45eLJ>!PTj_`L8beU=7s3wY0V(FWIo z_czF2D3^;H`H+xLL_*1{v7_kp@6z>sr@S8_huL3JW-KJGu~236erMPhZPx}hY=)HO zy`w%D^g$$yj@X``_KW_{Uy5B`iSdbjR}et^QD9ea4n=NI+<1JJUIeA{D|18IrIf*O zntG+TJ&ne99mmBA8@4TEzST|A%RA5{8e*b@H#pZ?@hEFOqHn>%oyV{<;GPuE*6_G* zhy1m7JH5A|Dd=V%{zYM6*!-VClp`c?)*}=81z0JbJ(lXOGpFjT*b$CADciGNwx^g- z%Zbsm7G!OmXS?zI(*18N<|QSrJYPzXX|ss-*z2dXzxS40zPc@j@%6ZDzolV0RzSM^ zfNWmSyibmyQ4eNug|)Aw*U;oX7W2l54su1CSDkhT>wUFQ=Y)7r+bKqa#KDWVYmr9n zZGTZFzjfqAuSwkdCrlkLvYWvdH>GD0eicg-N4Zbh@EE!yN|U~e!I2#nM0oD?AvYk{xlg~H0&9hpFUf#6{$F0NxzP_ zXt5n=ez$qj+g|M={^|89-Gq3p5em>&c{%Qm_Q&%hMIKXobo%j z@DLf9da0YMpO#<<#D3W$rKhXvNpBWzSGEo~y6pka5YR zyz`Dg)cH!xSiu78p@;lD!;syC+0rNR?YnV;S5ng8@>uTuITMR1CfNSY^T2$e#tfP7 za{RkvzxW&kpN4yt{-TSoG}Zzxo~?%~!MOv09HFKKNiN|R!I0dN5x>?105z?|B#nQ3;ZUj^e|w78E!Qa+|0VZ}2KtSr|F!o=c=5l{HWBguqfjMep%O+nt7upg z)3ripA9LnkW$WwfF}SRWJ-Oz=kY?yW#wSYsGivb5ma<(q5uA1BR2p4D_UX}Fg|ZBd z*^Dk`)V?uwf6P8-&Irep>v^qS`Wveeq!YIWCpt|cr|)rBbHBv12Aalk#D@1KW%$|d zW#*kkNaaBmD*2zp-K_L%Gd%CiVlRbod^wK#JJ@AP>8{1_U@2P zqzIMTB_>_G{g{KocT$2n?04SVR@b<3P0C72I@cj+_FDZBj#dYYkm0bZyr_OuL{q|w=S@$2^A?weY8qk>pOEsBihk|E#xdNJ?8R5N@py|=QaoJ)IhCCvkFCz9?fpwZThMwYw*dZ`%HtjyHRi` z*B;4yCk7Zt*Powo3s3ocX>m3D2)LqCz}wz&v-$s8V;MOlTmCZx^jA+dC8zYtON005 z7K3lo^B@u6Ru-%c>lnxFGr$C~!Uo+!5YtBU8F^TDLip7_DqjrWvscg#nws%jFw};Ew%*L91@0KYahYGX0?Q{^LM- zX6GiAQJfKh3!9hD(yIX{G%_qm_&S#D&Vh{6bUFo7--rfwF%Ux-wQR#h` 
zIOi_S98v#dHV59#*4ymHtt)JxWPzvh0T6-L5tPh0@A{65KOX^s0s}e`c@hM|k$&JYTrIw3ckiL~jUO8) zS!GjM@3z*hHTAJ84b2{xmC88WBabc~sQkEVyOa5q?z92a)AlnqA;fngNJ!V%L_ZwS zuPqT;)`Y z`%b1owWSg|@b`%M69yPWK{8GulFs?9L^cT9h0j5IApcKHB7p@!}Q6fambji|UDGd5QN1-70Kji-)vSz-Q%~Tm&0o5|8 z^Fz9_p`W(#e`Nywjuu*IyK|ymhgoWBKxBCZyn&UVwGmdKhyi*>(F*%RBO=!L!GpQA zx->wJ9u7JBMw~)K>dMb2jcUd&7xZNX4w=&~x~4eiDU@N1D~mc^B1lQ`3(aiw?cp~P zV-4LSdush$ijEpImM)Zk2*tgbp?FO@D)4ei;|Oj@KJMd4nfp0#JV*9Hbim>q@}qkB z!|Htdz-9oHpTj#pHOjbAmMQZC4Bg3_=z-LmPap)$0tO$)WEA|gi@z;Zlmn(vQMs{_wf0JwEb>-TL~WT?aidvgw~B<}g*suC_Zt zEjeSvt9u9S%&LYhk3S(^%1}@^I%;8x&`LVLx-+HdJBPdUtQ8<y<Fd&v?*NLJWDw{JHjEN<_vW&1dN?nZgLne^%IqEv1wK=9(#1$(*|>6Kgq>osU+*Q`ul0y-1gRR z&Y65HwwgL;$)?i}rzDAjD=%K(UWIakS%!WQTfN9T!Mi~3|F`D|w9VZ=I$-o_}tz2^IyT7;^j2iznVUwlCAg`9t9dl2o4hKgJ0imWr8{|R*}xbI|2J%yjb9p z5tibr^%a)i_?*kFa%Z|)*-VFEA3T(pbu_nC>!BfPqF2+#dVH|wIsdWHB@dE$xHS9* zsh;B}QaGgJwwN2b!SUIDo#PNQ5pgq!vtUE2_0)fgvlLPms!b)ZP>_%BqUbdFU1I(M z*>zgAZ>v`}2QoRr$je&NAYMeUhSIed#GN|5S~YM>50_6`S6C}`waOa(n4;8SUj5CTN>>#tJ_&5u8KV4AB!RP`&%lH z2%B>fDIdr4g-1U}&f%nJWD4C`Q`C|@eNLCNFt&NrHx{h8y6Wq%FFVCl@ z_kvPU^jKZPj?8U^$H4pX<;xD!zfp8tK^uadhRLLvt0G^_K9%#3Gl3PjMY3bCWH64%E5 zO}^0|spbzg!6_W{3DnYl$kqDZ3lR44aSJUH2*$>ME8b{n`Erc-PtqdafUJAOF_aDc zy*|(!84?#GANQ&yanoM*H`D*+32-h3Z`EaZ!o$O`T$_)q{=qS@LWj+?{*44*9+98| z#*mWEy*9D?b9CPh-?CnTnMlQdd3u=*N$$R;D{|IC=7UJNhdu$E4R4PA@AmtrdUF7$ zldoDm)89`orOo5I=>K7g_uu45UfGb72pjqqLa{IiZ1oQpP#mO2w8cUR)~0($`449~ zP{p!yx|r=%)_Vq1Qm`LhUdB%GRyHo~OtgR12Md8})3m{Bw<6;>dBJTt?lEzDa>su9 z#P{s+0cU2bO>ZZ=`^gTQkh)f2KCe0~`uXi5`v26{OGq)|b?0!WGPB2yIWb8M=r(04 zT;lcBMKT8CqfNLc?Xfa?vu>@KLS!>;1DOw|L=bYFM2q1bGTL+b3~Wdr+bL3Z(n5HY zw&hDn<6>5sK!uCW`U)$KU{cC-c~1YD75t>;w6wSr(SI#O%1{bcvfaCNvYldQQXxu+ zt9lD^t@u~6e1^m%Y$!ufqMUS+532_S_OnGScQuoINk63eK<&aOoanO*%Fm;HN_x0O zkP`}RikGE$IyqY_QP0ZjM3|?)@VK{i!6c>45%b)Y?3}Wo0P68^Vll z=C{Qc1(YpQ&L6TEKU<&M%G~A1SjEuS$_sdtUw?=8ps*0EZ-2aP&{4*BGr9*ODlG~5 z@?EKva6043ic&JJHSbrp+78B`@KT=OL{`U>1;79JAJmWn8NoG3#bV36w!7J??kUe- 
zA-}HErZkft(xl6S&XW*pF+KH5=0z`yPA8$s+ppNINUut6yVIIWmaWMQy`1-KS@7@W zya11N2Kn_)y3$~Z(FK?1_U9q);zKk)uJW|{e=WkW7|c?YIo8i{nfUJR2^U_|Wpj}R ztqB2|JRRnYE^0kVM#hNYh136Y0A!&Ni>JPZA$p6FeMtOJS6^Y|QiRlMUWCUE!c(L5 z!Y3muSuxc&i&E``tAO(--G4V2-;L7=Wy2;kcAL_3tn)NO`ib!aom|X$s=UhV zR#{V|*(_4A*;1H2&!z)SFneB2w+QKnNJ)|uZrELr%xJ|(+)(la@s%L}B+Nd>q@z2-XrLuo^Yn(1jfXKDYw!$9w7zeq-xDII|6%0c0t>_Tdpg zI(e85podr_wiO?ydrVd%b~qQyV4Iz(H(wG})=fSA*QB&i#5QyYiK`2_Gk;0C*zsbW zn9oaStImZu|32&0=Sh2j_Sba;JvS%vnoWcLUbDqYlL>7U6f&j0gnx3BiR~)YnqPI= z@%&Acc0;)y|3*hFI6~X|(wotVuDynBDi~O1C=DpLf{p@=v;B*KL(HW-a?+@G~d)Gh#5q)jayB-O(J|Y!~%q}C{n5aclz=&!(NEW#~f2g20ljQJUm-xUD5lO zmMeYYDz%2|g%tclyL%&!*GTp4^hgTbWBD2qL&z8gBdr*%EX-QRaVz=G3@4Bc>o9ND z)3mC5z%2O78{Z79Hn$)Vp_u1?EAMiL^#9m<^LVJ&_ka8(O4=xzN=bFvkZH4&Xj*8I zLdI@rQFh5bNK>>~(kUqtDPaa<-$#x@heR@VDkThKpRwzEy{s<|pYQwg`~CCzJ$|3_ zc$~+XW9D_g?)$p0>v^sBbtehpYAn88EuQzJQ*X`YZM$4JYKLg))o)GhyX{+L#}NFjlb18)2Q7qZ0`@6(KRZKW%H22=XmXh*Kw{pONV--SC>>TC6iV!;nQY6= z2s9mt|6$kj9OC5&tvu6l5bKv=U3s#Tw;~ma!GuodUTE`>BUu z*}tDdR_OrmxTk+%<#QZ~yE7+2bxWqh26`X)oB$&r%Ux>A)Kl|UoY;7Co-Gw|(Ro+@ zb;EV}+7&hpd+1+}x_91gA0-)5C3Bb-mAZdU+e8>F?BG!g`zbexzb(M+AGP`SAMM&; z-6W>GJU9*tWt=!Vi<{WQtCFNPPSsGKHth#jD3{|MU)Fv=!u;R1QY$RzrO+Q)eL3&=^_vSIh%=3Xrz2q<@5Lc|Gs&4VKFC0TKlzL3w}H{QjSERY z*5$()h29UUoxR4kq?A+ADMQfPEM{*`iiN^|#>8&-L~7X*URKfBTAR5O~@GW$dF9Eq8WotC8aBeY<0{K#*jN z#97OHr7TgiLJ=Hf;TvL7ZL>p@capypMh`x}X$R$1rh~e@@cuHUYj3T6JVk?gnQk6vY-?-#?#lb+8{{&?#WdQna^?DP zL58tlOh91ZPSp-+~pb@y&P1c7Em5E(nEf2*G1W zpctxeO=klX`6m>^R1OAi1Qn6Mk0 zwnc_t(`+e287V1G^-`~Gk%4j$#Uk_L!4~JwZ`9G!`sacmHQm15&N*{%WO%T@wbjh7 zr7TJ;Y502R^^1bk`3slXEcv+pfK9lqSndWY@sC|>+AzCX{^KFqrV5Sqy-w{kjbrXv z5|px7s3cnjx9df{iP!_Tr5te{CO8a_^RLTF&>(`L-qW^a8qc>NX=y$*>opN3V9$ME z%~%R`|H?rwR$6cAUkI7fR&5Uoi&oF2sOM1B-&54zQPi_2>NFWR{h8Jn-(H)pas5R= zfapU0^XUU0gyfu`zGao^V3+oNesvt12<=LY>;~CPu_UOX>wmcUc-jl2u-o=+HUESJ zZ9DsdKjQZpb2yJ)&Xe!b_0Kjf&q3rV_r^^>?S}_g*Xm+r9Xh_HN%AoieFw?(0af?R zlycZS^7lOr-euk$(gpuoW5Xc(=UZO5{0whJ$|8T|bfUIU^=q8ET#4 
zUeRmwt6^Ky8k06Z*9gpkgLZ46rBsm})T^c>{w{ub)T&S{iKUUKCv^ml!I}$NC=TaI z5(2B_W3aiy&0#@F;=Q?omI(u%xbqSf7_UuP<8?QV(~xHZClh^)A3&8nsf!k(fsg9;* z3cyHnTIGcMcxe<|I7K)btV`roF70@V@$LHbLN*{&nXoP>Ft8A-m&Q-8&Sys)s7r6p z43HPbvh2TQk-A(YoHe4{{EpbEJd75~cIx_h@;a0?$5Dt+#X0QxxDPsX7??H%K@X9Z zwAkO!03LpJkRU7hfexSTMgPE*Q!~kV7hKm-+dZy+g@zHF>k3Y}(7P!?l*5@OhM8&) zsoM9jK|q7+jQ+lhWfj-nz?_`MH_aCi$A$I*!SZ>fh3$J!X`6Ce%g{f zF{BLQmJB!&&_KU$zvF3dBGqLlhiS`;TJPdo()@+m3d|!8R2o}5uaOoc=Tdcm4ZC~{ zx2u{VxgZw2^dtec&_M^E^|Hi-yXGw4Y)3e!^x%{u-vwPvh0{QfF<!c&WjeS99X;&zZ^Nx&Ul?IxcEf!znk*QWI$awI^V-yQW z5uif?{6i&G=0xiXK}63u$c2Jz%jX(J6=1W)T;yohYH~I<-r;fIara-<$(D^_ zXM;Xj)MZrid<~Sg3R!oF{M7#EhlgDiaK&Rj^Y2>=3zz_FYGwsri7ipww|xgUnT>uw z#Gh0Ro7}R$mAis2T9?yk($U5$*ViwXyT|=zv*(#foh4@<;y z^{cXI+#R9H`m`Wxb4DWKB{;ieEHcGCbv?ee&XQGxOrMV)};M zTnqKNbGN*!=K-GZX>uM#@s@!ZV){Ek z)$i^s_p64};Mbf$NUjBte=@6+sYC1j&RDoy=y7|CZ`Y=9`=;WT*XkmU5Fk#^8?lEV z@e~oHoA#eG*XDcl^2I1pO^Pkr>uBb;>|=k!Wqt)0Wx~rr%~vldBe)K;uKk8Of&#^` zBI9-R?oA&xmb2yMGfl2yxrWGtn}RVwp%{#BZ@n+Moi5s5o9QCMalhf|rWkwBKI$ym z8G+u*SFU(~QIL4VNyWTW#$F6&y>$lT?{u5%vMJ>|VleYEO*ZH|cgjB$ z8NpLJaz>ic0aUVyAbggd@Hf{0{mr(cCfZ=drhi!B~*dx(7VqB9`JSP z`+mofcC8NH+YK>o^@a?!-B^4vs2R&@vLo;b?E=(Sgqk?A{-_hQ%=k?H%P-rWFIZ8a zUG-_S_C{)LC;xlO?RGde<>B1hR@?IQqJC@m!d`MCD^8qlpeK>lTV+b6sj0NVxd+{3 z>OgAMS&0h2y1Y)}mP{Kl_r4Gg{S2x)3NdPHTMI-6kldk2uWWR_ApPpKv4TfaURViG{bIQ27etz zqge59I)8>!gRe%JY`Na>5U_n&ETFmXd%6q0TY99Y`3rZ3VMKe+e)sm0#)>k1gws?G z2@QtSHU_!ApwD;i+|SMvU9UJVlY1o8g%wOl93YTm{5YIg_njg_2;?lkY)7UDGe=t0 zop#2T-3DgTx`Pfec*AxEonrO-anO}9smmG0>t;CYmH|5n?h3ruRZ&u+%EMUH%4%My zCp9K^C4e(q&E*RxT!?=u`_*fJh7YxP=A#@WgCz($89ZEq6!9J9y(>Zlld)m%C&ZHA zAtLxRXeZEf;dgDieZh@7fbAp|Rt7OAL+vCLP-jH7A%n<8BX)@6<6069akY{EPUkL` zSEg!nQH4G#k}~*{Jo1UnF1Bl+j(VW$paUm^@i&b>j=OjE;S@%fY&n7mCX zKoh1&2cFWNv+hfyi4Qj0F`~W>R9U>;c^X(wm%%LB`Ob!~OcBo(I&@vPP3G!1oE3o6 zHNl|vuW$9&$vLMu|7KTPV!-~Z+Z$a_8Rk$A zL}IC5Lk3V)PX>P>VCHCBh=7^iqQ^xN;FC1?Vf8hRPDpmbH21Q0}-kOiC~SWrV17d{Vk||Fu*v9;e1nfuda*8KG+BEsQ)JBoyFFOQjev)IS2H^48R 
zQGa<+6r?gq@ya(Cq9O?wCPEB56PPbI)bZ{h2NBq+5)QZM$_-|~%r;104R(HO`8D&J z;9(jV2T$5J2lxI8uvFls2z;VwlsG~AlY)yo!jl0@xPu!kGddMgYhewkO#Tmwhcl#K z9C6@FqActt`*x`AqEnfyOJT=*&|>h+3wjKj2^{dhVUTLdqv-c_tS8(7rj1{Po8VNiD^$VFYfOi_TX zifivKg7eI_D(9&Ly>>9zbKqp>Dgic`&I)niWUJcJ#{I2IZG*F#mep^0n#Q>6^lhNg zrN~!WQ~GZP2X5XD6uaqh;l!vMk3HpyL>xYddy$aXQ87p7LF>$f1b%@6ZQq`d`vl#- zQEU@wUcdj^YiCzb`py;>(MuY08o66SmFwU(zoe9w6z3C2WN?MwX0qgcRBMtU5$(Sr8DJnqPgtdWUXCPp$`=7WuB(c zVS3SyU_X)O&|*Sj2!V$Y+O2wndfmd-RrW(KK%zQShV-uryh)5bbdw0aw;8?)>@O|8 zc{H$u->Ru#*@s*P5?T*&^)Toxx&EJz#K=r}BWzXeTi3TWb*a{2(${{T=pXr27|_6m zg~=zz5n6S$weMfdCf*wHJ@R`D-_S^fqSO>wSDyuZ>(rHch00_ZTmgKu%dN#DX2m-W zM-~3cuC8lpA4P*qGeH%l1_lN))BYm3^>_=(rbQzkmaRX)kN={LXMyblwpbic`$u9q zq2QGXfw-ZXKTGj@MuL|zK_Ui2O7{|JN(A&8J}yipijjZOZ#l3!2*VCND+dWq%*0ku zoTq?Ii=}FSHE;Qz0lJ3&n+FrLMIAXqV`(d1v^7-H zSU)|OV5^^)M&MzBYa6_TA%7LS^$EEJfmSU`TU&{n&*j~^L-f?jJeY|6ah_&kNCWe+ zs}CSLRmqq2DdiUxalY!G!cv z#0VmRv%3mz9|NLB0vpLYUKK2h2{iU|i^i2TfvUL;d=(-tU#;_N9INuUbVP;Pgr8q$bYL5+9}GeufK zf8Z%yIgWJPS3ux6?uO7)PW1au%sSW398AN(DtsA5y_BL}1`~U}@;nU?@i;zr6L2n8 zqy;&IBLn`Y6*#2zqv}_HuQc7JSyc;f$$YVMQ#hu5b7XSTpV`FfA@{$V$GGx2=!aZK zmS^CUpNL8|_FmB95xJ3wzTZngDo)reW8EnG{DzptMocKT{U;UT{rN z8nk0p2?)$P*Krl#vntewpjM{X9(2&LOBGM!ZVt^NB$Df(y^x-6>b1~FGDd(EQk0XE zbIqwCUjWZqFx2ASHpZAKP{ZL>aB2;bF_}f#fI^z3dNykrn2t>0Ohtq-uW)J$uwb9s zHqu)sY5|UhtCz~ywQ8nfMHrzbM6h%V8h`yN0|AtVzrbB`f5M5Zt7}R=#2_Ts0S;t}zsGz}C-PEjGr51sM8x1g4R=e9_SXqOy$hFvWoqM**hfQVI5{k&?$gkwdh!^{mLkk&F7G*~ zq4*T39=WG=Y9-oG{)jkqXQzyICfn^;;aWn7_`B* z^hzKvAu1fx*jk;8rK*>Lu~6m#B$XpK9o0U2{D^h=;U$%C-ul2r z)cH}HcaBwmlsM>niPiIN#ChoKf{iryIo#lOTqze;>`b??Xm3$z=VW88Uws0|k z{vG1%M5mye*Xh5{!{2B`z3%;IThOJ>{ocTLU@H2xLx|&?#Vdcr(X;}wPubw%fx%ve))eOWFNj5G zcZ9f-z4%e{7=5CEg5dD(M^k8;ZA}sf3Z~&o+@>I49)~G#v;`ON^JywhtxShN5DzkK zdjmQRJj$%Jd@u#FiSWjqqR<$CXkSL@GHBp3#7j2UNO3xqF4y=0X zgnPjRp4QOeX6toS^>2!U_RK0g__UOlw2B}$0EWFF7x=rqX|zWnssuHEibjQE&^c+U zwhA`ej>T};5nLW&0+t?VmPAT4BGXSztAmlaQW+kA^A>5kcvG*j8*dOnXmzCAdix|si>dR_huj+j#XiKiX3$8q@Ex#H2nPq=+T<1$rtUlB71iD=R;7jvR 
zCJ;kKoFMiz*n;mabJq6lW|)HqZNkbh$;sd^q1J-T286OoSq;ZmMb< zlBi6+vuyo+7cEZqrpV-&7@u02Q}RJOh%Y+Z*(KFvLfWhnA_`sM(V#29@C9=gFp?Ew zE)j#|h#vd0FSw@r5F{eYkdBwzw_I9eSfAZ{F!KHmvEHCwB|iZ^!5vR&To zU;ioP^>(iy)#SCuJ#2n6GfOXR-Ukx)%2C25LW1lxmw3UO*8L9Ik5-7??Yt*7q#Aahe2RSoV(>#Y;5bNxLW z-m`j82j3Io@}iO4v~&=&sC>*i-i(m7e(@!TImz0GkhQ&P&3+VF-$9L(U}mZ)VgdsuyGD|ps)grR?WUyM~E(M&*bA1@XcDGI$o zaO*A0)p`*m7nbOOBYZS{Wl(pyrN&IsgTIH8q(tuq_XOe#xr*lJc(HQFaa8TymgXPL z=R4B#9gz@o(!->M*pw50*#oUnhVU;Dknzm{BY6rw-kqNUWi*oT zhpAumE{>mWMM|7&7SF|He~{)Oh9b9mGU^wnUjt$u6Q`U6#0=J-2qpBlTB}u#>fZzm zo(*2`GkK$Sc%Naw=HuRvt07>C0^^@=_bE)p>s^4`{IhtGab zH>u|S1?(#+=;(J!z?r62{GcGKb#q;U#DsLs13M z#JG0gI$FSfRJjDH8L0xPOO!%RGWBdy21}w*K9m+7XU`~&XnnW*kXVqHcNS6^!S2l4 z;1{H!q0tQ$*j;;6jIl{OAVBVn#CvPs8|o+qbspWF0XL}NMM5B zsLsmZu6G$|4vP{-JnaGl0%uoA{{taek*^z|AVX72>z;{61EK=er7f9qAh;sQUq9#XZG*vGngjrJ#rvKFRL4pfB$sl$IF{z9{#LrFv8`M*SE}&mq zBL);wI1{WFN?UzNt=a?0F1MsMTZps)#~~#*8#BWrkv!qGHdek8Du9M3nlU+K#9oE_%BlMH9 zpsRk!#5SYvIYh1o!~env0N>cq)OJ{!pE1;IgX51;#ky)A!KEsL6egk~AC!y!ii)yF zELwGGEcsX9Ay>J<7S~}a29hm?^bDi0Yjq!nw+oJzw;Iuc)hoB1Nq7;I3Hxy!y6uUb-T(lm@f0H`Gt zM{xG;jio`KFp4(8t?PnIeWFTEhNbIZHGsOvFL^++21iq4wL-zc4VL5i_E$T=6!34k zpf-zmjIKM26H!}PoBYvz4Uev@IHTN7xhY+M6xtfyvfYU2#nmz-h(wIrS=gY^Gx*+# z3~ECre;wqCKFW>s{V3a84l-YacyEA3t$+bh!(hyq(jJ&NFKUWPuFHpFjN*akBAtk5 zP#G3<-9IO3ZI5DOD6c-bSc8kYa7a?ylj_1W1>klmGg19r(^kVDqnH=MLGaw1#Jc!- zD;M*ET(iL8j2<}-N7k-7f2&o#ZP0Q+MN-zLN!jsC-y0FfPDfkY#Jjdgrc_NqtU3dd zr#XY|Z}nebT;8kaWW)}}?t`NTh_bf=nOK?pdMlhI9>a*44C8qpR91MTo7Dv8AWtph zsO4lG7?y>*yw|B$tVUePzo3MJ;cVP2UnhgF8&zgmn88;A1hWV0Ws=(ZTf(z3gI@t? 
zxF1JR;3LG(@~>0A)BfapN#pLqgUHl@bjNe2;YC?X6}v$D@trWkXHxW~+}lUP{lECC zO>o$%bo& zNC%={bfBJDYQgMlgde~e4I$z6aB67GvmX83Q{MaLRw2aDoEw#UJx z4Q26VqlHot9Qvol$sI0@GGG5W3u(Mh68L(>m7sDu(i#&f=QEAYQ9o*ML75TYNzWSJ zp9i#)7SCgIY&f500`ks~^e(RswT9f2NH?Lp5eaY}lh5cm<=a{Iu`JPCV$I*>HMkkuJ{V|0P!H-kxpl1@IO6pUqcoB;ws9Hx)=*ZoC4i=%0cYZ zkkWfg=#Rc$%b)4dA6gF0DG5B^-Om#7qCavnxqn1)b=JSyMR<$Ra2=ad*7)(snFo?5 zInKZznXDtL5ETF?5#Vn`fiI&i`xI1!MY`~L_^5o$U{RoqAAsLT=#>--egM)J45#12 zy*kwZ((9et2Fs#! zjH>)w!W{zDsJv~6vHJdrO~06TYFZK#C*b(BKM2+9sBlzQfN4R8c^sRO<6wIf&kLjW z)e{Oz>sE{XwQ;ma7B$iJK&zT{s9&OF73BHbuHI4{D6*{bHK`&$c-M0 zLIMG!G73Lki6HjF0u$BAzl?q`FacZp&E*KCxsT>20c~P(^rrd+{Xw#VGiQR0^u2ad z1UijKWeHl)YG*95D{7?D^XQ`C}P5o zZzD}x%&f*;)3LS8mF|H9TFa+z9QthwkR~L;nzg9C-}Mjy@6^bxGY%)bQ2^a;@}%k* z3d^Rk7wJ3%SC{J4uqMb36I*=+)?!7Fy*R<66H@`!nm;=6=_?z1^nY1vI%)owwf?`v zTEPL}Lz(ARf^whW_HnlE1lLpWYvSI?vyf;}M1EuN9xzwInY08AI1g7eZFMVG3V7)M z`0wch!Wjw#&?OYjL#8v#lu%{Gz=*eZwh>W3@QlYvZOpK!@{A|Nj@z`8-~K>~WHL>! z{G9yDj1O6NLCU@QIM$%E^qaR7D{)f;BsWX!xP?m1jZWkL8E)8*9e!E27CHP90F8&J zF`d)rI?*;qV(&DdMM09m9ay`a!#v%|4!xj?Vv*v-^eubGEY&H%Hc&$9>8JjGiiBgj zW;OtSj{taUiT{k#!jp%Y<}7QU1V4j58cc54_~h+^j_Vob9+UVsr&xNZn^6gIt>@an_lsxl@ZGtv|6-U91l^; z9rx!AMZ+nR_;R*$*^HBI@`D>;%9S26VAG15z zX$Ia-0>AKKuw6N=mZCBiI1Qcz(Qu5*^_J#lN6QGc0i{^Ybxo&q1HhM`hB+$$%oQJ( zBc-M&g*5dW+!Ihuan1wmsT^i-Iv3H&X;48y0sPNj*HT}8|Ni~u<<^w-hiN&YJ*=Ld zB?3I|BbQtAR?ip|X9sRfoOPMqY;8M6XO*A{$@!dbZ%{iJyoKX8PjjJ`O`NWH`B%>v z0_7I5K`vRs^w7@zypLv1Is(ouE9UIhb>{V&IH0VBtPLClAQKsPDyaR$x+{eBrsYdL3 zr6$7Knb~sLty&w(o)YfF%1Uguc^as5K5QnHz|(0InmO{+k5<6hbw%#7n5n;TC+G^F zNr?&zyArOvy+={r&AA0jjB;DLY>X708CKHz2&j>OzX z%`7;SWA{7rSEl(5*M{G{dv}y2zNIV%gwCjs^~!A5kF>@7tVQ$RaaeJxokyANvSx(WZ^^8jLus z`L+wP(*{JzCaxTUEVhLzRa2eNNFNuMs^YPJ1+3-p5B=5?*o2@WKaog92^w*k64HYh z%{2zlJ*-H6R-qo9%b&n!e!P+AQQU0gflD;8x#+7E$XQ9JdS&d-bax&+8bZ|d5h?KbVs9`}dIZHjFpYm8wX@}l>EvZeFtMs_yg+zyzm^R`LPBK2(34y&a-^0r^~uqa zX-k6@tkwYT$(fbxb8=5Q?m3w#D=T}{{m9F=Pja>jlu~wCY+vP3_lnZsGj+kx_@KE0 z9g7!WPzlfJloOr}A($K|@pQ#q;Tj^hpnqkx>{BQBLPMEWGq_8_jgUIR9 
zbNEf})$xC#Xue0BH*AtN_Z5AQDN~`2TDu%$zhUY=i}ANrf@@Y#_pEyNj44lUK=CHn z_Em=iZcniTDT(`i1X-g6|rAC(Dv!Zj3c#8el39+_n3u@9tNkju4F`_Ne> zv`|3!Ko`*tsJ$nNajQ|ta;2QTmbIj-{2jJ zG6#I_;^cy`zJe(el=n~h47JpoH@j?fTO;u6sg~HPg)1sE4o?^5KS-w$6Jnsx92M7@ z{@N3&K>3q;7g0>;qd>YAbSt+^JIY_L1Z*q(a7mMi(zGc8zYTzhmAERXqw+G-H`OK_ z6vQ2&=Z2`Em6LI)!L=1P=r^w^!Bx;ZlKZ3uh8cA!-ZMx41*7kA<@#efx}v6j5>YN& zOm$J)ULvLpS3@Pn=xzpIrd2hfRO9c<2>zUi4@^Xna+C`LA$V7wdgV&Pj2BNX0f*cL zr+OXt1p1myycV~K!Cf7OR!wGlTND=S9H=*8YbPJc7@9ft3ac9hMWj&td&tG~@si*;^s7A3z8{C|7Rf|V6F>LbfT7U`# zCC1gFD605!=DWxZXas4vF+k{1UBO153SJ5FcA&E~62-2lhv7F(+&~L&>8&0p?hEGP z@SWU0SF<&{ZxTaI6ve^5_M+VWSu~U0Z+Q9)kDc4Ubf)V$wY?4q6q=(EA?wgYXy@1C z?hoUB_~#%E9!A3$hKWQ|!;2f2OmM^`4WhvAHg%6fvom^J(1eJ9c2$`BqjqBt_{t6{ z-SQL$eh~v7^(NV`@F*bN3mq$(gm!` zt2}aygwQtm*j6Lu#5)@#=Pb+f*#bK)7MMgs8DGju!O6+VmX|JFIumNAB9UR@+e>Cq zla~G4jz0l?iLi_C7f}zRrwL4@R%*R)f>OwgIz&j$a{A==e-$(gD2A4JB8pn{tEPOW z_MkDUGH`60jBbPO*~{;R?%p(=QCAnV~n`9?(vHWisaiicJ+2LS0cwnHW2lc>| z@ig{MP&t%SuhFe~*YGznnM}boYo2{+jq06<-99iqIAib|IHY}u^kmmi)H){wlK~h& znXiOFJ(-3awrY7WQyJj40{-oBY?VYstj8E`g1`yD*HnT0Ulae2EN>-uNeI#z-tE25BI*(;;TqKE zQ&;_sZ76OpNS*&nqdCun-pG;$GDc5h+kBWt>iQWR0j3g6?u&61+^{m^tMRS&LHcg1 zt-q6lny}BFNqt|a{p3Ee@UMN7H(riEru+biA!{;#dTKg_IWGXTH0paNCSGP9YKL8VubwjeO+AJ#04H z3zH%G#rm?ipWq5OZB3pg4G0mmfe2;9fH~WDeL%7wx0h9`qXgRMgfPUbFDx02wLD(T zjtIvbRbDw`)xwBd@!n|Z%%x0{>=NA6hHa6`8bOVGS5&#J`^%=`<41{~fA0*(Xy9dL z3>)kV3ymt3oU_Tl&klI+>GjEli+3zmpQr^u86CwpaH$=&Z*LFTpRhE$e@&nd;xG2~ zYo!iRqqt81He46okd^!W%5eVDf^l=y!~4&0Oc7pm<`Qg? 
zCB#dvM2$3UV|{j$u3^2|<0{V6szMz7OA#K=-ZkW!#%+gX{bsu2Cjwx%s?NM8PUlP% zpG1T5Y4LxRfD|O#WZ=|7w?_r6UbE&Tv&!GN?8}$#y35{I#NdWUonSSQ;WzR2J|7vc zTs5AY3e_0;?Y=!+BiB^n8UACWp>3*en}F|tNi)Tq+e72Pr5Sfg#v%EvhA+v}`m;}N zJ$w*Wfo(eGxb(~o+$pZVc-*!K{4z0gFzNhu_|${qxXJ1Ji}7-u2{?F3W68FJA;v|1 zq3e~$cvD12b|)jf36KH*lPEN3Xne`9jXpvo;!zVXw_<;ReJeL97RkoVh;i^P;vUHC z65{U5EO}M^j>eHbB0Fko2O{EkFYI#Rv~5CRIqoaT1SfA zMc~gCT>Nd(SKCkhS~KrcL6}#bE9B}|&0s7~*P@Qp4amHUAtgN>i8ZBHkuUz^=2OVz9tkbXSogyd-`1@TA*wUF+3ed?J>h}ps!mjm-lilU z%hce=KaBQTcKzHP4w&jL{v!r2&4HxX7uO zJA$`#Jvc0^k;aT}<)azxbIE_Kl65)5rqprULvI18y&dI|!ats#-pk?TAtGf>ZI+g$ zpj9ut&1><#AH926Zsn`=9BGG>Vt4z;-pSn}bH(l4!iC2%g2QcFTUzHK%ERNgFiOmpsdEr4@I{jfwHs>M9CUZ#pU+P=5FNi10m3rT=<6-F4*3 z?|JfhXR|j8EHpe5LiHf(BK1E4}`AG{qF15mo zNb1)mE$ndHUh_zCQ?Y=3+cjp~HOyMu62ynu!QB=S z*SHpxw(v#7Vv@O9J-1XreMtEM)4jGyI2m%I7p-))dw0gSipt(RL^4Lf*BH}O`t427 zqc7LvBgc)saD1@W-?Ai{_Hsp}R%x~j#B|(l1|L}Q_ZqG9gIpmzL`bDXY2Ld^5Pe)Y zOS~a?FK6yeCl%ZFRW7MFvj|somr}ssgP7kX;fJ{|6|M6x{Fi3-ZIn}AC>=OfX zmxCCpYOD~U#-O-9X2`mClLd`uSrFnZ?&M}JYE5+pOldj-Vk3S~O=nK;y#&Iue%{tG z_j|a<6AG`@`Q5uE1NQAW4ZYHMhuhI51XEzWPvq*C49D+p2HO@}zk2&ApuFj-M=z$r zG%36jepHMET_x)SP;t5G%Bjf$e$XbTpYDP6u@R7^ch?8GKbZnS)O)t_;;RDyuP)xo zh)a$k?3M6>gBc`A@VpZFzLw7R2*OG@d&G(D?XS{1NLsXV>&G3wr(8O<%gavTsFK*I z32s9_ZS*2bGnBUVHk*^}Z+}n$2!_)EpQ|d{VnzkY6tf901~$=>Hgpo(HqHi*+=_c5 z&b>?6aqh3xRZr2wPaA|7A~RI&TLh>w|J;Y__KxBzC7)`~OV zmIT3{4cr?)-mTrJaFpJCLD=|Tz>loHnk%H3aEl~+o|Y68f>bm7$5EWa7@mDHueSx4 zyZgO=GPMok;t99V;DR5DIE|NM9GZVR31oy6ja+;}{*kceUk1noxG4dpj1Md|HFnQ( zTeR}mV?d$HAGZXTFOk1EIW+2n&jJ#W(mIhn+t>Q{T4XB0X`nl8H}9NfTI0{!LUe$h z^XA+_9>kvlhjL9JPqUunF5Zyf7qfTOs{K>>ih(7PQEb|}&)xLzCS^DM8Y2#ew+{vv*f54e=F}Pb7wS=l~7Y)wZ(k?xC*Mtm=^kTI* zuzoff8e6b0pjwTOg@BR5iS{_sJZM?Kx>+$RHDSA2VY?8MS{0E2|5q{7JTw$X;DW0W z8DJh`UO63HQ|rB%AOG3o2yu3B;&n*OQtJNYF8ky6GQ6#CbhH3@>a{T=wH4~hBUoDksqVWB?fJLbbX)=2;KG>fhC zsWY5EpWI~l)4O8p1aF}k`XBD+2c%bDE_|U^@D0iL_T-q3Zkkan(8A+yB~&UN{TLw4 zaj@uQuGALFL~VO5ut+Q!g;af1~nu9hvTXDg!fZhifDPl9r>gnQ6 
z6a`NH9sL0}z{y^>np>(c0ONEX2~s&YIBd6DxD*l8U_{-fHQS(qS(A$!`=EwgC3y6J zR&dSmddssS&9+@9b>MdJe^W5lcTft^0FCr#LjNY`_+Lnxz>u*|_nmr(t1%%S zA?`S^PjR;3f^a-X*s^7}wc@Xor~Q5g<>g6k`cZq0AjL-VM1s*h-^^#HD!k_eq)o7n z7x(Tvi#tJee1e!b(25{!rmdpF? z9TzLfsN#g#7Y^IOU*eHXOguPQ-zkT1)D4_1uoOcvkuNa4WH}t z@Z&{glUDhhRhAQu!+|F_=m|S+RNa&CR+wY#Wysi8%hV6wz)6DH5$D)uH)n_-0SA@~ z*M~RZI>Atb8-Aalru6U9qcJcF+!9PCTnY4}uMnOz3uY7bQChr4>Kx_ooZmrR5w50$ zUay2rgs4nvFRo4kEOy zQdXK_ubz0AHSB&Qjr!?Z>XhiY51y)MnZaB4Iw#vK4oa5LGlSGs09g&*Evz4qQ^)f)hp(1`#qV z7SD>V2*0Gs0~_j`iF0DB>i$5S=&{A-8Vt|M18i4mcyN!kU7z95M$hAQ;--#jgafOJ2$ z4aIpHw*z_Zf}4+-e{yExXM%fR=OEllWQ8|BdW^HQwEX(v!-wbBek_J+OiOe%rvJu{ zudahSeN9<72Zxa%t2+SRe(UUOjBI6>lU{R$kiBKrS-q#cc~(-6IV0Ei?RA*c*g2QG zJI9qen^JWg8s)y#67d~a_4{2@5poEk_2fRpcp$kV3Ckt3?dB+fCY>TZ;Pqzt`KuNz zY+0+Z4fuPKehEjuwfO0-_|*6LfyfyFch>b@rsbV2*HYER{#mzXRZ2H$VYY?-P<;x= zTRB!~hj((O#JdAYqjl`YD^iT>GI;TH(e95!ZzJnGJ4Yfl#q#}MHw#C(Llm5^16$<1 zUu5x|(X9h;_=fuBOk{ewPMe%Uy1EiIy{N8jYHBvOa6LC2dqNx&uY58NbH7K4){c61 zXj`#QR*h5p(hbkn7zz@Gox6*VzFU}F6Zf*ycC5&+kbirxwOS~7H17x|T>@jwldT6@ z<&58UHuaK8LhuUn>P(dTd~#SH^1DwDO5iUpf3uOafD6uk@G7IzJyqhn`h#y}4{ZyI zO!MCR=IruxvrI?2-CACwW8cc{YxvmvJKtN~k+yjtn|Dt)v)%ES z{!o|Rr9oBa?~s}Mgp2QHVJ%a)tm|M=AIQA-ezfgQ{IWwC55Kd<_CyKubeG|&)ZMRE z#O_`7`{-fXn+?!OgZIN-Fx(WDRY}g}v69LF`x48l%^J8s$p^^Unmkn>+2ZB>diX1b zz^4TLDR2swwCVr`RL;MW_Ezt3b~YqCoe5;EzRyK5ErC8PymP( zChnKzNf2_XbEH<^$D}cP-@3{NByK&;9k$N->eC)}a-RS3S7wWsEL-ZnIg=F?$6fw{XrehB zWmzcsS1l5P$1ym!o_@Bc%|D+D08W29|6`FewI{Wqd7LpgSBl-unu4<}4Pl_-SlGh# zPBwap9byYu;wcRBuI}u`rgC|KQeT}D7`aYT6~bcaJtY#o7=7Z7Ay#wPyK_Yc zR}wX}+?|+iOIgw*GG7-p>Ny%#?pD__4!v5MC%4M3@`4AV*Y@r6n{b9n3Y^!8KfJ!j z$6wA7swY!jqRcA9d791b`@Rbtb#>}txgYJV*!)w2u%G>W?N(7rWDXDFXFWQVUrt&KsTqr`0Ozt};9*#*AUFET7o5TTA$QdVb z4Jq!eZnW&35|!zIg*yVPU-b9dq^(F1VL;lBlM}T7mcWWX4mdx z*lGoQja(8l>3qN8lap@D(1#Cs@efxXSp29L!3aQ}y}uMUf{Tl=+;5LHlK?#xWW`F_dfdK~? 
zayZZUzI&fgPS>L54lZLq$q=CmD)6&ZuG@EzkO_wg=yRlVm>e8ZrdvtW9lBu=_XdHpV06Q>0{AvF` zg8%UcC0fQjHHwfw`KlOCn=w1`#)QK%K)MTGVpbiys^`v@1dJJ^|MbGZEzAV$ zJz1Uv?0w8VzOhx{&K@&qU?ox{CrRgYF}TRxM0*;CE++3Kdf)6I}#Wd>qer zhEo;6OTq+$##45U|wy)gbd(1SIDGo6^_=jleB?e)IDQYB@wpr zeV-l(QTmz0t?SFrrvM%wely&DH(|(mYpVMv;^AIUVLla)wKc~j6ZOx>tF2CRnfm;4;rS?m#y#Xf1Y@W``&q*Ve*koVvBv_PlF+g^ z(jK+Ot2bNsLp{2Hrx%3!1j-xJI5@4e-LLPK`Zc!gme)zSQ5|?zw(b@l?W!p;iMR^{ zY&nEA3H@R7UiVaGJ8C%UBY8@53@M)5155I#7InAoKmLmU4Xs}T1d8LXy7cTbRB9_x zSY`$|jrQ%bN5FBE6ZB$x&WyO|_$^E2Q}K=^2jej$vKnZ}{kw4y`~kq>tVcbo*9`ie zGaAOocxC?fZ5XAi!o2s*=|0u=!gdyRI+dG>cC{0v*;Hm`&ENU=fP3ufE@js>7#8Xp za5+`zXu8f^7K|Fcc1;5ef6VaPrs;Qz**zcrE;!JGJdQxm)!eYquBS z`r-Q`hZt8Go+b+=5-mXK4w;vVG@5p7{iSf3-fMVximaMnLG6Q={EYg{)`5lE=Q{bn zwuAihS*pYX3sq5{Oh6}R@`gOUlv8m2m(G;HGEUc(!TB-(2N|>dg}?M2bPcf^!zeu- zm38bYMtsUXdwGipA?bGaSf%v$y*Y&pk~`Y%xpaB$5$IIf zi1W;qV<<}{hS9|e0w4TJos_Fol2n6Z*`?#O+eMxubI14L-%ewz@>&yOnLynC`dC8% z1SYi%#$>6|Z!jRd_PLra(OBYJq``W8*uG!^xq*2pqlTQlTeAY9igMMYh@EYZy9?ZQ|$Q?fzQ{Hr|4g-($JLKJW7 z-FiLr4aL~>0&cI4rpO(h;_M`rY0vF=E8i?{cZW5uv{o9wEqLd7`*%NFK&rmxz}h!I zC2ifWJSrhjvf4fKShTCF?kP+U?Zn|%iLNd9&G1)}Ba;PBgC(joNy9rYyC)Xa%+i-n zck+9{xZ+RS{PJ<~BYUWqZOV?2(E^Wc5oe)UR8@AfCb~czE`%-sH2J3C(L#MAY9tv0 z-Hb_OCDgW(l-==oY*}$BBM>zObN0~~rjOc;f&JWFkNIDJkc>z3-LvH`hWzV@{Hrni z^}xOWtJG&)>7ORW=IrN!oRyUC@c0_9RSA6Im^*U6Tpeg zo_T|eYL}PYv-?%lNxCirAlC1HhwFjObcX;k5TWogo6%SS9kxIet9%S*K&Chjov=<$ z_g4!r-Uu-zF&sEc6-A*#o#MRx5L?gE%Jr6a=uiD@;B1maO?|Fq=M^t|=Tt*yuf__#~n}*WhzpklfwSv`W=^1J7n8ez$mT0K@9{{T)wRnlnD2Y<4=N z=Rn%fx{16>?@ZiGHUKS0`d-1>UC-AIeP$>@zDQs>hF%@&)hw>UI9gwzK$+sI8Y9m6 zIPi20V#I}OO2eI5^F;78_^*#<^}-sweKy=C{@xG$GyL24I*Sk%$M%|a2gIWPmD9EV z)eI~5#z*W|0YB`wc>tH==!<w54jxLfVtOKIEiXVFIC2#|I6z77ufI&IK(>nPARSFpHv8*)B^sK z3GCct92rGXc(yOMzTbWYq<_wc`p|bRurLT(53~U|^_BLH-;iFKI4JJEc*pyf&Co^L z9vv94G=H z=xAfVlH4Q1D$6Lt*QLkRt>=!i*p=P+VjoFZR8>pi!IBO@L7aZcNswLg&*vQ2*4+7v z`Q&umn;^lm>)fd0rKhJOh57o0YPS?f%VVgMk902R@ zGq6>?gEhOe(vC! 
z0QS6Z2v1XQyETP$J3+<*s7Bkbe;1O=a8OZK?^i-_%nij2&#&Q%6;4gJz|Mn`UHEgMFCiL;T(XZwE9;5-_g|H|%>ogwV3y+s;_DGDf>UB5!_ zSPv!dCK(lPE4kaT(>JXQlo5nr$$oBfkA}YBo%Xg`Hgj8YU92Eo!}j=&(CpG}+L8nI zRhHX3=;1}8)?*aDRf7$!$HQ9th#}#z{=5p>UYqm#f7S)?XYcq1q9d=jkB~i&%UZu= z14DlP{pn0uW7+KcF!~q_1PRtm+Rc^a68Atc5L> zftnH-m5?Z}@5W+Vosy@gH(t;3(adN9ruuL9)IW#-qn9WgxV9*AglfVPzK*@caP$W6laUpXW@hVD(=5B(-ave`j3`47+v`ZBV`8|R6E(VD zwjB`X<;>K|bAaoeHy)p$rUF7SU9<6`VmpIi<@$22pOT&6AkQWaJDN!7%T=-Q5&S;9b7_|0cs{!=a`NDr9h(St1 z&#$iw)0r#%%T8ypNhp2n)IjqAKWQP53wT)!k%k#E7T1^NM*;w$@HQRQV#86$R-0!B5Ipe_Vvk>GOk4HVXLzUOwh|kaQ~=B z?iT7eCQj}TG&e{FOJ(?}6wLGftHiC&_EmplL`74jV-3#9Wq)^w@mYotM5}3qEHBwG z{lc#8_=j;e0ORz^+YWcIhY!4!00_`CkZdS?c%mBaG%C=wP+3p_TFGAVG(Crw>uAh* zKqlnEP$bN*txNpItr>aP)-&eVDI`P0oPG*zn^J=8DErc}HvEbkA)6HZt9F2sY7PM? zSUp72sp4*)Jeu-^U(YhL^kAKLey=VY2T+Qu=G`(Q2znBYZfq8^osE2V3y#{tsIcZ% zoaIm)P|v+xa6fXfyms;}ZRx`fe*$LoADKM%2kQ?`*(D&s)sQsXM)O_rgC?UstSE$$ zo2qstj*;Oc@uT=ZCS5<~-_|~*2bg)UFL5#kpsd*Su&x-hEUN#BSpKEd$47w44EAl^ z1?v^~kW_eIPL~)jCNS-SCCRlwCkvK;h5!_cs`-2!r9<6ZjpZ|wsh)m<2b|TlmQ4Wr z-l~8OK2$m?%Yj4fH!7r9arPwSvBWVF5fM?QqcI(}Y%lbSxA4?kGYpAZog>72Lp*NDP>b3&wtXJ z7j^D`#O5|4s-fIAQ9?D!~+ zfzA=TQl6q(37oOy)m>%e@1V?ZrdRlXw{|+FXVLcH%j>E7H7=T1__b) zLa-qyCZ(H{p631!c`R~Y=-{(-w}YO&#$1|_=W(CY5-fn)-xUGQCt{+`lIW^**3@pYHjKb`yA|9M^Y?GGoKPwrHA59|`UIVnU!^(*LfPh1^OblwjHowx(npUncg1Fn6CyWj;+_634e3~>#ID94KNU-E2A$BGM+ zeZ7JcCKvk>IsomQqZ#7)I3qQgfFxv zUt&AS$3X(te)NM#Yk09{;ZJ1O?en+b6r;VQzk@TvR<=pN9>Bmx01-4quk`*)$^VlE z0wSRRDwqi0E%a~%Aj98bhysuM_WBC}2L4`xyH~3mZ6BjzMcD0X4qJ(|o}!)Qaphr| zG>EE>AEbv-;5ULjH*{P39IjAC%>2o>`=(7*(@y;Z@jcT3<37FIpS!7bQ6qk5@!3yV zm`UGt$=)*ZVqTz*q-jSbarysc9sEn{!AEViyEF4y1C2o;g|i6Qw4G?H{geV2S(@Y-z@@*C zq16keg@d_wimau@N1HIes=^~x-s1rO-|>)d1|Ig8PnIP_DL{h1f425Vmxe<-He!sM z*qRhhWu^4gMxQV`p!K#kR8`i zOl)HIPkjA~pHz+c-sgs3zo|F6xuR>^=W(e`UZ{-0&J`0piZ{=m1u5iF{} zISBzp)=Rr?u-*&bPIU(l)#n|>d0Cdefw1d!5=b+AjAO|4vfkQwj;%3#gd`+3Z_r!y zd}p92I|dAMrA9gluG3RTfW?)1S`aJb;@qwl8HH6=btA9HL;O4znd~|bTWROwkpbqJ 
z$R6s@c#e*GsCb=r;IqSAnD@y93T?11iMC!%NZfxTj#3yqLC1ZNyt1koK1DZ)F_FDu z?L;LLoZ`P9={Rv<$e09mqu}{ksRT~|(v%t-Jzz&#(O_fTz#~LC(@Cn2t16X`r5r#x zikTkA-vO+|BV&}oc{{*pIsy;ch{c6@&9#{{90b3buq?97@?pQ;gazO#i=IyJ+{)1Y z*r}4}L|&G9&_{^*-_D!%B<)vdx$n@AiSi61PpgI++u$FuyU7i&7E3p*{%Qk@&;LQ# zW_E!);{Mt=jb!_byiHrL{`ah(5``NB!)DRsD+#Cnm{k1cZ%LrssciaAGaC6btb=0z z?8GDSD26d{G#{d6K-2~kr#_bybbxiF6USVkNXdO=aiOzo)gB&P2~ppiAZFmooJo1( z`1`o!=lHN?p0P)$mdqz)mV5lVgm4A$!}=<++{Rx*=wUua@`8UrUtLiaSy`)L5qK); zPFgUK04DLYHZ$WL{v#zwjAT<&`TqKH^o>3?c-zR6w%xjqsAsS$z;DOEJaHmhZ>4jP zx#aj9Wv}%yW<51P0L__<%J*dAXdF-?3X-(x+IQWnu*=Hp-+68?tq?b^x=mHD{~ju` z029Zxf76k;cq4e67safUAOl?H*$uC3=cX{qEj>etj}8)u zkDtSW1_Tc{WHe4o#ejYK!Pls>I}K4eIr4x1F<}C!j^`h2en#G#In2m0+{AV}Owv=xx(57LpWg9e>?PbwiF2z?5Zdwr@zFgk6p=``OTWyBcIxXV*tGIvqzy7kd z9iGOx2!KpVwQ<@q_YG~kgO6@DVz5~%s-A@41Trun#@!a|!+s9Bhi>bs$=U?!Aq1wZah|m4wkHjE>WGu zcaO!0NyLMNWq`Fc{0cjyy61_UI-k(?Px>Cf$mDitLur^3AAT--!KF^dR#CPe#mRTv z(9L9>!o6lPx1?`J0knzX5#+P1?DLchVrY=ZQa zMEr`&*#d-bh<)jV9{jh|sSR4L^1GZe9zHFa)cs(r8sd}1yQ(U?ND0Ld-%!vV1 z8eE@BNed4Ks@h%V55-`l6*9Mrn@Ld(?WjFnW%rVN4lmA{G%%x|m;7nX^=dSw#U#M3 z?fhumczMY!P5K_-quJn(&rkym2?%|j*Ny?$%GWor zeB+)SDmjiVxc~#Y9b#%IcPmyb;T9GwDMf7 zza3lgl9Y3HTe_sld&7`w_mNX5#g6-4JWmRR457}wz~G-WGWaxcc=&X0%z zj(-qhf%=N(&7Iph={G7?4^i*@=a)BzWN}d#@gmtvN%`V_C2wePmg@f+PMrN9BI$3K_VL%OxgE^5g|FFVjg!0Rz#XL@@DqtsyBufmr8=41s6LmwBBBx zb+K09I~8>;`%Fh0^AJbZ`A2@`RFm0u0%^u7Ay=!Z??I(%I6DjR%jCS(%vgp!MpcH- zL5L-%wGVU|U!n&(eAJ3s-6gD$a~u8Ym0BF}O!JA`y?L?;%(7TBwDV|@A#Hh81N{tZ zJu9qqPAqD#8trC4tUs2Vp-Rjp=QT|FO&Y)j))7y0zgs6YHTZ2jVNwfvYn~f#v(0^>4*Qvu;yuE?4|Bvv^p) z=Pl{!D>_NwK11JmEUm7-(5F#ng^xw-U-8jNsWnr#n4%vD`)Bxzy1v>Sz0y79==!QT zQ>M0w|D3f`PT~1lM_?aJ00z$QAIoB*=6{jFh-*HSqG~+~(_(#Ct-;AiyrlI)H(`*N zS?2lgZGq`bF1sm)v|5vpJvYzj$)Y!=WxI>M83K;0%=gqlGzvQi)pTX5`ycOMeE2}{ zwf9n^-{Ru^6PYQ@zWiFZPG;Vaq0b9{5d2lnnkOyYA8{SVy2v^ZyBtC-~(-# zXjPtn@9ti6>?%klQ!3RxC4+zluT#LsQ6#&gMgdF+8Q(y1Z06q4&zE>C9x=IM!+I{=1bL(PX@xa=7VY%GwD%4w-bupP4X5N;L~|50=zAp$qOj zlY35x3HskgtqF~k;&xUm%>$(gFJ6{C&-68qTEl((2pflQsY1(wBKtArS3QUxQ| 
z#d)~cZt+#dtng7?S5Q}%Q_16avmcmTsHANJK&}7Cm!&UXu+cTqp zi3%UY&SZt2+up&v#CP1sJON8cPx)9?c6s-1UQ?Q3F%e^O^l z@5^E}aSdIw1q~@nOq@#GsxMnzm~uJ?3~}Fz>eMlhjtSM7sRrj>i+q$|hgd0%`-;Kq zVlo>Xk9?^`RrD)-cpb{ylW8UIJdpghfKiCPL$-4xMQFM z>HUW)|k!QL6dsQ=bkO6GhW_FV7 zA8-_L7|ntO9MHu@hd>hOIteT%C8xyXkUWJ{#SF$Xo{&C@8VY zUTaSxtNsb2;5s(CQn#g3dFPj@QW?tP$T%!)8waJ1oAdYL)LZptRH6*niph_t9W?rdiMvA`7o-ANwDvN$Wrd@iMNew zUKpVh(9svb=v~n%XKV85Y6a-dk*P#Pve-=yd~}%>F^>1|5jux8^sK<%$SrTn*q*`K zJf=P8IdjM3WhMGaSZR=RWL%~rWU}=Y#L#P@fgb6(@peN55}7{V0>8`@8SJ{gd)FVG zTkErB=^g(qct7D}X|>nM36lmTr=)8D#B!mt#npX8xy%{1I~77LvWEM-I=r*u`?|4z zQt)HayyhGR`LyY3p@C=XH%@t>Dli3^;E@Nr+^2i?Jup93o6YT;g)*&Ekk%snvRW`( zHUX+)J9-tbb3c6|VwF_$a8Wj}Of$7Tz*8cUOv3O<5Z#};OvMNn|JdgmqB54MN#TlW zp!`6cl`B5I`0o|@995B zdH-Fn&U3xI`CuK}etx1fx2me|o+dH8wpdF!TwnX-*LoxLdSIQ{Hi8U9CGu(ozJ-Zk zX=uG5Gl6)vEjBQj|NiA&WLfKHh(jw3lOI|S=f5~|ANwY`81VaRpJ}#Uqk;yFF|&}7 zr71BRe&o{=V$C97uRZOen00-tvd=`R`z4lsl6N$!KfZ{f4DeRSWF~yDTjHKnR75Ap zDX`M+Cx72!ByVS~MW7NA`>sk9?*8`_y4pfxqGqWWcCQERK6l^)_3$fi7}1g*|iKiVPtZ4)!!^ z_m=Hi?Nq#e`AA+C!5zPrmKXH#MN=r}?KaTs$s;`0$oFRl9x>U`i%`vVy6>r%@>zZ6 zEXsOl^SIU9C1nZpeQPALu3R@myM4ZNw0mooZLdF1ty4>mP1o$qC%c4e-aHn(&?aDK zRb$#-avdPy^GYO!s-S-?JKlCQL;aaDRUbr|p;@`=n`GbaI~KFUm#N3&4^f>B8gVDs z`>dX<7!7EKqyQA$epD7N^Z^*(|M4|X3Dg+W;Y~(oPP-TU(B~F;(?9#}mVS9HXf!dB z#KZexl7fnxipOROmwJUdlYHnnl!YZl_t$oj;0w|xtO^u$o;7{zU*~myn+?6{NaI!Q z#om2a#GfZ)9MfYt_Rc3DUjxFgkIc8z zs?c#YGv_{o4y;u6X9rgoC?gFFhTSnDL@-2*DwEv8jlWn+ntd%K|Js7BRwado1wZCe z7rZ*l4zSuAOziEJqW_5I!I?DH#>22lZz!ucoy6B1tuG zL&yb+)g(U{kF_+1*ry1pIy+2~&*Jaa90i8Rl)NkaT3_-`b&!zhg+?S{u(A~IVeBGvkEBYr(-R+T&ctE%e;VaBj?cyIF`BLZkTp4cM&RC=MYGn#r)XDv; zHV|#W;nj-iR)dj5rP6SByNa7(U3iBVjOtiq3b#l}#};EVAHRJL-cKIa+WeIJaockQ z4%X$1Ibe+cgr__5t_tb2gRQ$CZ1&J|f2ZDA^O;Ed=#nb@S3!MxmUHSH{_IU0w99$p z6&roOG&>6JIavVsPYAwe;_#o@2=FfEt2MKs|<7xJOF zO|5zp9_Ww9*Vf)E4*TXD_j|z~xte?4oUv6bHN|e(&(nxQz#A}?n&*w+GeMphWy@Y9@>Tbo(!~dmgB`w zPA!(s!+y5|W|{AFA++!<>sC*!cnq?0(2nGNy@p!e=*jH9Pplx5mo*a;%`WC?vsKyb zmaXS}>ePZN8lr@Ao1DIIi?pv0&wNVu)|LW*}~*SExc; 
zs!{l9J!|0cOBB*s?qnrukHj)bekY&NJ%3T-a=fm-10N$I;RKmA6yA}y7mx8)UWAJ? zwu07g4QAgYWfxuRH)w{h5S8;)KKEL%c6oJ+OpuU!Y)qDFX}G#IVlA^h+O5xxLw0`t zS|pk3F^FK3q5jt~dlLV^r#nl$fQ?o8kLFv!Vi; zl_hT32eSW+&wzI`Vt^i})btf3L5p$9sz*0**R3MRFDd%k7-pQCOIScFx-kpY78shE z`e{t`ee#T6&I;+V`i8nTk?C89zaQCjA&&XQiCXGb>teNMv34?hsb89UNnUd$?KGwp zrA32)gKxOU?6*L58ZCBXYuqf++*&c8(#HLj;wRCtmCU4rlnGm}cKPdKwbvomwqxFLubEa6Men@=p<&!y|#B$5N{;7C@NCnrsY+L&eO!*~HTCJFoW7V2s(H z@n-7LrKJds8?xq|*6w)O`hM#xl17)M*^29J-spn99-)q_mPx3vj5%Hc*^htFKDRi8 zf21_hnJy>Rt&EbEma@7*-dH3Cyk8L=OlM2i@teT>GXVm|yhw_I_$= zN>jx}&A@q%JYQLPP zWB-;#81tBB1)(tpw%n+GiogIfu9~bPW!T!9Aq_Za$$NF>A-Gj7J`7_8;@|tPU?vyAOShj&ICI z-V5yRmTBp^Vnn&Zc_K0`y8wEly;jIrUr9PjAnIBDJH}k*f#bX+&odpHh=O_>%%%D_ zYS-dRQ}Md9#m2pEZrk&qbI#LjA+-i0Hx)dh(Rev#j;{UaN4UwCl2RR5MB!)~y0B;5 zL7hIgS@T3r%W==SM=BOW4NQ}xDXf#|FR24=3NQj^KCZ&frLm7qz1(=C4V%ZIUXTq= zi+pK0V2+AB1GEuJcC5GefRU<=8mZ99T&Mr6rvUHzFaW#&k}+8FX~PrW2umPhydLE+g!SvdStcw!rfDXhvoPwC?K z;7uHx>8tpW1@N%Xil`#j4}ZxSx2EzsTmyX4j6tlqyO10lqBs8mb}`r& zrB-qeEy07vpZbzlk*uz+wue##T++8&AkQSdSZ}c%dvN~y)RuW0Li;>sONa{NG5?VD z)e^P8%JRx|wjlD11lr^8Hz>JFo45ZakZTzP^n?mp@qaN;RTp z;sL6QW<&#ha z-}K2Rx`JY3KEh(4(VnoyG%SX+Sm4xZGWc59z4o;XBYvv8<8=5B517O2j?pwTc&_v> zA)m&PqJH!v`1a427j>3JzCtc4hDZS)5Oa!1Q*KOD^v(4Sbf(N=%+apNVLD%xzAohB zF?y#PoMDENTc(tXs?BoaM|UHgYL#JA)%pF$*f<5n9X=uMz5Hv^iwoRlImwPIDQv78 zJqyr#L+k7dDYYhOHB&Jy-=)}yW%o#@rnU9_FC!j~rkOjCMyh4|++3>Hxmmxx8epXU z<#qH6^TzX%SC(4BA*tHT`xZIfsL2Lyh^VYN$JA-&&oP$=L>R2&Q#=8~4lilB(d;6w zC}N!vKFzI#0lpf%xV{f9;#Uekd;KP#T|%arWmTa?@?^)8GSEdm3*<6K|Jp0CFBkiRcfu z;dR{}bzaCGE+_D~TJ(hap8bAKw43~t;K6^FwUL27=&C400%l&|0nOe5;{>u+LWo4!&%27W+ zHI7`Y3!?);$2VJf8kDK`RElEUynJzZk4_V*f^?K%b1?cq!|S`|GG%4`(fQ2J-@cw3 z#EH$4P`2j_#;+MTUm)c&c~Bsqsmpx(gF0Z^%}kWjk!o`!Ns)ODK(IuG-XCS#u4_J= zcgu+hvD&xU^*<&9q=8CaR)723)47D{K3OdZdh);fX3Kuhm8NWjB~}+>^@eWbuYnQD zx24&ye=;P>6g*iiW5rd)e(3^Q;wJG78qBmV#0SUy56|;Ce3Ucr8e{FhAtp+l 
zH(ta)uAvekuPvTZ?{)uvv%MNn(m!32Cgr;;6;W)ODa>1-Of6O=NgrVlUG)J~HF)pf0GT`qulo_NKp6**N&9`WFQHc8_X&$U7>IGa5K+U#V(KtnQ1dQ}|BZGc?fp;1{npptvvlsLM3#T=JDlCjT!vqvEUd#Ue^(5wd9!K^C zyZ~3G?q`u2xV5`|`J2ASYhoNboJYJ0cb>ZdCoGa0I*QLzuD(>ZScj_g$Ake=X~|*O zG2AYVJLGM^afpKVBo8&{&B0uI%Sk7Ti|sW(%?E82MsnY^&cImQ8~^D;OJDwn$Tjf!>p$eN5NG z`<*M>K0fh!sodV7msM=ovzCl`8Z<7hSO-Jtx z8uK3(?3PLFe0t%`JP#L9!}kLlib`V-J_TqsO${(`Icmyzw=n;UZ%p%E4Q1&O3OuBz zTzeNm%;&1zZg>CZQGVS)x{`-mB>QH1oDl z?yCvQd)L{jqEuDY)=*Q(?Ah)Vj{A+NF;knVxaYNYUDEE&mD-YS!`FRt$K5l zP`9}XW9i)tXY$$OGvdfBI=Pr>ca?NX4U|^Y!LC=RRPYWB3uE6B;smIU#%<>~ps_Sr zK(mH&-Q&EFy=Odfc;1_5 z_N8K;)dSuOz@HS1*-f?qN^)E;q$I%nA^9V+ru9_?pOjmG{xiW z#Sg5$+Y zj?qo+-ZTQXI+Y?DWz|ATov2a-{p?HUC1mH$j-O7uNF66d#-fCPbt z^Hj1$;z~i7I_iCyk2eU--s^`+$ zFEMvp>;1^arN(a~N~NFO5WZ@%dg+%YQ1r-pYVoU`Wh>)>dG=HruxIe++%Ssyx3N?3 zf7;V1rm?e>3Hp}EVvYO}omPT!nW4o7O;TuYxn}68g6pJ{*1i)PD{{3Rk+Oz@=D}33&N+{Zl9jJiBfeK zPM}Pga3FW0-VhK_at76I1{_rb6Ax7rq}Zf$_k&#R*vzBHVWIB^e+|eoy;0Qfro3ok zh*H1s=u#*@MHE#<5~-yfN_fR-Usn+Q?N6J%%g6bn_pse%lM^E$N!FdKGFo+P>dWUu zvFmCcRIgk3m_zYjx%M^&qU8R@S!L{lIeR`TDZqQop&!#&LaJjEbL%hD%*FuSVvx@We{(X_t`Oc<& z&!fOt4eN`UBD}ue@5w$bBl4;>&%giuduv9g34B1XJcZ@nFD|p?JEpc5&MTtpEmmmQ z4fC|`_=lbC;sB`8-Oio{DYstv$=-7WH(R{G@l`bbEa%u#dYWcQo)8kQRrN<3 zwx4(ZvMOZOy9K>7p!C^U`sB%d;t6;^HXe!?ul)p4TCTrlqz{0v&H$rB%t!CZG!QuW z&nc}nLe#HCn!6Rh0Vef=O9Q!t;S1kXAtxF?wYoy{`@2v!m)_T}5f~LE)soh|AJOU$ zj*NFnV|SKQFZmrS7YyAb-~C{E+e*Xe(7dsux|a_T?v{!)qNf{n8K@<(>$*N8Zy*!X zEv3kU9iw4~-|}()t@gPWKxe*UYOv|Na~=e2O}>`vY_$cLfaBzBPnb0jV0(AL>TIXi z;M3%CTZ$IdrSrhy%~9?dgTH@w&_dR#ueA8VlI`QZ?yh?u`JD;0dxXBgvs-;rQS!46 z3-#qz&+6=>zawF3(ka3psFsWYx8@f)88fT%&~!;;GcWDN=Kh@eXpi(xAxYCKVw=Va zji|RRt&^6dJj`@2V@RqDnPP-kKnHnsT2)v@efK8)T(?vJp)th1fs8^Z!0<`u?vtke zvHr)&H+qfFfFK)Nsca)p+t->P2VXgv1>|xJVo7`??TLt_&XDn#&f%Lab3mZU;zM37 z@RUw=bxq&2crOc@po{2*yv`sTuVZWxuzFwS@`zgQnas5J!|=$|R+r}?Sj!k2TY^vV zrLrwYnBTHoUm?|sipzfV3*x^kW^9$%#wC@4ejB7^VUI<7*ZKO1OO;W!b~wj|0;vZp ztvcPFxH;B=n6p&fZkqE(aV-&SEQ*UO@?4F@Ghoe6ip62Is|qTV6uNjrdx?QjHPb{^ 
z8Jjd4f<)EK`#-U&kK8|i9`%|PC=z*AmS`_j`szQv<`aN<%HJ3LJsbmwSg{AGVv|fp z?pTdHYec>kr@kx)+~+N=+XL`RrDx|6*!s_e%^0}^fMvjUX+A+W_32$jzF7d zb$XcY{H1QwLzmS3=b+^YgSI%ic!toV?+C{}aX&(mXL{(X@_P60j}YTUoJp@frUV|I zCb6>ov!|PPxdSelGVug^okc*?@x}&ib~?JucvGn&HGez6BZtP)Vefz$I!q~5x;o;6 zC|>+fLu7o#M({0lL1ilXLHw;X+vtn*{Yny%J4W(ga(?wg5BeUR!vSrEv+A*3oor|M zq}K*hji4}#IfXgU0dG+^`&eUZ>RMA3+KrWqQI)ABjzxdU{^UUlT?(ZgB$KWUX%x)rnv?Sz@f*U1zZ30kKlHIlbg##HK-vrkc7p$%CRS?lx=c>(sr1gOg;>0A0%L zsI*as8c&ial921dgb z@d5E8w3>_XzZAv?$#vR0xrXaHwmzvPX)0rF_IhOH&Q9Zod&)b)Np`DBNcm)}q2_jP zzURtG3iG4k59tKNsDi+x^FrUVf^cWgQiL@IRX4-d-VY{=wGHeJ>vUyVypkeZ*(E8Qqjl%LUv)le%;CH z#pZCFtaecWjxhRz@mR~z;zDb~3bGMO-k-l#Wld=EGk*Oa>fSo6%B71NmQq4WBqSsh zkd_h!NkNqE4gqPUyFo%i5D*Zgk(Tb35|EZnmw?jU%{MpTc~0PSp7*cseXr}Bb8+^b zecv-{)~x)kHS{sJGd2T*QVLS%kU5?A^5+RS z`m$IZovcpdV2)(*lrYhd+bzdL944@QA5Limh(+$h!!;cV%^`!L%YWsVL`uLhiR$}^ zAMSfPgeQ!Cq|`x%J#4LE=Evr7Y`!k{ ztwmTRYe&iq``aQFEWHMuX0f33tMaR6^e-dU{^O=VXEp?;7)b-G|?h_w3`n-tllWup^fahWUA8d-ZgvM`D{4$M5qFM)!0J=y@+WT;Z(l5dD^*pz63 zN1cv@7Sq9`s@JT95+-=Nlw)QmV9_*m;6c#BSd7DACs(*MxeCY4rfb2l4<)9!*po{& z$1)AibkT6XZB>IUB->Vp+VZ4=2%RHhZsNzz&@o17BB? 
zs&ehxD?(vREy0zz4;VLZ1O3}ZE5Ch-f#fu7~xXP!e#U?SKCi z7%Hr_2}Gy=1MJ(IjA;LA2V>>JLx{CYxHRm*CDR{7vRF8F&KRnxxr;)2Gp?&3H8+M5 z9nWM?0EZG2nQ46^2>`ZLS)$wbzuph9PLuUXi86b+yEQLwq%9zKtz;K6pw+jqwBac@ zUY~jSfhP%8!K?(KA=vJ|zAghE>iTNKMLomqliT5bo270QK{urv$Ew(kZ+y!pK{py5 znsj(j3#`<%PtPP{KsGk7B0$0J%9S2(gez>rS0R{wk%88e(UcIurBTs(fk*Ni!!n1` zt*xRxhNlSI)}DC2l?lg^GzN4<6yxi0z*)pj*%K|S{S_jeJ<)d9SvrA8Pt{Q=R|SQF#iVt6)gtIhI9 zkS#DQjvT4ru+C6b%-GgmUXB7W^0JSYL5-KHYnPx5ofA5CYyIW$Vfh8p= zQl2YbQxg+ZE99cnw&x)lBfsJzEk6W@u~qgalxFgHN3%uO$8h4294GF5QjcS*>syBvuP1xBEn?xwfPq0550o^5aLb~ zm6+*I4@KHU-td8)TZZ~Gej6`f=()jq3%ho76R4g|34Uz0gI0KD;#}2-J zDfQHPo{zIcp$7fktc9g4n(Kk>6TRi^=;D{856X`~8tJy6HEjnOjOZLnY#m5HvK4Md zGL5)%lRNQpb+v%vV(F_JYZGp}jl3?$(IVChDfTG&I263bTZpcA-PC`^);R4+a}3&& z9HT1^2*7cj98Zk%Cw`yd7ssgyo7`SdO2rUoUYo#?Eh~z?R92Pc>A;PEV7SxCYb!uL@*HI9q~xNQh@Qkgy9^Ub(j z+FxtCp;bSwko{4PMh1Df<(}O^CvTz4iAz+q-ka=4xwWsZ$QESC^Ab)U9qjzT$9ueZ z8;5sym6oe7a-Lc>r)){-p;EFYZ~4Y|cOyRtYEhcABJ4VY(Cd_|LI>&*v{={N%dRda zfI2&-uIeg1nx2ELTX>Wc!nnUe(7BZjRxa z16js$qltsa#z$2j?yLq+Z2@4%$>h(PJPFdiukXM_A2i<%rAD*}n zbkb4h_rAhb{N5W^)3^#7aqOG|oZzu%_}n&c=@m=S^x@SZPu!3e8RW>w$oA~?;!{22<{4TTYB>0a;uE8Kc91V}}jGI|MhIgl+> zdc(nZe488=s<{o|4BgGEW?DY1*w~wfQEYWKiI<)3hm5-I=H`amwaeYis`nG_j7u$^ z;wit*^CW49FU^3F#Z|Ld=gJ%9N~MDIp<1mQ5nf9@DNsF;uE`xnrn@aZ`wSZ zCR)dSEn0sTJ2vQZ)@>&g31>@(jkFnFFSVd-$G?AjOsS+J8urauuBWO!29@>G7@ynw*YEH9#w!FLFLFR+BXaeEDDGs+aA_%H= z+q3-fcG@)4n@OWafp}Y>`amM$Gp~trHptMY1!3amCEzT9sN}ag2361H2o>UmcTAG3 z`#Y%t*OrAl%p||)YGHzs6@bz!EKf~^u}8jjY#wOzr?F_lXdmt!ax5y^LG8YRV0ZY^ z-MyjHyy*K(tyvhRt3huk>Bx6XQ>9ylh%ygjA9R$B2i$nXt66D~uZCyDM8qy~KdiOH zw5Oj?;;RbEZe%@aZ})}^#jPheWu4~Yhhp@MyI3BsK{sI_(Lj!1NwjL2(AXSYH(sYx zZj-Db9%X=W%V8yGT)ACci7n!uP7`hRR!X9V7iyu|z+Gw7Ly}YG3wL{xHZwUXv#`Il zFr?X?KRlRl7I|aWe3E-kr$_40)b@|LGv*yTm~-X~^mu#s1p7Ptf1Va@4pFog?3y}OwK(qj_A#Hg0> z9=!-IQic9#xoaGRtcM`d4OJ3sDFJTeCKxt1qZWP&^@@aDOIA3lnEieP5XJARMCb&U z`9~C|yUGVc$r9a~pCnu|Y}%ZYW7Fgj^_E_$c*q&`m`UwU8#gH37(D4s43EH92cGl$ zCV>1S?8>JsyPbE`oup_wCN&`uBWHZL?$2bf@s6hMK`xe8%-dxir!?dm3+HWRk2YbY 
zG!cZcTYbyJ!XS&pvEb~{BeBpQG&DHCABHqVKQMLDhB9W0he`B!aU>bH!$R^)NB7O` z2MlBR0;YJrk2|BEwuDl$>hCcuPcgqF1Q3nv$fhBTPk537Z@;&N(ZQ^C&Ds^{xAhe# zJ~A>=4Y!Q&u5sndPNp~dqo2IyTfCn*gM&76 z80VB2scPliu+&D;_B!}hyTf#lQZAo)2|I3O3MdO~|4N6BfewkV&wd~{R6wEj@iGV$ zSs}hSab*y-dd;l$FRJyDd&3ie#%?H`N8!>(-U^stvZ$Msc0tBa5M<4-6>u9j9s5Yq z6xh9gbU-t`tUj1cS~OMfO8=yX(6ud5s1THP%`p%4Ws8Ffyu?KN4UwH-!(C9Hvw4uE zT6i!MhuhB=IdI=Fq~+C)b!AnUno2i7o31Z)@o#<3YuoirXrCz1Pp_(}nU=e9mN?~n z7`QB((glE~ax=9QnAOt(fw&-|Eb{1XBLxkf=%CIfI%?ud5OKFUVMW$1fOCWFA*^@j z)Nv*YC+;p|%|lC5pdTp@qk8<SmT>R|n}!xvqAIy@g11nFD? z2arOh?8fR}D{Ae}70a!-u~JcNrR$$1dVBRDF)+|s^}{2*>r1pIC~7p@)GGSuYsU<1 zZPyKkVAI6M4u?wECElpfcY0BAP^6=O#s`Q+n zyT92zDp~yK9J`v)u5a%klS#<_ya%d^`xq4)o3W%z>|9o`c;)FNW<%%HWld*Fb<-D* zY&^3-h)<8#hGKeOhquO9k;& zl!VR#yp>1<)k8UP*$&9qT*;ZspyoV0+1<0;-};KnX}`x3v%AE>=`i8Zl`NU! zfM-Wfd{6Ao>$mrnbSz5!g#5pO!VBpHkTO2jX$rmG659GMg*XtZ*;FsL4|OTC@(olk z<@4!`>0w`)?#<{{3wxBBkAbrIwy#DAtuafiom|jJfQm40ih_%24Q?6+>NCb(47Q1f z#UDNQMBApn-Gdg?kT`k0Vn~?ao?)FIMG?qh>VVHo*0V{KzMZ&^v047 z9XH8r;)7A}-a9WXF zz5xUUy1DA$Mk^B(2B+-M#?P-R(9bN-`y#Rbd9DYhJ1MAsbs#cDF_cOUSeN41@!20e zzEGIM`3vtk6wZ`_GLE0ei>&qUt6UBguosbpgr2J7m=ETbdJEl4{LZ|qPNVuIATi~e ztG=xk-eP+{&A=45QxCuhdLZkUQS0yz)$^nlr-eo^r6PUcKcJR_W*hPY?sa%_Qq&-5c7 zxr)~Nt6!GdmZy2h$usr_o(Ar{_J|Qzwo0LZ2Wl+kao#lhJ9UWsSoj%dCR!&eh-Ni_cQYpMxQDuQ_w(eNiULmfbxNA+v0IN-5QQAQ}5H zJCynZaSWz$;QTi)rNXV5dY`YZwl^Qf5AUq>0K~2k$MRU1#pNz8cD(9bQ9+4CLVsEHRQ}gS|Ep84OA%KEthJVDz{)$-)!U`ou6txzgct89aB*btBbVL_?J+t>i5ObS z%WogtVY2LU3XUV~WzZZ;Ec4}T^^=d1&l2g`-|89p0Q=&ww$Cm^%Jn#NXR+G~&u`py z)Dz|F&})xI@uDl3_ZF1DI5zeF{7U0d6_TC(j>g&HXQURb()V^tty(Hx$=z}NwPQyB z2?)sntqw?&&CY{%mYfbGW2DB3Va5^X@L{#`)LWKRi|~)8tKOFKB2KF}mdY7=Tf(jp1)pR;3$;(Ow%|Eb$Fk zhzjm4FxzhXmh8Q#Wn_&c_&7|!N&OcVfHqO1=)3JnJRHKult#w4r%^ZbdPh+Z#P-~S z@Fq()J6d*kdI4;u-Qmh>2EB=J1~%!{Jl`ZVJgIH5NVbIt>-ryCF&uWen@t3hm<|9& zUCm|Y7A`C-dSdb;=#pRnG0m11LZq!}wr^Y+s12gm?j(Il^R29aR?#Nohb_4G3JTs* zzy(VI6y@y$(~(j*_WWCnj7C81=RSZe$w1=M;{5+^NnVC5NpJb$oCn}EEjkeIZz&R% 
zJokO~x?v!gx9y7$1%ku29L5^;Q;Hqlm1>6#Z&cg)=k57dJF7slEJG&I(`XC>#Jr(W z`k+=HmIA%z)i|<+cHR>LDwgN?H7MK;QoyBY&+ZH-0fj^SI=7=)qie-e&ZE9mW1yz@ zb$?g$vy!*pu1RRHz_Z<22x+0TFbHnJ9!`0-JOazm6Q>PlBBhj!l6z3uSvS2T9Egnc zF7JUI=EE|(JPM3H)W`(~rKUhOfS2W#?HW}Gy6dq+8AK)ink1z#`dCTK>8O);ZluWA zPj+s8w8FZd-SRu%eu|#>oXaAtUuRx!Zd(Q2K9)XZ(r#^?lG|m?WA{O0-NQj-B2b{D zkg>^cY4ZacrCdqu%Z^>fb(-fs`A@m2PpMa3ZWg?aK$1>L zG=6oNEkR$Bo5s2X7iH28lRM8K`0h%@&Pw`9z>TOCiu^#UhXiQH+RJ0Ofkea4&duz( z?G(Z>V6kUxujEAP1V-Q&rD>?&$8i1piuVUO&f(g73F`&g4qp{g0WF`sjsEQJ9L1b( zDUW=toXxxzzR!I9Hc_9$k_NjJF`B2o4CzWY8< z>$(Cg(KVD+;hNnkO(THlp2}g?gVLB>@D?YcflKqXPyRKWNcNXxcxsi}V&BG(JMbHKZ}j&fPT>|4vl)qEev zEVXQ*xdY$}3}cY?wN3P=<7}CB*Cm>ZS@&^%CMr4FHtbmx`Jvp}x^Qbv>P3~)lbRQ` z9-C8cmK{$oL8;^kF6!P*|lWV#~ZHY3_)9Z+OcPa`aj$3r;=*a75Ui~$$;oMuF zssuOJrW)W0i=K7gKofm)jUZok6egc(021p0h@f$cjCK9>GG*}<;P>=pGY~&2t_8KK z27-2_t2e(_%^4^W0JQGVb0c63EE&f6BRPYhX`{RBb$ZYqKtE83Q6Z7}`Ef_x4E)kC z+6RlO%9;BPx&Al&JwV1ac?p*wEsECY&@Mx+vK0GK`m|D+F+M1N=_h`b!^+To*MF0W zK{aKrtqI{wbzdX8yteW+k#N~)%^!WX*MDT8pTjeB_7N3m29e}@^JZP1l2|{$s}C@` zARKog)s%dc2=m*;VmT>xIb4iv#Mq#3i(@y7&pr}(oYs_GXMA*&%3Y#<&;z= z(S1(HVRl1D2ACT)0Wq6?~^*0KhTkI~R zd1GT6VJO1lv-6<45tq)2_ZJ#qTAhp5M4%X-U8&%)<$arvKcYlI zxu&7=$yQu}3wL{f-?ughUb(Jlju`f!v=t+mUa%#cIf>l*G?P_2j2h3TO;$jMHsiwq14|JDqm4L-OPV5-4P~Y`0*eS z=0^L*f!$C5RNZwmWJh2p6YiM@ywdNGqJA@u3F>_*0C0hx-@O`QmbYYx&&PJiVWMx7 zYvaKO7!ZG4QDb{|_22xQXD5&ge!^~1j9QfaK=nTH08VGS4@U6J;^4MiNG_g@gN13A z8&pItEf?e|M>R0=^+g&+MBwfip!!mO;rVo*Vq+#`^Z=bFW9oxlpi#h(0<*X7h0^=rQLPgTUIW(-o(LkC#RoX0<$J6Sri8i;W+T3KvB!BYmF>RN1vHaz zwau9r#DTUYplD*I3Y2ftnqIHL@$oN2`tlFY6H0r=NOh{=Nz0CS!*yV_Lz_ zo4?n}l?kGR7Kv+AN8`O1qnEb@Whw8f28eA(cM*2Iy;0S-2C)6diNu8vNJ({hgM9(| zM5g32`W^fH=Pd2=!?~YJ2UjOZ7qwCdgO>W<4ERbhL{G#PmN*BhrxJ;lnytB1zkuEG z&qhe%TrlS?@X8%dvp4){zdz|^fe8XkO`j%|R@)sTKSba+j#L@52!m!)1_GV2dO!Pn z=DKofWSUTLK9*lK7?iOeD-hk!L&N{L!zgj{q1|2fOci;fXniJ48S|DPRQlDm1jz3W zQ%nK<>KD*&B$n~UYF0;o`zS&w*3_|X^6E0>tmzlGSnja#`svHAN^V~1DRr~s`Z;5j z3En(B89QMGr4~^Yc(Yf4;3`FYer!s>$jDfftyyD2s%dGi21_FbxNU3*w~bBDlJc+q 
z_A^Ha%NT2~O^Pj0by5Yd#3z%cBkWU3Jm8#Ye6$Y(HOgP(Zu#^Y>81L-MJF=YiseRe z*k!TtcsVj+AXZ{ht}e10Z{8}M~@dpNI;`JP-kBD(-?$; z?V0j8#RW5$;evw;LGPaSYdVekeebW3V&s6}gOw59PQcPoSd@Hb%J56Cm`BQN4a>CM zO+jc)BlqoD!f2FMlWbH#I!AAqS;za^O#5&N8^uF4b4GSfr=gNm0C{}kziHm)$#^R+<>Bbn=Ts-&CJZ!j@YgnP=z5DKUm{esw^=+h z^W`e8qP$I@?iJ<1XH6S-NwZ1wE2jM-j{*_B(w&6j9$s-j9k}-+dp;YO8W#A3~d9D25P#tr* zu;U|n-aF44lr&w|keoRTLpC+GK)Z`sG?W>(q|r_7nXja}H$V#}PHPv_Q0qQj@_Q8x z>CU;e$kizpRv_hGIb6)%;49xQef;bRGE}ydAba0-Dat}K61CbD0w>JcJvr?tR}5iX z8Sdy@#R36BP?TTQkpJyH^}ocY-Q2_>Ab{)$LBMM=*ui+~_aDikusR7fu1U@JPS0|T ze6M}w@ADJ#lM1f1w?>hsuj<+(qan2!x12;=dT1loi(CKA9Jaqf?huqBPNTWKsE4`M zr0FW1*`0(VMo*!-Tc+E3XNfwl`Cf@(Ma@m#6pJbc!@!pQ`R?4iA`zTL;>N5FxxY{W zJ=;d#JV5}%XtV3&XOG2>6j33FKv?hY8&5tOrEm6SqSuKir~s)3rU=F(>W&JK0?d9O z7)Y3m>^dpTY-`*5>d*-wKHkxgS}9l9GUbUzVAsMYV1bIP>+u{)p7jD_JrPWqHbp6S zWM6iH>cAFW3W13G0xKhB+~Wl(+mfmk7l*CKSms{ByTKF)`+n~9qj1Zuq%}6 zi7&I<`goAWkn_O*a8EfDKyiProE#r8RUPeUI_xZym>)6kEYDSM7cS#eB|eeG`Ef+Q zqaR0uA(>}PsLHYO5|fm-uPXdu;G)KI^Ug=^CCf#rKD|Yy3HG+cg^3tWl;Oer+TCPP z2>2QGDF4|AXa@2^IlQ+?yNS(3qxCi29`|$L_CPqhtyo{PV>i&;FSkEXoi~o_ zwR?$v74~zt-3v4xqa0H$)ZQvJmlJB#?G0&J^}(UtT1eej)A%tXY~30Wanoa*+CC?u zOyXl`2HSIL`!Q1O99Ged#%;}FOX@eug|jtQeU)*&o;w{>+w4X!-$D+?F~nvk#-6Ak z@J;LkVnBff3TfB>H3sC4{G%mQ{~`7pS-Jpw+^fi|HmI6v4L0VUcd$c>X`Zhgpc3Z~e6$>;WB1JO_sfJ|hOna}$DS$n|rHse7bfN!AC zM83xa?dolww0Jpw=I7V*HGYpm zs<#XpnG}7Jp5m$vYQinzGygQZNC-^KK@UhgQ;2s z|9%nCv>g!3&fqbNZ`4s@(JZP4@k?pX;FQVL!2;OW_&8y`N_6Px=I*OYEE5nvEn2RO z*QEC8G@-^|D8I8OyFakZqm~Ph%JaOIDf7v7j-3--QllN6Ed3e+B@;zcJJ@)1`MX1M zh?|Q~Nf!E!Rw6B?men(u&Ek#b7O(PDY4Uy$f1@rSYQ3Nrwl+`}MxwTLx!?r=$nRsn z`&{jue$=nIAWk`%6Ayf*@(t6xpCNIRpuM-?+4H6{u{5gxfz0i>^y z4+;{>rl_X&2M5cQs-3=?nj0hePZ~SeJ8znRN`G1V9HCREA_@B~_B?j5EAT2#R&Jg* zHAIKsC3Q2m|L90-d0O@_hZTAFoER z2%$oO#>v;ApkelD2~ZR{f^O@(r&_hkN>4+SR$N2$w7x)XP8Jd#>7jYj|J2JkA!cF% zaD{Sq8KUse7~Gm0tY4n<@F>-ob`YmNh(87>9gB^1mbp452q3pOqfek%BuM{)5c{UY zB9A_kR-jZ+%(DUPFLHNh&_Q>ek*$SfVkZ0X7n{9`a-jTUzg+D>{kF(C)SEP{d5Q!R zQ&Fw*8tpD8q+upKn|HnkZ_HTxGb+n{?fBG|C06UE 
zH$qX@!hjxP|2g<2;7Rh2@A}eGpoTkFhO)qNy-9ml2X2iw`A~^zxu; zr9TUBC0%EY9ajxhuS}a$dy%1_r#jWjgp055GCsA?J~#<*AqHk)h=u>@ zU5K6udi|sV7&O5>O&&8vGJOdly9h30g`eYU%K1_q^vc(nhajxmJ-lm?!fvBMpdshe z2-DHT1>5DWQQ@HL!XH6HIV`l-+P9$QR#HCdp)#)PD9I9$AuK7PsmMaRBQPvRo|V@YESsZLtl7`eTQ+ z3GJPhP~K||xn$k!`Vt?mw^2+2|IlUvaZaO#ClAZrt9UKxMUF-WMJ8}sdh(@{*zEHz zc_69M_myuqk#25}yM|STVCrogMNoneG#2Yha$l+Bu7-6S+f(eIMs!Pci%-txBOn&$ zQf1*aK>m8L%5f*~*dF}ryc4*~hsoL+#hhdzIo%oj3aSKI3c1?S+0W%)0h%LZwt4kO z#mtA~>%{n@l^u@Pcwfb%ZfSf^Fn?dmqfn}Ac1>p^@2*zM>I8&;YA08sXSLt65c*++ zS-WHRqd8b0m>W$(*~w{tUDxr>2hhGiM(x2HWZ_b39E-RqXTMQ8*BWb491gDB?6|x9 zb__T2^rnA2<7aZsB|9$gh?y85$5^XD)Tnk5|M~Et98zOte~e0{(w>12!xX5?kxSR) zedO7(D6%k~C%5HwVZ(Y9LMEcdS)pdpO7gxlRVWd_OYFSs32_nf%Fhp z>UBrVPW;n;O@U&3kJBPRFR&vHlaaKa%^QMCkPH3iqJ{sC>;jo;P|{!}8v7jwG*&!e zf+Hn(o*G0d5fh?<&-45%qaFPh9qLMumz4Lkqi>#i42dQ@Zr&sz|j_M&s4M>sxM@=U7=m31Gzz2&!~DXTPmLD zmkSUC0zksTp9v(M*P+R!W>LZl!VnUYjn^l)j8O-3-3PYjR`fg59>s8)-OIb4fv}~o zKk&3Qyc@(-a6TIK6F<144ZHI-b1L^ZLDMyP0u2vqMHU5@9;Wfosg&Yb1Pb1>?4ha; zDW8}j?{_(7YJNh^*jS< zBiT&6pRKW@*~Z1z=9NMDvL94(EeAYfxnHV>v)rt>e&FpL#ccyMof%yoID#pInxKW9 zl~maXCOVI@96x+&&`&2?*Wvw@Q>z^W$$&yaXeNLMu4FglN_Nw;Q=$HANJA8IB}49g z@uPf8EqU|!?U8Ws0Pz;VrHVAr!!*X`WF3v$=}<{4Wb1_3cUN?WzD|c9AfXx&L&#sEO`Z(7u#|_AOn+^!u>=ZF%c}=UO_YIXU7+U&zRw z(YPa|O42cgQ_L`lHu)}fVrTLI%n_1a6zba3};2ATWg#Afyi( z+`n0ymk>We3356UngJg%g_H%{=jZj+oI}O9bHg0#XAuB!k4j_yTCe{R-4`zJD^*PACHULAW>j6m;Mt8*{4dMTPG{^hF0X#u4w{hRs1bt*cm399QPtdxFzkr*Y;RC(HD?WJa zQnRVc41?7%ee-6TeyX&toADyP*W{|rv_R64Ect2J*Ll{mrX3$4T+?kc~8MVVMX8ZNq()0efBIDJJjwkt<5 zyK9b4arRsL2hbVAODRAt;R&{GjRb(>sl$9v8^=OWpE1b)S)udX2POb91ZoT{R7u1A zH;(+Q1vq!+uINuWFM+ffxmMp6z~#3E;Dh3LQ&oZG1q!fczl>J+87za!TT~PtgN_bG z`ZoWgoYk2)+3Zdp)_`~6QaPaWHmEp$HlXwb5gvvAJJj_aSOa%<2u^H|I!JzMIBu^e zQ!GCLeWEN}L6wdduvi7{^YxMw_trss37Tt(BPU;elmkuY-=SF}v8b^F(%#zUU&lLR zgF%&wi1S>}#{{vt9y${}a8-D6C(Cm(o2tpO+wY{jyqagS)ssX^OlY=v9M(Mn0Ox&d zjYBf1i>1YNCM;RDn!CPCfBVONJ1z)zO0Weg2GC(HD*7=ppVq zE(8Ih9~@)W*1ltM+(yjAMtJq>yc0-*Lpv2o0jgs7CZW2QNp;6I5Pti^rn>NtKfqTC 
z;3qd3G0s3&qR6oIbs6g|`A|v87Ph{EJ2_UsOF})J5c8A?4MOQIOT*`e}B=eh5nqCD?Dc}um-|;V*bBt*wb(6 zfWYrWl?s-DV%g*r=OU`2M{S_ z2!b*_Dhk>T?%vBhJXRB5(0iXsERTNR?9Eyj!Me+JHW#cXo{%c{^be!~#8iSBKq)-( zZ}`^(?i&e4`r2>dm@ETqclPX;a%&Vtg2y184Q6hJ41X@KsfSA>Ew|d{y&?BUyfoe)ysF$R;k1L0 z0VtTm#790C^MA;seilDps|H?Lv(?|ps_xU^k$TJ_h^arEQTB%dH>4cBN|M{Ut5d^FS4)NgXzJBNMp zCEg~ZgWaMydH??Xn(wu1ROry&|M|{`KbYadKjNV{h=V~wmNeq}bbG}L&=w{s>$TFI zcfK|wK#%GWeWqk^M`&r(*y1$M`^62egUi0{3_9~20scy7bbXi6oW#IkHyG903VKtN&OY-Z=&ucn zC>-!hlP?cdrt~*vM>JpJ%~LsKlD}zGu$9c2CwF)Srcs{G8EG_1+lN2Wkjsuh)ogUD zPll`W&NV3~By2&w931gua(j)A$eHMw=!57^OLdLHX`Y><17{c8yns&I4y}n+a9fq# z>Z_Y_?nYtf7x4XHczZ9c{R7URkun!WOgd-{19fr)a(si{3n(1`6swA zdDkD)b=Qqh;kaYPlcQ!mOzXKXxI~*XJ{qIORf{}&EvnyhZ`fp^e5y6}U}QBl$4*Fs zdg%PB=(H$s)lDx|;!Su&ZM;>9apj~1ov#o)DLr>QSyO+hZ#V{H!CCicLcUK)EI z_{ekehEEXs)@#6&%{1cf9yj%8p`|{Ydx{YI!~Faz<_vJtq+m0%@5h2>h|9g((DRI~2#+*4?+W(N@+IQTteHP;?-tx0jy`SQUnMU5Lb zEgP`+i1qMK@4tNfy;}2-;`RyW=G#Y!>BvHPs6UO%JD;wHAIxpe>|Q z7loX@T#y>vNIcqnUCfQo3!=zF;$-aK-GaS_iag)QRq>t0r#R3 z)U4KJdpZ61CR(3P9V6$GKzsfO3+&yFooaju=iW?*<*g2`z7~iL35?F9YvUwBs(&Sa1*)n{8nsStkRhoUd+_WN!*F-ij`%R z_8Zi3#5{H_l(=aMC}%v&Ck?Os%Hh_nTdgzG)5X{2gPg%3!m0p_v4|tb3-=r9NIeCY zdWi%kZwrZ6EnB0>ez<47*h92s#Os0%Zlff_cNfhrr+sm_s9&*Mvi-N&k;3i zZ&njU8RwLbT^}4Qnhla3@_%B;_;9w?Cp29vGnWobSy@=}olHzjM87UE+yZA|3h@wD zXnw2z?muU8_Q~C=U~jbUaOt=LlGvV;r1T)xvCfxHaeHJsgZFr9m8;Y4OT4gs?D>W~ z{oX4pI}xgRsNLC<6B&j2lyBVU?5ngAzP@-3CWb2q=cD z*7|@M<6Ynto!3JrI>BATv2Ut&CjILz!x=I)H#jLyPZ~S^ZWnvM=ZS5-cdRL284ln&gpXjCnAF}`^ z(52e=?MCp$*MKABdW&uyS81Slscnn&Xm&bX`~#;6eOg3L$ta|ZZQaE`aQO6VnhSN_ z%W+vyt$*4?qG*eHbgCQNRZvD?CO$LO%hueE`>V+j-uf|CGFkh50I>vdUY9bUPW%4> zbqI=qtjKMQfmql9ZN#~gdtxqA!K-4qy23b#RI`LJdi^7HFOh)lDPPCy!MC^;^Zq>C zaI`5};$A@!=3WnGUr^&Z<9a@7h%cbiwfDdAh1LN4Qz_H!ln?hLi&aR1wjWn%NjBI+ z8>IO?16&|}ICzD3peXpmJ(H`L_N%X^Oy#be%|i=5b8NeKb&$z9U8g_DZnwT3eZ|?d zUa;`&GQKD}AR~me6 zfZY+$P>~F(7_*y_8r%7~^Lw2+Wo~C&$j6PuQ==oi`>LceS;sQRq1(TXsDmpf_RWId z+1}shBQus_8hqyBqJhz<4h|I`KAh_-EA}%UzGyB;;3iI7H>1{ANJ=0>Bd(LLZX7J5 
z<+ye_){Va$Eq6Sq$K;}~Tsz#T7wzE8d_A=YjuSNSD*u_~VX89@E^%GqX4vc8o#Eby zoL~UA>S6HA#*@oUAfK`aUpy-OsE5Opjt===fdnak6|I;AIwG>1Jdg|-q0%5Cd$y5( zXJD)%z_zd{|NLDS{;H4Z+iha+s%WC1QD5-~_DoOkTx?dVCZ0;#3G5d6Jp6d*A$Lyb ztMq0wqqaUDOv@-A&QD-sLj0oDa!r$lgrN8G>AT2S0R5XUr>Mar(lULI%b?|@Kmro# zY-PXH{4cUG1GZ|sUsP(02lIDy$pB{1-)9OWPA*VA-VDnR;SuK<$eP!=G&`NDeAY%sy&kg=m!@Brc(4m6^n}-xXl%MJrAB*-I z3tbt}0ck75IfQro@?@D_**Cf3L+m#>58Vs%aZ<;!2hHFys;zy5NzIy@pO zJ}|eU2jDIo2N3`V5Gd;YmW87P4(x+TQQhuwvx0p?(S zi>XSBzv`QCJTPi6ps^q&@Q->+uY>Ep5y9-SuRis?Xc)8(J|{3>t(|_`4{1v~E`I+) zv-GYSAk>5d*BfAi0NVGue&M|Ri^st;{=%aofD!26swRX{=h+Y84dVXt<*&KkfCPhw zvIJ0_JmCG=r&9Yuvc_KmuZEX;IVejeE@isphpW}zfcW&nRRc!@$QXF=-4CF|B63n8 zqu_6|`}Ii$B#u0_z+%uBqFZmmYh`eIA@Kc~dIdGZXKE(MDw8jJ+a% z+*cPHKlVJc=%9qmcHb^ZC-jLsJ)oj;;i)JDc6!zEV}%P=bpG3OQ|q%E_zeQQL-d6^gpLDPdEj#q zV4(>x&I^a@uiN}5iWww=3F4$`m~?tL{M_fdSY+j)Y@*JKm#Ao2X(#H4ZftDi-^oK) z1uTD6=<_(SPu=2gbwj)%c)$1uyS*dS`7mAt+zS^*Mg>;=J|$TJL{R_(`fG_8{gy)$ z!P5Lqo_FI%Q>Ah6+`|hQ_gDlurwi#1d}HYI|Nj`uBVOYEijVgwNTqL_@#1`H~JtQ_s_AHAtfo-wE@1eiwBCB=(l1B>kc1a z+e>n)+klZR`TzMS=o2!3z!-Q8#z3ZY0p`uA3C6wPrDwl&Zv!icYRSq^xiCI#BX(}A{l5b3e7Fo*3v zQ_jvJ8!sa zZ?TC1T4*8kgH8QUXyW22pGONE%7DW#pA{F00)kS>JH4;76rQ7#@i>qh#n$|YXhtfK4)MX z%(IHdTzD{{wc|p8*RH#SG9pp&8Uo$1k%q5d-@8!J@Wn>}wmSd>c6S02dVQ;q=;B=g z?|N1OhWg)q0S>;FKi9>EsyPtwjoiGwIAl`XVBCDWpwRC)kPjU!!{}~9vkH(7FP?q9 z@P2u|7QiW#n4i}WkztS`wz<`d^}7j5A%Qp2iX|ws1GC0>to*_UevStZe1+-&+0U5J zQMz?8BS2*Qr=Rmria*Dm#fMeQ#;lB3Y!$!Mw;9+tjkJUpoT>&K$cnqG%l&&R8Ca=A zoc2?g*I~woxR)+ep6+7MNiC%ZW*VO+SYovx-aqH@>yv!20lFL;4bPxu?%AI@KL1#u zIHYTa8n%G0=?3G3(1tIz8o<^WK!LPdO&Uf$9uCRtTo>!EZlJD(I9QOtuA74i`e8n~ zSi=Je$rlY?Xx_mvozZjoJZ6n&nF&RSFE}sm#DIPle1%A0%eR5AU`g_irToEN|J!E$ z-{5cH!RY$O=l+=ZpZ|C*4osk8-}Hz}?ipYs<(&oQ&%;>%TzDejUVn=h()d->Y5hz7 zxQ>^vxN-iA%KFQ>x(*pwmogX}fnY5o`|}+C^PB$% zOn<w68%LZ+{ILh6N3_(1kNj{G;a^afAiu(G&vjZDfC zJUR+f$?;;+07icR82x7{$DSO26_ups%7x>a!Sf zhW|IezL}znDXqjsk4o(25X`W(b$Vt-Rk|ay<8-e~b?_XJsLjzpWED5-jSCs3w**C;(=)hjB2bL;Pap}e%Px$j6ST#^Mpm(_VWf@Rh4=qm^ 
zBj}a$lSfUMf@&N|z{|-XHRHs+^!Pu_vYtb5@(T;_H{KXv0Gvy46?ku^1Ct#W$}7G@ zuy<>nQXnKMCk$xASs;aSA!)%hrEYSDJEs-SKi&I63z<2Psu|fsThzFo9F0o(8pEvo>kg6O7(;LCKhzm~V$M-+QN|8XUw6C+x6Uvdiv>!Y?bfQ~O z?gJsI_+_vt>$u$GUyJ?wyxnEMbVamGHxqz>yIC`RA;;(50n`-AvxgXx1k6|Mx0&z% zF}4eyk?*H038)3l&*V2m2Rj!3M??si3`iG;J>5TnXDBy*0(KmqEg*WyD=^K!uk-8^ zDL247aB0PhP*NWX2c)PnL|r_Tj|^-xBKYoq6??kK^lG-F0v0AEObWMM0cPXN^XT); z!(Um?-3aWyyd`4>1s|CNAng5_6rZmO1yV=oDY^_X)duKrH7` z1pNFrz%VoZF~BbVAL;p5Vq6CTQ?W4jhzp3AGW@?k>5NK$wyhxf>iyUG|CzYIEQ9~9 zkq;mh0$G0lJ1vpt!xENHlaT+A01Y7EhrhBYC*j7o1s0ycfrI;7mBbEAkIY)ETQB&f znCcQ}E{3yEg8c}4y@*L+K)5Mgm4tr@ks~1!+vvl62+!43 z-)|PN-FZ~NBR#rhJ)d0umG+lXV$%Ss;LN&t0ABd>wISW_cHQmsO~A*te@X`fnv38O zum!NsZX6W58t}uFJT#`aR|}-Z^5W};tBu3Gp3;`NT)FM5J>d5%7Ul`I4)WH>cmb0g z(l7m9hYSFU&ak|L@(kYPtdIyGBXs|ymGf!g3QD>zHyxps7Zh+wfPdwEbQztMar&_F zVNchZ(HYzTCjKnH7qx)P_GVX~)%uF-lJRM%pF#5;FYkE92QL>Od+w%zNKd>+?8Uwo z#8FmYrz{!1LYC&E(MZ562DyCUX39fF%5^4&J1J7BOt`8lhRnQhL%AP6 z@W)wDT@OdTBZR#EM5kj$S^`5ZD6*86amvZ9PMu%*t8VD zE$lNgI~gEo!lMOp(|6_E*|dsWjK3jH?Jxa*ti5$u)LZvHtOyDyh$0{$rGy}%q;yGl zcd4|5q_ilZv>=_*-H5~hf`YVkgGdh@Ll6A+Ab8G^!?~XCd%gd_FwCb`-s@g#?|IOj zO7t7%$QrvBf2b;UuVvm#QByEx{)(_^K7{IA1r8ACat;K|P7Sqx4Jk)1DGGcwJgTYo z7Wkmmm+U=d$SeNF_{BXTNu8$bJ+pLkH#pKW_(7$PXztj0PpBOJ(NRY^3yPM!V<>XI z7TPL>;7m}-fWIR8j{m#A(rUWFr}OhIr)R%B>YHe2sIRrjQWAD(qZQPZ!J2!cRn6sc z?Q`+~(c=9ImxJ*3G5Gga^Y1*}*+NRyFv;XIod)3D2Gw*9x!rYPbwF#47b<`*_E z2s-GaPvfr*u|0}Vk#VYhvc`JX#Y!4nBd%jeodt&b}FgA&Fx&IN-$ZX|BOm+bdrcQ=U1 z>TZE$TE46L4L9;lm+2b3hbO)h+K?VAUiS@Bif20G<_o@WG!yGW^vfF;H*U8}6xsK< zo|lY@%E|Dq0Z>(7?XvCKehM!vr~}3Q0=TbXM3Ze2M9T2^l-R%Y@mn#+FPy+fHN6-? 
zgl>{$*lD?A?`ioZw?&9t=cQDwL)HDr$hHf}G}N*-vsT|zy@>{I6^|CThhM{#($EO$ zMQc$u94tV7mU&0nUp({HLjo^Z96>3Ejm5+EcEJu}Q00h`qAgO8SPW%#DZ+w5A6Yvt zYV$TcU|obd!}pBTyjV4RXxhhSVnos}}{SBJPNo(jcx z#dHDB8^ghhTOYsnK4`z42RVucVD|3D`DlRIk@xXxgwh8Q(@=l&fWIVegqX&h$m63v z2bi+_tIOYoJT?sMi|e|*Od>)0A&!UoR{@F5cK-tg&|^M=meD_P zufI-y^3Dk@vj+F(0-%+l=cI-TQ0v#`C26ST{1rwAhmyr8Uwq3y*;$%T zpFzvy=EQ|)(OL(Qk(i7Ey=ozg_Hcr}Tply~*w|MuA_=}ZJnzNQwQ^1))m>Yj+3~E7 z87e0F+j()W-&EcAZz=jiFzC&6IHwk8ofW7816jIyu)a;CM2bkuz$Eb z{20mIDQ*e(isXUj!+e87`y52VLTHEYGg2Mu8ytL0LS@nx-sT(5*UV%<1tkRcdt{V{ zRHr0>z}96UgJ3}PA}~Rg7u)J*x`hv5W$(51wjrt!6Fbhh7f+yGKF+?{YeHk0Rdft2 zvQl_cl7d>;jsxxnT} zYWGA$QI%?7P`l6;8!P_lnX7A+l8xCW5Fy_)AlL#aNkk9-#5w-)g6{_SfSg~3oe;gI zz1G^-Hxd;t1-Fe~9JO+s-4Z+Nc&$`WMHa-B9*>f7k5?)#;P$AJntjhF8Oa>i=AI*{K(4VK+-s4#4$q7gl)QxS!xgz0oFZ&o+`dq&$tjmz0Wt5kW zHdu=4{uLRv&sck#QcngH(+DQjn;PnVR^UjF0sb*1-1+7eq8~CHEZ``ntGC+}%5tt7 z={!=wH zMT3WahMgq>0aF+4tNEOQ524eF7=H9x;U-$8LZ&M5#J$-(W(nL}l7A!>tn@6tckYnM zMaM?N)-R8=b~bQ6fOB~WDlnc$3I$X{)`HQE-^lQHFgt$n1%zz8^^Ko7TCxnI1D@|K zx(FwIACD&Dt-@tF{uZ!~%-hDD-xaefub(`hD!WgnG5)FqpvqJ`+sOD5(0~A?A_|Gm z#26ny)WC=xb|*vThlu9!HZLxZGGi?_Ro}EOLLQ{KSsK}Xs6jLdA zn|?g4eA<)K5rC4e{-zf8E5{E1g>V$!)%q$=8SVTBE#?+mTBV0bWQd8PQwQ4Md?V$7CK4vKPSSY|oz0TqBh=1R~*(QL*@UqXN+dF#?g&!M3Z*4?%5+|7qsv z3rC6w()QTmZ~WsJz9|$$mS5Q1QM720aODYcuGGp~dtzqeenRGaLrkt2$`1YMSAU)8 zXhu2Vio2)TP}`L|ccQ_o1laUNB zYmL)MeeVcAn$~&560NL6he#sq%2MEAgMkFaRs#OD9g2uaPy`*ZyrW#g$q&htP8HFT zsVp$;Fs`mCS*9FK#vA9B1s?H_%VOp54aSd2>Xca72Zm&^ZZ6v96#|RXnNF-Oss~Bl44e@a`#7I7tgMpeR+zc+Q*}!ORaqPU2?jx)~zQ z<~W&)70};P@`YKae{gGjRX+*DUY!F2JCc7sGjM$OaUg27fc#O*nV-c6;MU>(+2Cq? 
z>cErHnNE{6E1Kx=V7*2N3a_G<)$H|L+$CoMo#U8h zq6KCRpH6T?r?D_Aik@^BSA8#8etqoQbd2YwPStBr>Y!;u&RR!UjEkQZ8W?J2VF8U^ zCGaen(!yH*FpR&2Nxp6%Ah|$>!JjCw+k_+777MDDQ4Cg#hY6)~Laa`wE>nUhy3Q0= ztOuF&cWjJ?IfI#Epe~P0ZZs^TOklwZRCyMsG;(a039{g31RTsp5wnPeS??5N+lMx8 zM_2qBkN+$0y4Ry7qs=ew4743fHLLZw4^ZE4Z1V!#a9xodfax#j)W@i$tRZZMVGXhD%6QVcAio`0I9yEy8`<@v>fuaYzILt&rfZ7nxmpn(iOWSL2m?rUiQqGBJhDOYr|%E5{l7e;Ah$)c|dur5Dlw8;-|Yi zvZE(#6L~c_UppqN&`4Gt^zS;wGW8I&` z_hO_G*SB49_eTNE*pLG(**GvifKLU8F&xqL*#VG$%FvkVlvYljxJDcC(GRrlp7VKQ zTn=+7hHW5-6d+d{S7bLX*e~($L$s?|T*lHWrSfhUDO@}`?j*F48oIk$^h7@^;fRZ< ze9o}r5P#L3A4v6L$2u+eEHQ#65_i|4F;#S64T*T2wY~A3y!K_)lKQlDPv( zN;EBO_?Cecugo=XB(8P)zIY)H`mI(VVuljFeRcZezus}@KwuF)pXZyPO6VAldoi)} zn2MqQra@`x9`0z>cyN^|?P-|uMnTt3gi|tiFd}37co|}3TtP(50g@PM@sI`~r!k#c z;Zuo<5pC(CF#$}(_}aa*?%^+xr3mgeBOMT%Y#Aww& zPr*lqWbF2#E3OU@$O3E1+$2b^!P8(Q&ZvP;a>)XIO6WpR(#b4<|LNyAdFOVEnY#OQMz(qx-NaHr;h=H551+(5<`QZ7V4K$(4S&JJ+bJc z0YymDAfGmE3Ca<`_o?(Wru&}Q+KDxsoa>D`@IdIQkX9wg{1cp3Dyr4SVA`gCuxR!; z$6jMeRs2(=V96!=(KC)dPT?*E0}$<6KuCP9KruUNgKy8e*fAS;7nF;6v^zpEPW`cZ zaH@lH3ow@t?W&9FNu&FDF7ULae*r}m-q4qyc`C9D2l!ABbf7?kN`|;Je?sctnm741 zwJO}}1D@-G1b%mPm5KgMw_T%0{?w;11)naKd*fwRuYtuyMGY`?PebYJ2rf_tIN|@j zTY&~TwOXcyJ>|?`)3CI5u=*y#yrhYyy=)IHyLx)9%A(tJ%o6SNYEAcxXipsw=JJ|e zm_G*{ii2#b0Tvol1ETg@f$6``JA(VV3lx`Uq_<~}m_?sAm;9*yK_i`U?y2*DQACxM zzNS5SuPk)OQt&)}kDAXl9*I@puu&5|XP zB?&=4E&rPVHIpcWpnSAzuLJNNbZl5+Ud703fGZCMC_h0)J%O5M;-TXOB1(&3a4R!p zPO}e$<@Q%;dFuG}@jL`73Yn)1zniM0tr{>8F%3TQqw>qPv;5z!@oVf5LwX+wWnaw} zsQ-`~Ft~I8DH4fw6u(}E3bD|C1@m(M<6vkGrqX5575WO6VjnxybCzWF(-#;<77ovy zFY7zT@560Ha!d&9%xEPCxl^Y$5=2cYQ0DWgqW>>EfRZKWD+^O0s0G znkvVCe51vEdO4izr4x(K%|V3rrl~CuDGsiH#e@I*IJPoyF3k&ITLe?>(@MDlIBScD z!B*~jrhoCS(>`;Ew&VXepTZbgQwwI0r4Q!f&84d}JLZ^Di|5@4bDZ#3MK*b9GLnZoJabKxVE9G;aP%9 z_zV&a2C-{n&Sg;Cy!30ZzYHz0_(VP-jUWa8&}TPdS}s5#M_J~TV^5T~323qZUYGS} zfIL1l9Sjy|3tj4ZSqOAVPza2CUH;U17u|j&fM9L8tjc@P!J^(e`UVFUL@hNgTk`0H zZ{tJ^mY%jw@J@OWxI}E4#^NX#HsOwkX94&puqG4%=j~F-xiTR`;-7`zbE@b@outWk 
ztt-(psp}1JLWfR?kIc3fUW9?M*eO{8mnY0JFZmb{Yn^$2=fP7iPR7cAfhd|HfEF2( zWKRut9z#Mgy34@yr&PfnEPP@>=oFthPp6d=spZuaG!Q#Y!?(t&%T&Biw(#gy;WyL= z3Js;G{$z~;!=0Hv6<+zUkAmNzfs;Wh8nWXpFb^||6-31UlJ5!dCITiC|I?BmL((aMkT@uWr}o_0(T+J zfCTL?bqav!4h88nYgqKgsb=blU3~$O7H4m^z)U#khNqmLKBSt^&3TxB|Ma7N16?PsW5!via|85&*uZ?T&Q%2rqZ#;66Yq19v-BYlIDMZvo8c zT3`EpjZ-h!K_HsCVp8#Ip_aH4@JnXLOQ$@R=H74N!O);eb~YWjtWo3P6|I0A{P2j= z*c0T#;+NI@xc@a<{TvvpKcT}E0WLzF<+#PahRAJd;+>MTxtJ8G|MW{z3&aAYBT?YL zQgY}6JAa^oS>}@pMVt3L5#U2fNq|i`-&#DC2~Nxd>v-%5K|yaKcg8cMhmSUi|EW8$ z+WbU!AMd62y^p?M$%6v(GF&A8&0a~=)Vj~@sQ*+zs1ETChEyG)jj*PgBT&%QS> zrJG?jvz@lwf!DhUd$fq`2aZLX^Qhh#BD7=M|J1-kTlsuZZT!;{l8#&FVt;$V``=cjoL!2XKL zS*krp7VYkZO3Q8P@@ZnvesOX0ng}uR!NF^Wt0E+@6)#ky--8ValJ0;YpHsf)7w8|V zmRe+r=+P9pn|Pmd??l|rSp&(yfMo#-04bMdjjIBgATl&To2y9v$aI)RbZzF!FGO*h zZ+u&sn^PM9?0swa7V6)GaQpr*;&5dB89zi`SYcie1!lsn-3sQl9)XjRJ?1>#uj04q z{C;8N1(E~PwB{5&3M`W=_EIma)BH@`B*}r>r5g+Wj;k%QDG2Gw^14fdEg_mqt5#!H z;n@Q5WkD7dSRM5&) zZ;p6ou;A|ZGYkG0b#l8VebY^%!9d4xiF&;KaMGdm<#&pcQ}IzF6>$#gS@qOc8I~Fv z?B*7{3L1FvAzMaI8Eo$4z%5{>~DP#0wAZ&$6Mqf+Ms+n4bkBu|rZVNPQ9>A$(P)R^B z`*1tqUHQj6hNEXbnH#@*3CQIZmKlPB#CjDkQ2uLAlizat+Y9#?5YwzKmT3Lju3?JH z5)coYz3m7HaP=WC+BtImb4T4VQHGVS3OC|jDJ^W)gGG$1FJ@QjuA*7Eb*es= z{)}dv<0mY)TGHS}HQXP?(W-E3j^|>ioE%ml$V@T=)^G^VKoV@h`;~AQ< zyZ|~D48cy{@1%#@d$%b!7~PeA~R7)mYR|E%hi!x4%D_4-mX(>r!Nuq#(u-EJEt~Z zM>JNQWtb7dz`W`mJZHO};!ga%1780nyG-!93I_ zR6KV17jSnr?CszWV|FW>D|lSmtmAIwzL+ob9Uggt97Sdcn=wbn)7menYOPwUH1uLS z=umU%Q_}kk-(v8XuinqO4__s$Ysft?fd&WHj5>{O?u`ctVv584JQ4c(uIb*9GB3}< zdt8Z|(%lXMtUN=`doU%9H49+Re@*sV0fg9$-)$F(7=sFCSS8n)Ud=!(pFr_S0R0;1 zZOg&SqeRmdJL7zdw_LZ$#(B9W`d9oy;<2Q6S{uAWq@ABVT=;$oqd3jG;`06hi8JZ5 z>eV44yzKoziMsMbJ3DIUQn%%{6~8eDmA<0zSZAB<**zU)aWhN`EZ-W!v9-CC&r+Pb zVSDAiZT;}xG8(rkr-s<^iTFW$o=?VmiHFwHiQ&BWH20&Xl*|>(5DzT-tBOecpbt>_ z(RPTW_CUwOVrFMHQ<=fa!@N1Rg#7~)SW`TICVIdyPVKb{fl>xN2S6?g`GV8tYy2n+ zQSppEnA#j>iDR`>d^n-j;5iesn!M6Y#oI=>Wp!f-r&~fGL}y@4WcK+?EfMhN)sE8<|({tM|!VV^Vkvz-Pvy5eq;TMjQC 
zZpDPug6}7=AAg_5Q}rH8?E?m;!#b!xlHbe71HuxO1zVy{;z+QKGjcd~812 zk|8_h5*RvuZVy&amfUzTq+^D;?0Sgj-K()yUpmG+lTjh))UN3$`QrO)OoG7yTlRDi zR8QiO_o(?AWzLxD=9p959*p7O`H>MrST9|F!~f%soBfkd+pQ{TG%$W z5+StDej7b956l)VvK=RrwQ8#Q5Yi0>*XM2zC;2aVH*i5ejH3k} z!YqHH1!8*hP6ja!S|aiY4PvoM**oT3Qm|mpU$gFi;FqRBVq&r1RGQP2&7%pKRqzH4 zx7YIEWDijHS48Z+4Y?2VYPIY~I5(`oB&3`3frXcKyMxKYo#Z4hSJ&)In}9EZ?08m4 zHE)A4x4!j$LY`_?uAgtb-^)cYp($$rsO@uO-mjafj>~M5ahb)@TlmtDsvbR)NPd5F zfNl(x1U{*P?nY=TZ@*^ueUk2j(k}V=at2pp`eDZ*ncCMR&Xx~1R1P_|f4s9?hE${S zR;|aaye2td9zK9@+8@q;9CBN0KHLeYs0I1%Ua4~M;T4wwAN}##@K^%kgM2~jp*~>{ z2;Ru)uD$ZtvvcB9nj8K9&604ozZh2I9?TOS$o4k3Dmv#W z(^4-PX9L4+gT1{A%aMH-sbaVL^H;eUFhsca^D93VUj^22yzx!RJ=K$)D<}}m4)U_< z5j6W368!& zdSBatMqkk(gOfc{+7sBJ+y0Ymed?#wjA5vPg@I37rR|vP2E+u&5+w&{3d#4vDDdwj z2D#uj_lqTFGlea9yz|)MZF4@{|DxIK(r(Mp=Te>9FtN40Y05Jb)9w&wYQ7p#T4W=w zA;(WdjEuF->#jClF*(t$5=AK&52+T~e4w=~I_@dKM)B+mjoQcEpgVO}Oq)?=f=h?0 zaNX9UE8?py4hcz`hRP?B7Y5zLx0sqnAZlgWUp9K>GiQC3r5iO-NngICLr%04kuYi~ zOLdwO=I&Cw!9l9anvRE!EqL+kc@^<%d^Ly2+d&)EOok2Mf}9WJ%p7Qx*0u-N@hrX_ymo8qjiu6G zOucj6dOB+SR=|zPZ=2bY_&tx%C#P6hlM=K=A-5SH%Myvapvzk9oN-bs*vy}HMi!$@ zGPq50ZF{CoD?`>IQPABG=5TZ830pIAw^^olpmxwVvj+W^Zc%UZO(v7u#RplPAJxQ4 zUF~$LuHVzJ6K9weMoD4+m3MQ=@v9@O|!l`%CJ>IOlnKiNH=#Q=!8m5kZLT>JmInzoYO%;p36z#9A|DZhWTX zu+3Hpv94XANAM!fxT2Z7hC|1qjhV5hho6gzlSl#Otcectgu)EjG?}&XOd*#T= zes>TY&WXMOw~6#P%$F&(wdIsojWzkEv0KY4 z75Olcuk=HkO_jOE2M6&opxJIRBY zN%@e1Dc|N{hz73&rqzG_+6e#j@NESN)z(r&2H-(H~s#LA|+rS1t z;-D@vt0dY9@piabzO6w%1oOp7wb0PPVW2 zNfq_vXv@}L$zc$?us`4Mq6Hatf9Q=c zyuxJbOgzJjLbPxpM{oFSi)^W1V+2_VZ6qx0E{Xjx&1{%d0V={)r;w0gmHRLEleu3x zx>|jIhJ7Aclm3T}TkVXMH11hZ?I_$mLWQp4zEB57nw6p-@*M*5H*OVc{W zTs2p_m2}6%$fbdb^})|gUVHu8mYj07LT(;51%d7ax?1oqqlN^{NZt9+r0+9_q;Q??%6=eaF3GX;YFcFbA)Hr{lyx)U1}&RF>E8GSQ!x;=w94y} zsOZHG$(r>WMio+fck&wuJIkcCk`R(cmeC&9^5 zCAK$^%Bw*_n_&Ldv@biE*>oi{-E9+m3;hqZj3JZMspc8X7Sg=%Hj1x0jr4iAZIsaI zeh+~+MR*Irfjk-+6$hiX5c0Yr8I939*a%-jHM(pBACS#!o6_Hyl#;o!fOGvecQmv6 zjq?&f5Nb`0Pl0tc0NLzsuMBV?1PqS(w;!K`feD#vM)t(K?AXI2#-;qs&p%C>)C5Cv 
z*nHbN79}m)4%^QLM^{m=G+VeeMbjD1nwyU#(g#x+3Wr}91}+~`H9)E33>ocKA5yjn z2gc8&G@m1ci1J&P;CF|%zPv$~a_o8nzGlU_m27U%{++>mmoVGAOdM3GL)Sp3uM63pdmT*qH|DZB3+K7svRk*_(_kjNqK@aFIJk-0H(a;zfJ{EcSvzm|juhDsO!t6g zA@W^iNtjw1&w7l*E7O1w-gX^(quC)dosg?(QR*=*{R{>Mw<7HbW=Lb zbovt0@56PPVt)8L2;Vhq*9vDWsW%Sm$x!F3cN_g~L{JrgFbLfO5KDdeBipEfr%4w) zKm}rWAyDJ*Hu;9<-|gj>4gDZNNF@abmvc9Zb7#rr$yhUg); zumAbjOZwITsrclang4>lf|g1708*CmYWjuQq98{sLXAqu9sV9@o7i? zbeAu-WNnORE2`YCEec-Tu?eQM758=;EZBO_Y|N`+J={|8LB4EQ0<*pO9P$Ev0*`f5 zPqgmMZ~p2XS_YoW!EeW&bq%j2j4GvQt;h^v9+lV z%%EHBu3}Y@)e5X;ob(S2bH852pucw0O+zzHIfV|xhHJ1dekRXXeQ%*`J{Z8W#_&D% z#%RuvWYKrbRgx$}iZ!iTGD|gAR-Nb%#Ok~~JaZaSqbBN| zb6Q!t`RJ&ed+Au3-eD;eDgjs8q{lZtxh)Nc(z{fJ+rAUQj!Dv6tX=im(J4IW%gl0v z)8#1>qzpKL+>l}jaKy5|`%&Z%K$HW+e%Eq=d}jz$41~^|Kg7ETwGvi z48Ev}l#C>WM?V_W<#s@UD5YLk$A^#X$u#GAx3jZUmt!n>t63NaK;Z3K83Q(%w*sqz zb)~)*Iy{AjOA&j`#F+dn;IG(>4SU|jM)g8sd|j=xF zx_v(JU?eFJT_}?fUFXFCz5|&=`F8ZTrtn5OnG|tz&R1QAhrwEV(v9Ouct5KHgDEZt znc%F6L83fqFFOumetlZUo5x8T>*SWFkyFLn=Q>7l!1V2I`qtc~o)jq~w2&-bHC#L@ zM$2vV$O_BAF6(snk}TXHM3J|8djOjyFzNLnd@If`CSQ)Sb$2Z1wda~)C|lD^sIV=% zgzn}>fV5$`w#(J`IqM|`E@SpIVx@L@k#T_n2xoHt0@Nx|Nyeml2Sj565d(w^0c?o* z+g1K8?c-j)iIUQ71)!6#VYxTRW+eRF3{GkwArH8J#{_6ei*JOnFLTH~YDktW_!~(D zs41#*$HK55s7yX>!C`+6^|{6ED(1>-_u#;b{T@lu&_TWQ+xK!ItaJ3nQ)U)UZrpcZ z8m42kTM6%`)(4x@l6pnEbB*q?Crd;#cIG;WH3n1Xq^uK2eLMz01_OQtWNX4@mBL*w z7Wf?oA{zkQQyf#mC(>>{aDDB}?czIq4Rc9FOWpe+ zNw)&_STzY1sL0>Vt=p0BD#`JeZPJNXt`5z23&bHINAKCNiS0tEQQ~l6syL7aohVctj9Uw)+I@J?VpPMse7p zi-j_lafPs^JQY2(aYkJzyOKfCWo=Aunp$kNwZy%%wd5i0I_0K7YF*ebq@Wbn%I_L`Waq~y)8Bl8~w|9QcACQpR;hc6x?6_^kg4xt70*#e&tIR66u_r=#jYh6&mk3;px3DL_%#pHg+ecJVo zJ*8wp-%c-Ud>qZac?)WJ|XH5sFg!JqS3)Vo14w=9Q;lE zUpZn|8{I9*ym3FOKC6nD(244RwH_cBdt)6Ujj%A{*e0<-W@M+NMo(b|A3v?NDIa)g0z|qqO|oc|62Xg6GZ?0{W<^L$3otoSju+^a~;Yj~$rS zCpI*0p!YulJa%AHtKWi^U20uLl#6B`KbZRxGJVQ@&k!<{Z-}GXr1dyLZ=_9$n|$5c z_{zK=dap3?J3DUNo1tIbAP#kjjLy1ra{Ty^{5bR}<{TYG^r{#kVX5J}3Nuje`oyg( z_e-jJguD+PGO$ERcg5HZuUwQbb>#OBS2BFDyTHP3_p@-{bs=&1lY_?EOzUk=?Y7%x 
zNK8DHZ(GLSVqARIZzZ{lp+@f(S8WdKMr%tOotN2F4&hL`Oz9{ye>+fohp-{ z9?!mKI7E(=`hz=^wZyEsycv<-zcvlw;j@_)fvDb*->SbpDDh!m?NGsx*=`}oQNCK4 ztgeiRZW}%;gwJd3ORM($d&h?;n9wWjAWS+5iPk0tiM2^Fy68S^Y&z9Kon7a(kZCY{ z!1bIa`|UZT1NsD7K1e{cn`QM(aWVd$YSxgVRJwenbVECQZMF0+ z&w*cjfl}4BBV>A8><)ogE87}j+wwC+oyPcNR9dc-MjeLW#M9dcGI7?_SW<>&A|F_I zJ2P69Oqau*6vODvnm$1oZ~<|OysxI)GAWAB?a+nTMty%LXQgFSu&$OInlP?ow*tb_ z6>7QmTi3E~zkDeLa`nJuf)OBo&WuhAcXV&Om**r)H;i8|)=2xMAWpegj-&RJHXi4Y zHgEKoEx%_rXB*0jR$-Hds6i4Gvw~!-W?E)t^8isAOu1OHm1d5o0m&~Smmv7@y%G&) z%nvRi-m@+K_{nla)O?>5ga|WY1SG>U8*@}e_Xv>jBh|;CiJ&S~QAPtQXqXIYr%_)ehES^K9l4i_=}$wiVORH`2ijGL&hKo1XkL^us|- z{*?Hx7jJxvR&$LRqW`QhP1!m~2}V`ACLLNwIMtAu3RLkzzR0wV_7G{g3nTOhU#AOgXj z&VsLqU!Vlg$(%~K`s^S^`U7_4C7|5XcJupaCc6y3jENq+a{r+Ai3O)7(1qRY4N&*l z;Y45;Xk5fR9LO4EP}&fp-W1>5zAsWxJ3zj7x&7hD*sGMi6q$aRs0j`48{wbfCJ(s} zlP>gND88;z)i=qt9}ExW8KH^Fbg4{N4Sf*B&xoV!tKKn^;egFKTi@!IEW7hAsw)Dc zQ1g?GLir|K+isE5tUTpA`^t4@v20u*Mf3WC-b8XX^Xt0{h9p z3A|WL_Kf_|tu77TYw1j{6On{yRFh%px=#k=*-2d4>7{?Rs$_FNGjME;8Xch#-`dp2 zX7Sd#GzM|f5#Vfn02|F!>sMQwcBTKEBOOVE<5rQ{%Vg-xM6E+vIu!BPV&vVj+pZ90 z%P=J(Vm6kO!F;~eXe&vM`I$5(zKwb0Nu~5getsvxiNc{`(e$mR=lGGh13=+t{m55c z1s(iOp|`&C@+B{#DUsG?koRBuzy+!>*E6_=0e){ff%)Ts<(Ao-=aDNFbZhc)R!b?% z_T=0P!>#n+YqB$n{_vfnxbNiHNE9JwHT{iL>_YZl$1R+ysQcDg)0@H`J@10bg|0(f zY@;<(yz1GEGduxDLHAB#6c^`<)ii>kTi|`6Y(4AlzbsK+oDmbrA@@X4mRcrbJj&X} zbSrAZB`tV~KqNpXSb3z%ng*-8KxuA!C6Fl)JuCBjW|7QmbY9GR_42^M(AHxYfgN-j@FyW z6W(eXKkKmY@bfn`Lj7UpMZ!Mf1>@t))=DQGTcqXX*URI(l95CH=>=vfcE<9uzA5I{ zWjU94%1ycIi;|#cpqSr-8Mu2OJ4U4*xcf{Y{yEo?q?fQkw%{>U?gYG81K}%>^HuNY zj>KP7@4EWm)%ZtnfZ^VqV%SB58n*wYi<-L|p%J-CMpW&GS0|>vh4CC1R6bLSf0rHE z#o@9ZnFO~v=8>oK<7ezGq4k9gRLtr#9lH+%p>NUc*!P&{_j8_&ILKRXCq|kjzj(LM zCLN#t-2JPUNq_~mx&RMgk*E8T?1UsLv@ge0xl2p!zc|;rV6b6M-5s6ucz_E55kNV| zp&&uDJK^knzxSPGHu}5^TZX5gdn)8on<$)h%xH7NNR<{KR}Z_GCz)fXC3jht_J>(@&Cpn0ljJ=#TYET08A~aMdOkT+Po9{qe$%X-! 
zh?9e?PR8d)?8GiY*4rJ6aB_#W%CZoQ5J3V~vPv)-_7MVTzK*42K;2lRZ^h4Jq;Jii_)1xPD zrjFV2#comm z+(pSNilw(Qu~K@w_7>4QA$9YV28qZ=6$1?j9?D{jGLHdiPN5vM3futn*+KbCc_y4i z`~w%%+(kFv&)ov;su`?sl^dXW>hmN44H+xHtmbXRJ^6xzM_m=)qW9RTx2G9Q=2@xC zXqedhtkd=5Lcf!^bwK13!4#{q@arXBMXz7~qJd>BUjUraoP5ncZaYee3~~vZ*HI%T z(j}d+)(W?;3TIyc%j*22i=wjaaw<^Z4DMf0({-Ex-MWK^oqi|` zu&!pU&Sj8r^Yg#ZHmf4XVco~>zKn{RZnHMoh_P}+_8S4MOe~&FUo%@MN#;^*ho4qIKccHz@QgT;C3ntk+IEsvpa>(w-fnSA>sI-kG=uK;Y&nf34DQDo zb}^;fX=8PUw|N1j0;AN3H5t~rAImuFs}!~F!dc+poItbh?c!yc88o`xPVi;S&TiQ? zhS`KuVhgT-P3*ToXDlCG0dLPg&p>~ZlR_S}j-U2_SaXHg(ARM&eRun26HMCc_AL#^zsIm_Yq>Q>`f-jieQ7k}v3Dpk)n8h>_u14QCjrlcE9@k&0pO_bE~ z`#KSw(XvXj`7+v zWJ{9lm91gV;@Im{sAQCot?Vs(Q<9L7l{iFY&+PrX-d25Dzvp?LKYY&9_kGU$zOOxA z*M0N4+|ifyap|*b=`-8~B+W>Z!!VuTG1{~fQG~lDfndQBo{jpZwn*kN>=)P%D2jdX~ zom79MJrvfLe03&P+nX+0&vXHWQq=THo3bw3-SVc{J6!0!8IzGhJZERWcj($%1R>M& zVI}is$IXhxD#GR^K`YaH2@@}P1R-O91Eh6MLXn@lrE9Yk9A}JCj_Ixdrg7iCF_Xsu%c7tRI)~1LLjKKzWE49&r2gT;-CU6rBYkQ% z7em@Sl}pw*&NTE{L|zETzk#g8z^#N^EDubZ+iu*4=9P zHLB<4dlX;NL`%5)lpavh_+Iv8K`6Fuby4Gi5|tjOrVG*daoq2#wkfSqP~)xnswN*| zl|;SrT*aD4^UMp}wN*!1ZRVBWG0w=nDGYT~6fLP+enoLq37}_jzC@A}j1U1(VeZXu zmLp|S$4Bp!hULqbh7`7E)V1sjKc*9LnRs}$bR}K0?50NA>P((yzMbNGt$cddLuJR$ z&4xW**7d%P^?{jmtaTcPcL`d3` zL*fo1rB`Ac&I&unT|m@3Q*(i=K-570C}Pl=BG8$lNV=7OG??7l@CR^X4|7tJkjM>w zt@fubPP3=VEP*dN($K@7a7qkRO6CX*S9FK(_^7fC%%8tl<7hi|!(d3C2UQW;`sH%Q zw-4X0BvSAxv+3OD5`Si1_&{^fX4%YAML=uOiJhhOaBG;kW!caNzL>jBwc&hL(l(<) ztV6qFYRUpNIQW;c2MeY}?=mOEzmn7!l?10(eNO~+Oz*)B{4@cl5x!tfsig}%T!x<= zm1C|;SzUdBRiW~>)7_DCZty*cjAh-;LXkJVj;aCB&>4)l-*`yuqIeYtY^Oz>B<0@6pdWfdkQZr8l4FG!G#;sTNazt%q?4gW1Zga|Ab z*qM}@Hz@vPj6OdD%4+YDB^W4i-9!#XnEK|>Lh@I|f`+8;m^K|C?M}=u;$iM{FE=RP z`u5guzLKA_;$=mDjLOttNOC_E!p^ULth2TSxR+$d*p%Lv+%Iue5Al+Z(xbedrtv6# zoNLk}U9AwT`v8q!1PZIKpS0GT(9x-0c19<6jDI20HT~g}fZ%wwev8Nh7W$phT+Dal zL~XFJJ_$WX4ZOuo+{dSEI|RknxAXPXA?l_5iq#>gx|`YqIKm0ahi$o;iCOCf({}{o&#l85uc=n znorz7YOpz?g0eZf4wxd_aVhUOMFfOecs-ZRQ6f`d?ct!kmeyH|Ms1{Y)yFYbh-zwC 
zy5WL|x;fBz#O7LQdwOt`Hq`PK$h!GmY}_F$Aeay_qGwcG4@UoHd@g~r9!E1lzG1J> zJ(+_)#B2Tr4}SLZO|*Y}+9h`fGV80{Z1x&Z-tWq_8=5WZRjwMCb+t3mK~OM?j>}Ng zT7cq~Vop-R#RkGmyH<_IN=%me4w(`Jp59t4{s!VkM5C=^T!Tw844(0a*DKHyUsu&P z%1uv0(&dZPGEKUT6m#&&xd6iEL7c!KxyOZ13iB@Zy~?=0n8fWfG~Hf)(06MX>Cl3C z>M5JI0OcHBJm^07?ye=7dX%eg&4*a&gG~nUbBA{Y;XRo+!75$w!c8z;@9K@CEok93(4U_&#u7io4&R6 zV~fL-s7IZ=XB<+hswjP~EP727Ey-umk_RrcX8N{GOC3c-ybP@SRFMDdm@^oev8iY0 zz{vQ5L@$Mrhp!`}{!IC;N~=@0G}Mdut{{Kd3!w)-zEwTd$m6vjymXe?e$6H9EA#2g zwjYVJx*k7w&pIema&`E$(bRjVib4s(zK0flk59d8Ys!0jZ<3nRQT}DPtWd(3TsqJ7 zwMS96j&fBU!gR?#W|B2NYY@AO?3>**c}3w}f@a?HmreAHX}DZIPekeg^aYdS(mqkj z@6Q%+Rp?Wm5$@b*?Im5C z(Nj5v_6qI>*RPNBH#*K-Vy|r=Hn|x0(5SOCUA&g)V!ChOP}}p0YTzmv*t|Mk+!oce9%v%o7XR2KO*V-5zr8*QDY45D_$W&hV)ic{EJ0RKP#+Qy zu^;M1>3O%KB1+zSL3_0P9ByE$;4ur=jEfsgIGTtp#hf2Aqm=wqHqOWR`MDhJkK!h} zU7hvSL-*c$@Ko?-F_FtC(sQN8Z{mqO#ID*DCtQ( zOH<6K#Cq?SlAkTz^ zXJ6xD4ja;O8GSn88h>8mysoEPEZCZp-)?X~B6=N+yo=MkDa}8Edrbrbz?JsSSO)mR z%Vd#wB%z;`NrVzl5yjV;(B$RvM0p66OytVkdc(jQG`j4wc351`&`zDOvJEjLhIguQ=mz zjIX~oE$br)N~v1eBipoZ?|EebK06QqE<9nwA}?PZOpX1T%pQ{FDrYy_i8`TZ64sXw zweyR^e!WEX!_qq+nx{`Ir7yLWG1XJyyn1L}?bJ|2FzkIuNuoLR0Tx!;%8QGrES-Kq zLI#eZyY%6~wiruNLPEfrYw?wo$=O~g>J~Mjr(8GO=12{c+nC}5pk%6s5qEgy=;u$} zkJOv~KZvAU1iGC3<$HlX{!shy#j5k?-wxmslA7{4)bDUK^H>J~5aT}#Cp{6(X!b$N zxhsZ3Ay-L;7-d+O=@qob3~_#A_+?Dr>+%H2-K=sQrv^}@DsMxP% zdDPP9WQMcs!@Jf`qj}HcS|C-2w@<#WM$zi^_?8*^ENrDtyy+0!6{{OP5gShG&8@q1 z3Gn+wzKU3Rk6YdhMaV)u2tV?}2g|XSI%$s(v|MVkMoH8|>aKE2HVFcm@B+r2LQ850 z#Q*Gwe`(9t)HsH{!_9)tF7Noa(mut-^WNX7%O-aJzKYi|wy2ec4WFwlDh}Sv`Sd^)x1D~F0BTcD)G^%!g`9)}tCbR%9RxXn59X`w|cQrP$ z)Qmk*fzisa^G@$(^j~=0b<;hV*KNjK9s*ZTQk|HhIXxKT(R*uXh2}$be9ms9fBgEj zY^w9}!lhvH(ppRQp>SW#PtqK5uAeeglA|EN(i?v7kMGhRT%OS6edVkiX3s)T1340% zN$D%U*L$%0(lcx|2A#3XNE5Dl;hIC;cs(`G+9T0#5<_VM93#?33UlO`5DnKg7Tg+_ zo{7UsWvC>ICs9FJ_ZC-`)`Cu4r)rw&1iS&O?>chEm6n&D*=p#ClBKIRoDeJ? 
zkgkVz2hwh})(H8ocO4n(urLnQMz|75dEg8$$ICVSacF0ocF?c_lsEDe9x2Td;4Gf*79GI$>&D5=S z!FXpID`2)N^ZZ7%Zwh(eL;# z(NwaA%?I**1YOvBkCd3^FU zzm7EKq>fBy0&is>ZBaFp^rhdCn$skUCJXF3`H0+v_*_XEL7{T`d?rLu8khE0k2S9r zIcHpSQFp$W<;new;cF9d67}rIm=)7@Kzm*dmq&k6Ortz=@k0z2DT&|t$BfcPup}Q} ztXZSYC~+9IkdKr?`gWWFJ#8I4>m7tqx)9)rp?u(uEj zpnnPl5SxtH<~-gko<#e8qP6toot~>t_*nI?wCW?Mz>2`@AycDQ7Se81KWWXJIUIQr zGanHD)e^H%*O?#7c#6zAl$TqHImofvQfLd8nlUEwasr(b1E`U>VKU~2USx$^b6!{F1#|(cnFRz`lMr zO>ENKX;o^@EV7UlayZ_$7IO#eTir>aIunS)_zrVBWj``I7aeQVUl3B#KB)2@s!|^c z>O}zrakWMr6V^Hx{@~W)h|c7=j>ojDRF`JGqz`2i)2Ei>I}*(cG_o?Xu15OnoB8)# zZ#&p(FNIz-4Hx8lok)_BJhx^2i!OUo#D0sOOU^GO?~{PqHi_MNIw-T>wQ>4hCJqKL zBww-octQ{i^6YJFYz*jZCo@4Z6B{JFe}ztaXr91vE`Xpv!LRkTMJ~hH7ywfK&_H zm5g>EBTe7iZ!9lDruZWn!V@$a-*p~VEvw_noGU-5=%~QcnHV~jV)sV6`^eEMi-xco z)6di69cz~+ldfB9k*UYPIKX2rt55S$tx!!uDs^9nbX;o}2zkB&%0tFNzQh0q?t(_x z&(5wG^f@}NehpNJ`?_WxBKgj0c-%EsxSgFdK0z_Z?P1~=$MPPb+AzG8hT+YYuk{iF!bYlfki8li>qMMX=$0Ui%OyoQ zM)C+E;M5DF`h&SDY-fYsg*S9kcf>Ox$)W++qJ@#hPsZ1w0Ep7$^xA=~jSM4t5qq}& z!-$nfAW5dZdFcl(g!VBia}lndnI-S3kZ?BCF*4PX%;xJ2tF;)>z8ced6P(uQDGl3U2M%L zOw|u5e3W9A^_a9ZKICWzR|?H^CG&#*s!UxW@0U?lDJW$T zfJ*8?x$7~9I7fV-+{=^X{7nSC=9=GgzdD^RQM6*>G@SgdgDPWsvnP>huQ7S1sQ?QT zAF5A%IrTAfib0=!vXSILF!wn5r($g9x}2*OZ{=S-=aa8p?4-E3SDVd*o91*_8wM9a zDI(<^sM!@d8hN%V=8-0KFndR!yb3QygdOy*nO~nga&yOqem&z^Y7qg#N5Pj=p#sAG ze{5IGe^M9PQQ__2qrrg<{c}~_w*y=psP{HvZf3PbwS;P^1q<&iPxI{G1L(8MORaX@ zzPZmuFC4AtdUB&0`#k%@ICBv%$YJ&tkwB;K%ooZNkFKbHkkQb`FISfGuJl8PMvB9?+LFjlDdAy_(RY&IO=%AL&(X%+#GfN(Yo zW=5MG7HXIUv*I0u80ab{8rS5g@7sJMjT__WGeq82;x7v2ryvMFkC`y^lyJ9gF$ils z(=+9v0R5+Ej?Fdxn8dFew``fp}d(#XGJeA3TxrnmRn?p~&oQaN-vLlg6f3z9z7cg$m07T9@3~>(dKsy0n`^y1BKv0|+2*0w3KVW5r;snmt$o z13!{5_cQ1~udz?Qt@m51uzp9zdm{wXqr9`Znx66s4|B@xCSOXwHX zjJvP{t8Cx)sDw*JeD;3w)~U*Vm@?teohPXCs{uq~O~WeJG28bKiSxSq{UdL@e7q8J z=#h|(|DpB>6m{Fj6OTBIKQPv|vH0<1-IX;|K>9`oBP1f>qYdu=<832GaDNJ)N* z(*XhHWaD$$Qb#vx>X()R%y@97QK90)=G6OM0P-pg*mk5DBBSIfSLTWm+~Qk&38Ga_ zM5@A^I1|i2QqVf(c0>#6Txx4rrN==kHWZ-eqeUW 
zn*=5U3Wb+1*mt!&ny%;n_Rex)X}0WpYf+DEr}1;_NC+LF0h!O8F|#7O`)i4B!m`zk4`zjKKv}xY0V1-E7bHH#0i@R+Y2MKJpDk(by`C^GuTNbW>>A>d6E zBmNpHRVd=>*p^47W78TZJ)<%sb8=RbTY0Yv%79g zxNk1|JZbc=Tl+@7-f$qRGWVJNx_YH!2{8AYK>eq4<9CS=jj9J6;8coG^1m8J#K0x{0^b=i z$m(7Q^4woHJ62PO%+i#Zclr>U6nrTT$J9O(dJip#CQi@fsP7qATwm(z$?Bow^DVmG zrBzEEqmZY6)j&48KhH!PaG%WFPoI8H4foQ|ebg(Sxs8VUdTPmtuCQx`q=o(luh zByfO82omU%K7iVxaJevef^@{^{$D(SCr&f#6qCtBzDRt1?E)Jq3OKtV zG%wQ6?U>~GQF-??r*A6hugec0X)#bMAr-kX0gT3BZts&za)$0qM>3^RpXsu zhiq9_O-)Qf`<#E|;jTl8drHGUYHG8G*V6KSgkc-H&*WeZihB<%7ve*m)Op$TFra-9 z3j8k{GCM;#!Om~a&iTl`MS3$pU2B|H@xuYR0+ovkYNCtA2PoE{&H-U>rrdeY>{8zKcXP`b)mGcc0r1+Xw5L?xyJ9(v# zkfP9t!R9b^`anx}h+Lk**uuAm%cw6FCt#S+el|FM+1qUeDXMETK;Dq-Q^6hzv!g*( z@8h6u4i^y3Xu8!r`3GY^cf&KwOUnW!j!+#F#Hwm#O6Y$|*|PqX`($zumbNxIydg?t znlT~2)OqCoA!j(`V{o;Hm^6n)-74AFz2@Z~3n=ftzbXG%b7iRBgbiwHpmqwFtmlx(gp4*VSwS9Oextp^uru&f$ zCQecc2S?l$#W}oQW<{hD%B)&tNAvV$-yy)$zY40I5|p+j7FJvUay07z*;V^+@c;$* zg^Vn>!n&l+&@ZUGiH<(O$BRbwySov9q=b*w6CIZ&#n}}Hm9*}B*1qpLZ+mZ;AwNI3f2p9*^=f7Gbx5s= z)L4NR|Em!T;M0ZL*NotTf`z|7Cx!Id`am|a)A{`JATpjBRYe|?vCPTP^tVBg#e-i6 zOo@>+261@sXB-*hC^7g#91hcAo{IXuAn!y-0Yy%K?~kW``!vA<&$ysEwew&kKpW=O z+h&Hx_rg!i@wlxwH;edPaKhZv~Bq2A{oub~g|-g<2$UG42aab4pS4TR1l!H`hE&&Wlg^2TpK zJ39oW_36o@RRLz|_hRcORBTCQKGIwnHVtmS0_7FTEzH70gXhQ(OjrOlR-IQz!2E2p z@7GV72=#CiE*l}ocNj7IQeL3pKR&cum(;-l#l|jhE@;Zmmvh}x>$+}tg?Tcav>owl zB3o8N^>+WX%a6#$y@~60tAk$e^n6PJn_vMt+gF}$K+aJ5o5i1h3~`hWpY+nGdsKw; z`^yNZX8pLE?0Wry{p++a!G$>jZ9w!Vwd)|QHvkEi2f80_Ie(R3z6XZvzfai8xKR^Cs`J11- zkb?1^8fzk0x7EGc=t?LU*VOj{?c8+mPHHWLT921qu}$Ah+h+)gxnTxti?VKcNV#*C z>^o)^NxmJFLd?>d)E<|y%h|rQinnK$gJmzh!mz_dw;4! zCXP*Vbv96i zt0Xr_I!wK+HW7;mtnr%IzzG;>h@Dc`(O)k|{?Ok4d=dhu-0$f*CawX+6LHV04Fo@W z_cr?voHBKPr;8Ru z$KI$D-@qzHVmn<~>@UbN*^92Bv;436XVlK|TI5Y9+T= zVNwd?yqU^>vvM?mqqes7=%W@L0Os$1A~(zT9=L~BK}A4viUo+7vL6Lcxs}3?c9Hz! 
z^}^-IA#D_QIKnr~(Rafa+H&mBCG?$rgUB4TDJY(9Rwu?7H*qa_a~B4GHYv0S74LW& z-(lU?ob{5v@d(UpBTac$pZh=C&Hqpy;j0V?$%wUjX&Vbd^%Fmlo-`kdNX)IFbZ`KI zQEoV3ol`WLl^;*|i`nq0I{;8(b4wFs=+pj3kyB?pir}O=u_FYBK-&6uP{bm;_u1jv z`DVRs`h#Gk|&@-7sJ%s8p<&#;+>#S!W$(a(hqW=yNY*Y>u)UyzAeJ9y2JaM{?<2|C#k_u;g&r`qC1z=B)AdMmSr{E(m^ToR7Hh zXNURIde>J@i%@8%r(qN8*LT5Ly(>#|ZA{HB8^?&WTN{+NbBhRc^Q$5tB0M{V&vVg~ zvFdO4V<-@$IUqbIZiJ?I|E$crkb)W&6%2>d@53cR$_hKY6yQz55=flC`yXoms~&8_n~$M-Ai|voV9e`i5&U^8fAsO;&Bot@kd)QZC@XG z$6WZfESP7@4ReQEgmGiXziSi10*yOV-g&wDAeqi7M+i|D&P445hltE)mEQCjS$D77 zQnba+4aOE<1iDhvctqxi%GGP{hBJq>`^DE0m^3&9fzfETzpP9N^b2Q}I{;>SLPmjVhbIHt!yN-lVcCvPqG$FjFS=@P>)%${Y^8F>tO zq)fn8G6_fE;XTD4qFD#g$7fgA4D9=fv1R1`eo}IC*euF<$a5q2_ z;&Ni7?BDY&OYeu^{`Sc-6(px^k)tRV{_9vm1B!{#UN7!{6p7kGIIAee%h1CilP@Y% z{`+>QDfPEijX#36+Z5h8F91kxCdGPE|JdqhHP~vz%v^s90#cZcCOW)-X6Gq7ZHhbJdXlu*?}@V+=KwKTcz&VXyZbt@|*9G_wf9#Nu$&hM zL%16#w$Yq}2qX+%L4dyFSudH|*Tm&`aI+~)Uh?5T@zek^ZnF1gJaEs}I12y>^O^8b zM^I}PIfw0UFGmud{(UwSAYTD=))gUlc~82n~#UhFHe7N=l=Wq@wUQOH5eO+DCr{E2EyTt9Lcc$ zX7Rz|EysQDlBoS(Yo}oz|Q^#)V=j6?b;TOgae%_ogcXKnt0T2m{e;>&+#~@I^8aO|&BhHrU zhyMY<#u^#K080LMgvP%}zfb@;IMb2H*RNmSRVz4bg|s+JnL+7QWdoi6fy8vHLSp2H zUE`Fkj&t)bX~rD|;Q#LqZuo=Xe3r(37T$LW8c8@Bz4c4QgE1rBgy1?O>Cb^uj3dUK zreB!yuRI6NAgr-u|;tTaFxF zTf8R%8Lm5F-`~cN&|gfmeqh@#FK{IRIpBP0b}KqUD6Dj87N*CXTT{R@l_KeRUVRYe zUjl#x4)#038NMJzgmLk|?;(7P`%HnG$L!rSU`hE_fskq5vK@l_d}W}9MuFPTfyysy z#aR}CBj{w$3*NtfAAf2$@$7&gR*5Y;-;r0&-WiXLpj)Xu-yBG)$=@)|zyD7d%K5m> zvvA{>xFzK3dFne9ivZqt!oC%BLkB^vnK0G3{A4EoW3OQ~;m(RaTH2e^oR@H8zv20zmN@(5g!}+=k z_YeM*Hf@{8&twQ4dk>VjkU;@&^`0RN#+o8v(v4F+|6sm1Osg&};-{gm z)ItPu=Icl|5AZ4cYV*rG&*3bNXH5C$qwdNY8I?&lp8kVoy6J6;Y2Wb1X^<~WX&Ar% zRFFtPur}cgXa4c>9oUn=4!eE)3?v5*12+D#Qw(;i%T% ztjgALlbC$e##jQlQ6u1ypmVV)(eJ{wEuQ@JQIvrWF_E&@u3aN7y--UgofRjG;P$#t zQ{m3M+T!NNwd>&lpnl@jL5$+_4 z@g6gOL$v41ZKGW^IPa)r%vFP5;Cs5dMORG3=-QW$y{tApM)Q&8E%hKN)~zGCTAfCR 
zgPPKHO2*n&lDd|xGi__*#&-10JJ~)2i28MI&)lsB|xkN&G?z^buUf7F*NlWF{q^>dhwuQ0xlFwjT%?rsQgv;bxTo56#a?gnwu9cRa?5Y;;>J~1*V%jT_WrVV;Ll!Isd=9pU-V3?7?hbhR+Xbt_bUf$)k&-5V5|_E~&Pyt? zXzbe#xZKj!ICwXp)DWbnv0wbRmu!`4O9>Cl@yEeZfN||8#>uYzQQMgIB9c*<*rbEZm&t)&+b;4Z~ z?R#K#hHxW!I*mm1U+x`%-Hw6Mn0ln)%+AgA*Y>&CI8a4Igjp*G+7IKDuIS}2#=GLl zx_qy$VF3#79_;^R$2-ug@T!^gc;2*&a_m)U3HRY*LlDGEBOxJwmDCh?Ak$nn-3uf& zyQF)SM29*yM>5kh34g-c`0%K#EOl#mh)lfh?I1@u-_GOvI>8O5!v+~#6%W>XhErd+%DX$gPoie`) z5uMzF@|OOliLXqNCt2r~DaSpdX??^5dDYav z8FChe$;A5-Ub#&QEcJ7TyMQRnC9#yP|tsf7*>BT_eh&th5du3eJ0s2scY7p0Qhj;Yqaeo^>&`TTAn4To=8 z#4d*-TqdGPRwwEZtN6N6^Fow5IN!1VwIVr5O|1^EIiG9T)Q*%x(^T_2`B=zpaz z$y=!??u5DeZu}nG?&@U2mi=JqRd00cmv$5OfIS@K7MJ|>tymtq9o1ECQ#l*Rl8`5$ag>bi)JdS})tQbzTsXeL12hddy3rl&`z;Wgr>pqd zPkYZWLqyWt$Q@eGc-rNJ-Gq5?ciS%vgXxK6#5jMR?ifdW?W?01FW?HzH+Oddy~|S} zK=S^1t^eh?AzXEtt|wQ4TrR|L`kSl#9mN@+E*j4%-|l)>Y35_UebeydsnwVDfn8HM z-S{hgOWsweooGEE-z+Zt@TYyCBUO#Ze_i7CJJBFk0-?_c(aJE@SSe1)5Nt(*_J|(} zU+Z15oUWMUJ5%8Kn=a5PqPLVx9rHuF7S7QxkDUun6IP4GSHhiHUF(rE$Z9uL_~g(m z1p#3in6kphJnY-={^di125_S*mwVV9e@G7#pH6c@5aVuw?sH=uTM0t>L$)hW0fN%SV>2=;3uQezy@x zE){ToaV>smCSfiRqSx)`*FWIV^DhiC^$cYVDtHw|by<|wKM*#77p2NmL7aw$tDW~c z?YtuR-!J_zI1Qy>K$qe!q)@{B+C0od1o1=UEo_zBcLr}+mrwf-tbH~Js6d2TbiRN5 z?nKdeHgD{UU2yF-&%|Ec1&Cq5Cbm(i76Tq3!4r9RO? z<2*O#uM4CTLK6!uD%(vqvIkMv*oH(zX}fr{6dw^Oc}1PzgpHF4(o-8pPVl>KZ*Fc8 ztX5l)OkfU?NJ-{mfYNAQ*`6K^oi6RdA*|H-sc74(p5-rd{vN39t9iF6D;$BX8Yx@h zAL*o^)(L0FO}{pBu+L2&0FFK|kL<-Kd3zBe;b{HifA#PKo%y=6r1J|B#sqO~(0O!DbiRUTidmDZ5*SI|ElLPiyOMz@ zUZ#4s=QrKnBw>^guzv;aVPR?S?vA`m5+RhbFfef{Ot#mz_`&{r^d$#c3F(`iqaBL- zHlqhG&7|kQiLw|)0^uO9w8h(^>q{n-%^jsL;LPs>a07DJsYTO(zbS1w5x5PTduB<9 zOE_%d;dr&AaCh#ekr+Si5&N67TRef-2o<&wFvte3?e&!!%Kfl`@H}BS7-yQmp|M!qHfzsYnkh_YU*9~#cI=W)AGhCi(3 zEVvg!Q^{|)!;^U2N_6l4LLp%b90gH_dKuY@CV&5~ly=eYaHD-~Iur8|;i$_%D~_c& z$Zc~;Pf4o3le|ptc})K=uoy&4|HYxn%K~t zLo?EsTq$PnsY!D$VFilhrRJQ%#<77MFz>o$({F;kB{M5vT8CYItu&uSY5Qok%S%I! 
z!M~6$q;s%!E6$g|4gNi<(}g=>H>k?uYMXUg>SF!jPqCnWY&mt}Zo-kkyTwi|lcT?o zW$Udtd|)7l?>bNT6AbNjB-AW%p^3Q6n1d(N2E4)g)!Igpqft8@ysG0VDN~!Tl3&RwrTRJz#BxA7)fUE|E7U%!nKGr-=SiY1RHVRh=fug`0}fk;WkMVRG~JhKU^Jq zg&{*SKP7cX8zYygiJ8q_EbLIi5$2Qmb@sXYCOY*?LT&5t*7u<+hi&FDMPVuGaOab# z8Y>*5m27j|V+Z;+aFoD2;=^CHksluTJ|fkfOxir`=-JdpDduoGrR^(2M%($?92|8D zLsAYCQd1KSb4`V&VGaFLhmd_+(>16JSTx<_-pgsu=em134vEsxw{Ua!z1J|r8w@C7 zUz96|(xxpm+>3+v6}cN)jI#M}GKDvdRK!$APwJmmUK-m7dMi`Jo{i$+4d;9@d=f#> zjQe5el_hG9{i8zq@QEzFb*(@j!D-#Qkm{&&UXAH#ezlW0r|2Uf#amU~(3iza$rK-I zZHf29W+p{-nR5H{9;{)^0NjK6z6no#mWR9k>frH~y&zo0G$1J;<&b^-!3qaa25+mO9@6eQrDkx8)>qyGb)Y_3Y)?deh+KmwA#_}O z(?p*htdYCJK`X)F%T%5-6f9oPN{|Tsp?gxM7CI@TBXftA)+t!qsERo!B7z6p$~`G5 zdn#*#bTkrockXVmlR6Y9>~E>#ZxS8n*K=y_i85I}6N#ApNZ+`jk7PiA4sz^KOoNhV6m9QC8^Ya@`@+O_9WBuNbe} z5XJ7mE}|UZdur=i3GObOpm>Q@_YFCyPcB0D|8dk08qU^S8)DUz(RD3HV%(>~QRq%?%ZpWa% zd>h5v6Hjbi_sKMB6vdi%g1hMoMDGRu>VnqqYBDBd_?yJrK)p1N#b~ON@*@OO>}2=j zAH3e}R*ZKZtruUPO>ZSsY&$)+it&@Mz;-r!_o(s!ioQrs2=0q}j(UIuqG%T+J4=pK zfx;|07W)MK%O@tipHI-gYJQQ;B83(|z9C&xM)`}TAzI`Q$ZY(P zKwo%&0^xMaiqo@?Srea+99PYgyyllSMt;ZUMEao$yV@a&2C2a~lUrb#Yo8Gs1ILMh zna%^xx&9g*sUgu(i_gpb5FI^L$;)jxw9_$_)!Z9BBopDp-gBcS+O|uL*9vpUxr#jVK7qSpw&RIQ6~%FE?}A9FG~c#_pFNPC9BT1YD`P!Sv^x$ZC|N`wg|HJ}W^-3q z%Cn=oRxfHuhGNg%u@|h(Jzgmqx=1|LSB+;ekvu z>2yM}=3$hhZDGDLaodvETGU@lA*t6?3)xh@DQ|8S3SH%6n7@m zSc3h*%^Gd?Plo$w@M8G0+im<#v7OQDB075+D2Pr)YW(gkQ8;)hBp)vfn4QMNr|a-! zTBu7!ztp-MWq_S5$b?zbv{}Qki%fOM#Xrd;88x(mDsI2;AbFSV2*^x0xQ@ENDLquG zw<{eZf6`|LbGny;OI+lu5RCCW2y3qP9y2Q07 zmAarXj?YkU#y**ahhPxvsywczAe-p*po^?s3diFpN%z!ivgBlKnP%$u+8GM}3(teT zbLY;N(9lp-;(i}umGE@ChUZ@Db&_sS7C;e4#|z$w{2t|m(X=N>d;sMChCjVPzm&Ab zYGhTnI+s)|kdAa9zp^!`__3vB)veXlVx*mM@JwZ+hNMnWz|FH}^<8hlWhnxOBAs)X z*UjG_Br!@1cuX|8n?EP2%PrJm5Xa%9AC;fsnezvD75hg|m;97TA9y0HHIQ0uWW6uV zyjY}1{UjE)8n8kSbGULJom>>~2Q970@n=ba8akrPT`Pko&KU$EvgCY-Y;KWUFFB>GvyAICQY-#M;}8cMQIn?JKTK4=>ah{ov`i! 
zZ0Xz1(y!d*Wy>uOIKE}xFkDcxj=B3O>D4WH17+Q@SpX=nNMwrwDfiQzf;NF#Y?L;Z zw9*lv!@4ITs{E^~at60HEJj(YDb~`3^eQ8hkVF&hy3df5?V&B#-UDp<#-yCZ;r%lv zW?(qh^)35|AC;yCxsR87GwxT4%DDG!HNAV*D|782;CdZY?m+egpz1Sk#ot>gn=0#< z4iA);3_9v5JB~YXOiIrl+3 zL)eytDa~}9m>C(L;TgfKv#{?cCrN~+ASY2_dwJi1!S>21ME>hhXCd!jO}#eA$V_30 z8oP`f`|#{>pY?_9QP6G5=w`&K^!z-$iz0~2ya~1U)|o3%pKHhC+?&D5!q3&5JJJ;= zl6Uq3HJ3rt4)=A^qA%kJ7U$!!YnN)Oc}Hn8)+y}5!mb1}P2R-3zVyvq|5wms;emh; ziK{KVhkbSG`lr5lvN&$^{H85ls+`}nZUeypuy-@tMaN_IvJk6=Ia%*&1Y?_d;o-S{ z`V7f?rvBt9WWrM6vU04-HQJxj150^h}sK`{j>7u11%h1eH_{E@WC=HytQt0DFEHY+}Ov8s6ANOo$M_Ua=V z%oQ0w_MXcIl_4)C$*LV)D?w3RtsPg9hV%wgU*ROlguCSLfubu_vL-eTA5zLF#wGa8 z-)yPZB2oj+^9js~*s`Iu%-XZ%w|H3;PyDbS>4dMzzF*rh-UF8b%lydiA(X{j!knlD!4Nd;SnD3Oia#98G_sNL4*A6 zjcE+rj_Mi;h53W|N+Z)8@rPRQ7m!%H3mjM0KguUt%fEoFyflu~X_@$)8Vxxp$+n3+ z0M+L7qjpU{cE&!@Qz>pY`CV@mkb1`VH4%%I2hw0lu(hx=SUA^EkYr z1{$L*Sr+RS^S2iwMCX9L*XHx?Jf6y&Kp7-hl7GO>jbVEmdqkp!_CwN1Z{1C`ZKSyO z9f%Bz>0(O=V*bVlyD5hBHH-ZJ&;1M7OG`_W)IO2yMT1b7m~19t|3Qmbuyg@kA~V}G zwWgOteXFAH>mWwojaHtacl@+zxKYr{I6p~6!Zkbgx8gE%8P{K=5L(Vr_1RNb5ebdv^ z*%Q~@5whXJ0G4g${l+FqFoTkp**-llPZF3%0%)_@)M|~C=7t}yYb9HM-KaxL{tuj! z@G3T~w&q<*b{$4a=7gZj-w@xvjU(CCjQ@N<2SVzizKcuqI(pJ>**V5`&lVQISTK^7 zz36mA9Hnf_&-}_1_Mg!5H<99&Kexp)2qUq-c<6lQ2I!Hr=?Qo%=l+S^24bgzwq@rB z9XDS;1e!N=ciE`}X|3KI8?3C%l{)pKqp!$=Oa;PJ3E7;>e zFAHjau+LB|3eZ$k-faC-3zHSRy=9RAq3C$paRAtG;4$P~euZYKKUnhP2a&y&%fS9@ zUYR~7$KJhrcNGs0PkIvD{a2AQ#1!uwJz7 z|A8orQi71TtAaf?-(iK1U+sHWn`5uSx%6!~9-0wsNT?W|0eD5(Am^TruCA^x!^6Y1 zj8pl6h>aBm`$+ib7bA{r%e~~_5N*O0QVSuze$o2T{D#&n+4Y?7^Sqgj!bJmOysgw_h6G}pOT%B|cIwArJ>k>!UC`o*B-0^l!? 
z;YtNJMM8%c)IRAw{-X+DuvM?MHDBKH<{2z#<@iB&lJ~Q-vup@#Lmw zw_*)jpg^hZ$Hi^VaPPZ9-W;l}8Rji3(>qa5`<^}_?|z{pKFI;FqFoKPZa=)!uTqo> z?kh*exSR_Y;bD;T(eEogVM%$6X&u|Y&PSQxFfrdgXk@q8B36}`p4^r$wr57b$)3H_ zo{)T-av9=i9+Bca{d?muX|9ECd$iEZS&qja|zjD<~(%>}e?jFvc^z@uZc zv?;?<`3R*OG(jE;g3oR5<+l$kAPS~_VK@r2UlwkV_8J}e*;|ob-1|9JTlK@ImTgBV ztQ%0C)HDsy!7%T`0wGE(V|Lu3)xjT;uU%0rr;Xa|`TUvBatwZ?4+-eQ>h!bl#j!gm z4)|83_krC^vcGnb{l7AH%#W&Esw6EYCpln~?~r-~2xE*KCdZk&a26tj4qe}rF>Az0 z>I#5V%yf4CQ2#CI+#+xSgj89)w@9Jd?2BKVcR1hfO0?q4t)US0ZK^2Xq-mEC$p$+0 zyslt>4z~$6T0^sGV=uxwIL4r`(GtEFSX?Ydq;@T3?n}9|1C5w)zNBG@ zEKJZguQ-S1(4j-!ot?!=$(@M~C{dSf{GnecXooqCJn`07h1;<4;|DRjX0~)7Mq8vL zE!5Bj8|MMUeJq8}5B{pVw_uyKFIXzlj4H(_b?)3u>dKhA-?HUmFF|XZCH4Byc2)jk zE<#iKyICMX{z+w3PwVtK7(2`#PvVmd*MaFvs|AZuzRfW+KvM6aEHv@;`04Vq#dfZEY-_z&U}vva&J? zY^ZG}0XsmT`M&XNl;UB%!ukj)5&b|kQWCERG;+@*2;lsg^(giV6pJjR&*uGABXN>| z!6V*>a1~@*ymDmO*=}jx4%yx(UNM8*%&G4ybJ@yFUh|I$sFAwWy1+!SV<*cr70s2- znNANh-l*)p3a(R6>vqun+ZR%p^c6=CT4rV2l16kn0y$zgp3?ucZQXe zg}l=2(GkqnLLrjud#UVX&)#SgAzMP$vKtg*-%6n*3M0F+ zW)DM1-sj4UL4AL}_x-&6bsP7d^PFej&N-Ezis!O@Re#OhjcUsZy9Uil>%s34@HnA) zfFL^r{1e1KT^8g|+c+%w!L3X9P{Z5>%Z}hZiX447KpK358I0mcB&{}nbS$H`c66fa z`JHP?e#Bu)G|7@rJIWS$3=LKVYn{@S5)7-lDm%yv#LHyq*iyTlc9S?^E+Se94>{aW zNf7H|ui;DRRVvgHP6SWw56#g~Ta73ryvM==zRJ|=Pn{U>R&pIqxsG+6ypw)3p_i{P z+MIO&EQc+u0w=xc+`p!x(FQ*56J{B~`<8dAj!&NwedIS`_5R#K0ew+!g+M3=DBCiL zR1DvkA|z+gIAU|wm)Tq8SL@v@8x&$f+DLw5gD#tj=adR>M?lX0^+B)C*vUcJ%l&3jf zv;uBw{BMaVXu9*!{EMI{Js`OOcNf>AoU5xi2iPHyL4gzU)-t*^TWJQSU1Q||h zYcCrJk27dTD37!pn$J}c6)=$uiw_eDEWG5l5U3o8bo?^E^>Jo#@!qMn&lVR_liV8D zF&|7(_9l)!xBchnH`aoi(_wptJ!7xJzZgBsvs%C+mG`63kK?z<=yJla?zfBDo^<*hH13sI# zgvs@gbWUjkQ#5Z)xl9!7mk{KBS9W>V^s}q$=$&-F1jf^&Q}pV#>b&5(i-YU2MLQF5 zYX3Sj3sSX8X^MWrK=!~1Z?b=($rh=4FLHPV48VP#D*;VFk6sv=YMSo-!yK-MH5op8 zi5Gw$a%Uo|kyjsDdXWs(KKu42B5W%rg}T;(dy&qUhEygtHA(2TUc#%K`uXXqV*}B5 zrv0K8)gRqGtII>T*Vv~)#95HjQQvhP5%xipp0kyrU}aK!J;=<4Dqf_~mKoxq#;yNY zA!G{{img)vn*5XlW?0VkMALl^OttJCs~rSRNBNN{>k7S4`!Mk>##mdvmg8lo 
zCy2fctnIUK0vP`li(32++Na)h1Z=~<9BAR8*04=#L&%TBT*qEJmbM&MAp}?zNP|th zc>+yPZFKpr64h>gaVD55l zPVf20`ffpkA9Ux{zJGP}YJH-kn~S64zW<5$Qnyl0ftI5aGbPMdEPl*%cUHCeOB- zw9XN-SR3A(G{SseLLdVBm>@?eIINnVMiPj?7CziAMCkcf#vB#>U}$)n!?1cR2+Ry^KG;`fyn*LR9~Zf;_t=`Z*^ki1TO?e|XQhNIhUMMLiVT=M|R zvjoDMK~nd}_7Q0cd{*n)PJ)L`nX;dDogK7Go!I#OtwVC#NKxXucFC05-BW;kJvo4b z|7#1;v01?YpsSJXFS--rdVeVKqpRRrTJr{B#n*%eSJ6jmOG$(>GUdaMpf1+JN32<{ z=X!KwW7`m8^xLV-WSwOBPEF?@O(lt(k-#{@44}b}WWOU4Zp8z-2si~_SX!#4`&x86 zZdaK~y3FEo;V>(AN6AcSq`T}75iN*isqYsVKg$YwTY%r`^#w6~UtZi2yJ=5jj%1nt z%5qWdWK?V8DN4+065DsGsJ=wKe&F!h@t^=wTwC}6%?wU1J|TqImEZRQC6mL&HIKyM zj#i3C{H%8BQod>zpSa1bzH=?vG}L^$n|B2**^d>RMuF7FzwjMY-I%!p=hePuZm{dX z8#l+L_YIM%v7DSf@fXuGEZy{lC(|CaotW@OS|eHkq@j;Z=l`*$TP14{F{+~$#?UI3IwsEQs#T{F1;_AGR6o*P zb|T1|7$esxvGA%fehWT#N?0&=&Y(WIzw!vP7WeapOzBi4EL-f%{8PUC3n7`ZP{s;J z?sRZ}xL>I3sp(>@#?564!o#dSR$%T&=Cmxxg-rv&FCtsWKfrb#|LHQhk!e zeH+;{jbVh$6tgb;>(PrsJd#Ckxn$;_{(=ewDH`2;YS+mg%+3yngULT8_SQvbERH*kP){g1UlQ!pp=x_?$fcG&!Z zrY0J^%KJa#2quU|Ey@VRt%wnlOnY~8%58&i$96R#c=aXH8=rb`BatswG7;1$Xl1s7 zi}`6?noGFLuuUP_VrM16Mw$ZlExd9dm$f+YP1z{vD09p4Z<)P6tM8}rmyJ(RPacGc zj)AVJ%;bIl&xzjCs~s*;>XA_vu|eBTItHZcOt{!N_S+OjTks7K<~slm)FJ~S$!xQ&RpufxzTcGp)SPp7VSClmmbxYUlVVoy9NEB~;Co&An7OTn>Fu4bM~b=~ z!b@~^cYJ>*UAzWE>=B1{ZIDd_!O&#g9$FrJI`gyzA)i{pLLR!le>P<* z26-8SZa?i_pY70MBZ{=xEG!G93GEBx-+on+IS%;1{8>WAyWvlRkoL6@_;~%Fe#QF5 zZH7IXWSd?){%FHqXwW=ANKPO2h=A7)*sK&WM<6OodSXf5<-DYJb#tM%x%Ow~c_*e2 z`&MEO8qTiM^iN|U@$IDtJr=MM>x?q5scf{N`{MQXri}_ueZGBcnXK3&%eWmGUm7I7^`dx=TI{ zLvSC{AE0Pzv|WBk{^a{@UB%NnH$5`XY!&ch`{m+iV$%WT&!*b)=d=?Se1JgMXsoJJ ztwhX%FJa-2Xk0~-7?#$7S(ng1+h042x`{?s>yL7M=l@eIYOlVY@owkk;_RNAc8-p- zBTut}-;A&l2>l7Mw%Hec5R@f`@~FG@k30E4n+}}I4k9~~BkVoW*ZwQ4au_Fgp*D$LX#x?G&}8m4bKMy&9A(%(E5 za@1v1sA!AUB+BlW;s)J5g{#R71j~~^&$jPLHrwxEa&Q)YrhSz1WtPia$gI4FJ_e5B z`zkB3K@`TT$YBB(j|ch(mE@rd&A3=aX}0N(6i)Yk5*}%By^SS0SSSa;K{x(9p^A=< zBbzF|^|jxN`!%#Awr}?9ly<1ZO|m>?d~Mi8UlB?&dFCdvN^Own@^BEHcK%Ho-=%=qrZSH^loz4?XQf54af4AK1N%v 
zONoB-NL=2`GkQ~+9r^Zn1?=(Ai@&nXwjhlh2{gTLsmUiy>4AeLbYO{7oLQ(vZd!#3 zy;SI}t(w5e&wMOj-Qqvnow?MIc~P6- zH1ypsnN!fwy4dY>er=&EUkf{Erbo!;hFe`*|1}>p4FNin0tYCawL9eWxPMpnn*LQ! zKMF(9e0$5?CFs~2K2sZ_f0DcC_s{Z-PfjB)csD1+W}-&GFgvzip zc>l++W{WUC_ivN01+N7>$3UsoxC3^mJ?^bE#Y>|d1-&~+LDtNeW8b7xail;%m4*}X+4EXc z)Fs37A4h!MPUY_zE1WX#9s6Uhw*Zm{|6|LJ7w`|~m7Sg_IS+#ZLNCA>&~v&4-H=5d zt}8s@?mV@3r^YGz%$uW!Kia#v6n?rb=#>}DIrUrP@!Y9Yi^gI`8q*vHt)(QK-dQq} za9qZ-CYdMKEy)zAu){HB7j%ntOhS>8TRLz@#VA z)p2UJ^rTNm>!HwLq~d^{1^6b3VbUu8kCKEiY&-8J$y+5j@}$@3f%WV3qhF&nO@h%> zEmV9CPlVdK#O%~Zyi6O}9a&bGm7t?w1KSr+cQ?hNh`ZcXD~T zrSiF$ZGCDY6g&W1SkJKVQ%?2RAAT4Ax*o$d#iFG>w^;0%ftq(i!$%FGl3NK{%>!VC z!-IzoqFAiaa`KxV$|eg5-gsfU@6`u^TVtN|U-+K9FgkamHA?PYsHd4An_W@CYjfC+k_$X^ zxgNA2S$ZBovonKv{KP)N~%g9eFF$SmP{?M}^dm}YySGPD+z>?Aw0Ws~{w zU%67kS&p*WdP&lwi4!ihS23Dqp05zUfyMQO>}M*oDbb8M7k175?-pJ^$U79TX%lv@ ze(?d*X{Y5)EUK)~DwH&El8{A|cQ(?}Mh|Ve%nxN{1kG-|n8>@dq0{#C9SU>$gG-=) zL8EHZo5QLZ$t;BXypIqi`7;RCXN(rHt8L{hvL z@X7?#Ze*O|quN>KbZ66mQ_o`GTU0T)o?qGue2wX^YO6I>`Nf;V|g`CxDg!a9;ug|yXU|X>+%JQ)0n*#_xJPIi} z6rFiZ*2wW&Qz1J z(svxmx=H_B-!Yi2vOxSjv0mC2UUGsg=-@uHjFx0KigD!(#RqamMArJ0Ld zKi!ydOt%Kfk^s!bBdEfNIk%N^$>agNirQ%wvV;lBtA}}@(KmKIJ`VC#gWIE6mXGlmO_p z)|{}VHikbu`V;nhR?Gi=(a0(@A^F_E8W&BnZk8Gp;C1a|*%d2ABhjzMVSbk7WLg^g zZ{?9b#gg6^T=*)2OHX(k3L5K3_=*%cU|6?PoMLKP1r?06UX5#|Txften1t8e`qyYB zk0=!P((HjWckTCyACE98%gWSd8=t!}Q!n(2NQYVPT54ci6mg&&IKHd!zxay={W%vN zB}f$6oFUD$u-thx;{XR;+pP?hvk<3mld}F&IcUYuw;2`&v`E){^O^+xU;I)1y)Oz2 z3(FfC8iJR%e=kt31O>EN-*mH*m@!!~FnQMgM>Bc}V0?a&8Xw4MBPVs)0`rh&(H7U6 z0*ssO6Y50D9LrI$uqzZc3M?T%56tJl$F>hkzaU@h^&w`reJ$>v{kfTl&%z%6igd0T zq;Hxo*oy=c$g+SvfDzyD(UZq%8jFkgU+3qTNN3s!X#oCj zR=lLY!R$3L;8S@$x%baZ6jH=n#`Ashi|P0BWOitXi3l4=mAt-!fQ1aB)c4!6aIcrG zk3_|wxj_BoDF`f6o<(3oGPbSw$DRu;h(Kfyb8f z1qul#9KR^Z{*wZE+mJoLaG*+qGS6boC2!!@QcEu)^PedcMFXHcu(~QXAaCVih3Dvj z8WL>5q{6;@MT+E#%p1KsDf?p$^N86#B@3LGAkRoTb(9~f)!+cbv9!j;RoftEA5rB?lYV#u@{rl|e9ZAxDy^zyKz;2Tx~^K<($ef{QiG(z$k(0CYG?hb zR8}0I#gq<=y|8VR6gU#0K+TdRZi#X<%y4nqPog~^#RK>gD$0#z2C>THSj#Kza{=SS 
znC|IPTP&|?#2#gaMYbmBt@QI?EsF>QAp%Z(_=|_wcNY~9rK2?(w>H~s1KwDEFuy$m z>fk57OK#&_c|7Z~Nb_&Aus};0ckBB<;rSn+XwGz0QQ4#^s)kuBaO02bY7N8xYy!}aw}lNau)jmN z;!2kxJr!u2XEjWW{;tHwuuf!@``E#@2HG^~@|ByXf|LqjX8!+VJ^a=eNQSMvc>u!a*Q z;OU4QayAa>XnoLrp;NunSdjEb(z#ICi2pScM&f#-3n=V+Aq~Z&K5_5Scv4OK z*YOwVy}0{(J0Z6SKbCAK@0YJP*_o<|&rPX%v7KJ7yoD@!AE_6Ca0(50TE`6Cw=N-J z3G{`bexG6B+En4J%c2C=b1H^u>vUw>BzAQ zd#;3$IH|CqECr6v?Tm`3op8)p$BESW=Y0!>*9JdJ8lQKQ$5-8@%(;KiCcJ-_yW#E| z^x`*Okxwzuwrp@4c5k-0(c*fNHQ~5<=kUXvfePP{PjnKJzkOuCv3auo4Dd62dZX4h z$>KxI$*n9sars5_q9L2Db6Zk{TC?*kxJMkBnO!18Z6B-F52b#z7M+tMx%L757H+iB zEZ8T)LcA>DFHiPxG=JxH6v=mM$@8{VBqcGDzsfVl=DmvVO&Pf+iSO(Y=NE}g7T>k< zT8Xl=z>Gh&OjO{)xK)=35N39POH~>+T)js+5896&rcqTYNlkDKR8*?n^)COB5~d?;c%wI|6zbt|qbVEmQv|df*Qw?r>2<4)3x>qTf;-%}UatgBIfvJEdC97JRqq zvPA;bq~NZSwgeq@+<)C!4l%2Oh4hyl3->@0NG)|v?}d2&+lUc1Z(hKDw3BD-O8%%p zuF{%QxK6WjRuV9ocELOP)`{wDq~xDC7p0h(B3ElOce>RCBk;zufXk#WB`QL(pVegO zu6l2;b6TZH@!6vb7i2A;b{!W0Tf$9Sdxiih$!v=}fc}b*Osm1Gk(6X5dK<4QWnRKH zeQ>b=HPNJ8Te-)Hhd+dY6tacMCeDD~77N_*Uzf|3NdacRXc=!`#@HhC6&sr=#M4tX zFy@8=&MFn$X)!uDd!a~USCIqJtM4gfx&i1}=Zb#9yb6e=Wz)c|bA?~?bUq+A-xAq; z{Uss}Gb*V+a2`u&$O_fbsj%QV__rm=!jg)g@7O>(#L{33Pi3cJNorcxw=cW7MPz%$ z#{S&sjHG_sU3KF>MeY5tldqY%!u^epkQgKUkUP2^sS);VHROX=HEo zQhn5yi}c(!j~K&3j}&dgHz^}Z>ac~O+S)oD_p&|JEjZF|1-!IOLTi72)$8Nhu}$hf zm_ZzFs}iVCy`M`uRS6H(RSJBIP!mOO9#y}CbMUj!HlEf~9~f5P2YnE9Q#N@Y%`%*< zSl=^_ju*Ms?cWj;1oEcVaFB8)`U*LF(EBDnxrXckPD*#%dSlztmWJ_q+no{*Qp zMf&#z{^`>KZs@)K^gr*u$z&L9?D#Gp7sD*D{FCZQkt1!EuI%?2Z<>X93a>L%Q~XT8?0{_-l^ z3g5Z@FBkDFp>iYrXiJy0WFcAA_l=}vmC`I>+CQXnvbafGN1+<8k15n|e_CyylRI=) zGqj$ROBSXrxdIBBQ_9yGbh{E2oRzzVWsS6ixDx4*OOwJA+JQ_o>{RQKae)A`6>O89 zj-rVVU{~1mA{utO_2#UA-*&+hD=w;g#{P)0&U(OZ?I}73O_lD#x%Tq0rX0J} zv9<)qm#amWP3M-|IkvVy(pxcCVp0#e+RFRKTZ8YUP}N*Q5NJR#=zQBTBH(#w#9{iX zRg38;hEFXIFBzY|JW+t?anJ^&F8dfQNai3eqMF;nKb816c*DSHH&AuDZo5e#_`|Tb ze8C^vd$8#*BQL!i0c?LN+Hm>hiZ*NtiAuOQqdd_8K1Lh|Yk~phnuYfLhWkBBAO3lI zn7?NJ3D2Pqd?RAF)sB73C~=9iMuw;Fpr0fZ<*3Ce`nk 
zO8_a0qM|mC6@^Jr_2bt>a*@8K*F~{tai(rWE7cC{(w)D(Yk&hs9yD>dE!^2D+c zmt(3gAFNAJz;PDRMLrC<#Vh5QbJg8Z-T)Gz|&T`pf=cH=XX^;-Rh^m2Nb zYhv1;5++t^>JlcZ(Jj>aHZRoK{{#LB9k9qF-aFPVTl?tTa6Ov7enO}u2D$QhIT>5P zw`aDeSoJ%}YY0-V{~Ia$2}ntxxFnaxCOxy#oa#9viev<90Jm)~pPb~sHR}K=BB)r| zWz3`(fqEq_lJwp3{&GsdKqWCSDre}U&C6tB*}}ny!fud+^48_Ef>2G(sb`2h zLjaehJqfq1#?If((QdT)ZAYcrPTVT+pdSa;$|>WvY^}>@=D`8V%4WtgZj$nc8rj_m8AfF}76rb|dOJ8xcVbpw z`f76shAx{hO8P#{_GvliBXH+|FgJMb?EOzdLL+*= zrOEPF<`{X(v=RaxH9uB>Et`+MEU^X6LsPxUy<>NSl7zov;)JSZmT6;`CHqRpZ2dU| zRoXOh%TTa{rG=+bZpbG5OxwpnR02{LSv+Izv&BZRvWjhX@3>!YjKA44- z%`7x-;5a$uckhuQ-cb&LGE-HVJwgUC!tN|L)vy|v&A+-rSjZ?J#;&P;tnbjM&bKHu zH$icXqhhF5$q>n6J2*gjse0NiK{&V7Bc`vOb?j4a=Yxt^aoS(DyUR6a__vga!y!9@ z?d6bhnjx!kYalCQ3UPahn-S0$*g%&u92&HZ4i$Bl?3CstD z8h_Oj4^*V_>$_!mwk+}8;dJBrB&WIoyu=aP3NqH(%3$vKak;P)L#L_H7is?xr* zTN1S~MW=A)R%2tM7&fH+Fb12(RF`|4o1B(OY;6Q*`ABHSmBlIh&V%7K(8>Id48mIS zV~7^Iy+kY;|69C0CNhcf*FJoA!ZnrCe0~~24S@2<1Cw+|7=`|ht_n@d!iicvJ*CD$ zMoE(e3~}4V<;kAK)f`0L0! zI^uS=MZ0S=<LoXS`C+EA#T~9qxDGzoSHC?%TE3Kri zPN~k2FKMW0!=F6d=#VcWRH!uan^}WFN9PuEf+cJSJh92|H2w>ocK{FP^g%rW7XqR= zA&k;tWsUr}j?T|I;6U!lv^{Bn<8W}hJ*oS&V*(xDw5Vp{QcrhNcb@z$c%8C~DzLJ< z9^}_RDnD4$L@cOb3w*GKtZ;ir?EYYZ z8*ftXcKh|CGRva<_bnDmpIQys(71*J(pCZGiG1Q~(EDgXgREn<(rHtY>iHRFv!&qgYVsekdV2ws3eff{ayuFvFDIeYeG0T@pyd<)HoY`b3 zOr+`4^E06SBQeTRPj{-4_kFTxrxR=&b z@)=B#=1zMlNhoYEumxA1<%oSe(wJf3*!wX&+b2!JfFn(beFMbU`K{`#V=V`ydsJ`r zo6>?qP~yL!REIM%GTbIY?sydJm-eT2Zu(rN3l{T|C7F8azF@8%Dw?s33-pp_Jt}Yyz3i~qNk1(m z&`5emC~jtH$+(m)>SXj~B%Oa=)hm}YoxilEg{P7d#QFLu`-Yqjr6)nIlA>lgI@Vpu ztt=yAHt#Rdl_8UuEiW%W+}t55EbMfSmc0By#XFhAa<w;6T)y#g)cdxPVJSHo9(ix;EHm>#SF0RSzRP*N#msq$H4zuY|YA-9q%==-bQ2E2W0WfM8|dOep-lkx*X!1rgN$P^l@NrzVjq z-1X=TS9^|Sr)ho&_QY}B=s-s$@r+3M+KE67%!x}1ZtL2;%dg|LaZUMKew}{8LPnX% zz*ANrbp%hhmlcE8>^$ije?hb9Cv_JEK2TBj1umkyeLv@hb1Xb%d^0?e1KKxrOy;mB zecbBr|B$lt5O_iG zy6dik$o+F!`wxKfyTf%z7SvV*b&7h%^)Z~L#ay~#|5JzC^?{0!R+2w@E}WK7F~LN} z1`0jzpwiJyYiKZU&EvNYuz97=rEbq0R^Qq(Y=)UD^_TNU19mS0(bCvs<;MrRoK1%E 
z7l>A}Kf%5bi&GZuVmiH6|HFl_pb`1LhB_>+n9fgDGrDj-KG5IQTV!J=&CiSr=A#)p zX?yv&suQ2xD1Nb?T^cRHCwWepR?C%?fHR6P>=xxl*ohplf^+h&uY%bkYDH=1lN9i_+Mr-BVR zYgxw{Stz-@J8xEQt-73WZ_k9mUj6geN3HDaCTq-^vxauF+Z=+#)Bb}sUIvwZWLJ0z z5Tl?}#GL&ZaJJm$GJmhT-QoNsa+;lR`l<>VR&`_N-`T zY`^qF$ox36n3 zU3-HeU)%BH$H!;aKB+{>U53TQur(cEoie%qIPUu=tcKY*D*D(CFr2&4?vb43W!`rQ z{>)7zX;?k*F^RC+?hp@rhCDz>O4Tb9nj-2cFQfr69%2X&c5Zl?+SAG_==w+$*; zg*sw}12{~!@22-2T$#$YV`5;_*Xun@X=XBfd3)m3>*H$qL{e3f zbDbU{b=JOolwb8@m&jA=h)UiC3U>_#H}0*@>=UVn{)80wni_(kYy%e`ls%bZi)~Xo zZ*doIRjAm1#bn4JNYVSX_G9*o_6$E=aK#?5pN z)_{_eo2ySapl<&j4468zoz?1!|AB@cOh5yW0-17_b)BpoYS0+&LRb5@(CvU0F%U~oyheDvMcp$_>D=6-JBk*T2 z_sjOn111ZyH=h(Jlo6in@znpoyDj&s!l%LeH$)aKxRW)7SQRvcP}?{Eze?(~Yzh#I zp`X}5=50R*(MjcqRm$DqWy4_X8WU*6Fs7x@wS{%;>{KMq(Ji(>%m`zUf_3YA_4CZ9 zSik}XzYEM*yTVZqX@DY}90-OQcNsaQ`xp+T%`$HdJzCesG(IAecy@X+Hn8c7A)ow? zDTPPlV^sYV;y0|Y8ijG|bVD~Mfa*OITbK0oyS4?!#gOB9wnFU#nb@%!sb@bx!H%Dr z8Lv-xP;tC0m$f-cZ1&FOpF1#kuV!rp_Qx}A{ZNbg|3Y=?C@rqNyU5YWh7H3v>hC78 zzC3H>qePGmo$sXsKI@CA52sKuq?AJ5Je$)0MjiJ==Cr@s+OiWDd6+UJ0Jam>EFVU4N?f+~19H;YnM*WAD zmm5Cp-3Oldj`APZiX_UOY+#R#hzoR#Q|#YY)QdIw2*S(vUo3WE`rg1s1gATk15# zP)8U%Dr}u52XieEKlP)oGrgpN$GM&|I}AP1G%L^Z%*zQdWmZ*)VA-ontsDVsTKQK-nTdt=bBR+AeC6SAUhARlaew=rkIm9 z3lFB>-h67bWH#2wQ4E8h3xbwL#Y-}YvpEf|PH=sp?MRwTqNc`3M!q}Cje31IQ_P%< z6gKPPw>>#nCjG45U+E>8*vThz4SBT`DH_=Kd^^3%e{BgEq8BymmDjF!I@ zB82*?Ma!Oeg^Hc96c=uhKNE4r!S4yAnrKoG6}m3}+#zE9g&w@BG?sIDD--+6(hg6d z9yg4Ha8Uw~ViD1jNd4F^Lg^lmT;I=M81l*x-=C}Q(jFt(70N<6%*o0XmHng>IvOI? 
zlIzWN&L2j~SJR++BSA}4SmdU?#p5(ob2EqN?tP`k;v&ZfE#{rdyCKwh7GN!f!5Bb- zsF@;0JE)N@jWz7PU-G>f<5;EAXPk1L+bAFMI`rICPO|+vJ=B~nc9PQlK9+qbJTXw{ zjlCf2*!u(0qQlLu^i7*)khmd@(QgX&F=&YG_vx%W`&A=-J+Pf9ItJM8!mS;|4CZfO zyI&Z?|8mhNyTEo<9*kKF8oDOv)&kopo1`J3UYx{vTJOk!F7E2 z6}W?W<7e!h6hbcmU-F~hG3s|GPDbvNc*CKf$`$NO?cOmGsOURBIF)ojI%2TaPN|tm zKLST{(FM~!P^%$!$ft7z@6=++Y+-E7Zp^o{e)nx8IZ#g>&qJeXj1hYd@@6q#Wt?sB z)^d=Y$LB3CkR1$hZHi*ciBm^d4M3~czsONL3gPQ|`6x!9eD=9RcL*-_|25@d=osv8 zSE-R&T+=hh<1?d^j$ba{-;-(D_;SBH%gOUcu!=`)a}@i}#nvV6QEF!2+A)M{%4TTX z2lea{Z3^&Gx-=;+nasU9>7&~_@TDce<(@}QNn6rWYsc*}W8o%|O-A}3M6kHwIlKqX zZFp{ylFp$>vA;&NN`IKgHv{KtKvlM?Vbwz&)bxL1_(Snay&&KRNlNJoqt>1 zu5{|xHd2>Lm;!#Tx0UF!1+OA~+XH?R-1#=UW;1SDzwtmtIIua7<7fj~R4|{}Duv{OGL}=T!rYI)19G9!T0NLm6Pw%8m zFv&%YqlUuia#CMG$yz^F?62HWofm&l`t#Jr@O4g~LZEmsp&Zo!ev>xMZDQ0fg7{7P zdU^j<`X*^=M9q_HFg)cl5s2|9I44Ox9i&3XtKgJL!`)X!qkXQ(nL2StE?3iO9C{7t~a^{2>d`G;$iN&BevpgPguvE9+3WhFs(JJ zNR+)=6jYI63LJnUM6bgnW#^X4MdWqT5J_V+fJ} z9&9@#zwzFoXvi&m0GGhgC|O1;KPb`+Q+V}*+HhtIo99P*Kes)o+k0dZM?mL3*1$|g zt~*(LIcb zuvM9@GM@R%NG=0!8=J_i#6_DV4`+eWZX;Y%3X3k?^9dxHIm%X&aZq4A!H4VscdEMG z^YQA{lB_!w!Za<`orne2hBWQZ?HIgpbNJJ^@6w$*iJUa0c#Q$9_F6`3MY$tb@gv=@ zHx|p(W(x80UQAX=c4Loon%8;XJ?1A)d;UnQPmRXj$8knU*x%l`nKDQM473N>SNikz zL~h@B=DE>Kx(24_yp^v^tRifSIt;#4Oqf$>+*FN5{`;%PWsV4pcmrxROy(ao@7T1R z9yDrYkmA{3nb`e7iX4(%v~-PJtOCq8JJ4Y#EZPbT==+4eoOi)sajh1+nHL;^<~MKN zm@&&a;oc~JxFj>DaKtuTvA@A-YFG}uy8gDkGum&{mAHacDa7?!W`h*_i!=tbhh0uk zy6bwYtKE3blp)X?GarJTEQ@Gm8W-aFqtUUD6qMyk6-+$D>ztc63a_8AtJ`Z^>3{vpY$ULK|?^~RckiryD<*QXHN;Co1W^r#lc z^<`b8D`Uq%M+en+gvJnVwVNO1&k`eeq_Oactd| zKt(ZQbLM$t89Aca2Go{ z@o+llR=IVcBIA)*6$h@LPF16Al_}WTh@tW#b&fSZ`mcZap{~xvMShTV*G})U+3VM@ zYyGt8^I34QK86b$xEyM0p`2eZ(6x_G$SAW1VU0plN1okvDz0X?yL+2P})p)#_ez8CEkH zQGe4bhdUJ?7pKKD_rfSNA`vV4>^48*n{jY_fM&!1iLSdciHEVNvs`r3kZ2qLja;;Y zaw7Fx53UjUF0>>O+^SLgx;cH5My+#DGGTJ(kP-cY29{)0}AkzC>@{si)!L z;a{@M!e)nayU^Es1-bLyMTux;v4`YW=N3=oxV6~h(&`8BzCYSt_GzLCT@NLfpvmoT zEm)+!*c@tBT7>Fz;h6_pF%H!t-J943@if 
zo8HAuEjSR`^;n%fy7tOCkdlzIe&%ZXh}}1rePflR?h}Tt52)M|aEw_}zL2A%EyInU)XPB)Lwk+fb2~EE!E+RX$`8Tt$BF++)YO57a+^=p{G50 z_Ei7dukIJPPC19q&e|zUn_xq!J2y*EM&VuYZ9TstaNoaSaq}&Z0?|mWKVk1U%#&H2 z&n?k~cu-n(8d|8Yk2TGC#c7oS6}KqKh~%i7;Vl0DX#)k@C)0N0Jg{1_qOyO^ba%O!#_P)T184R+boot%K#b+(k+T=qH*v#lFL%GH?Pq--!JbFW z7R7T%W9#-~anXwJ@80X3Pq>#aAYg(C=$Tba2@K^5l2KJoR#vsOx$3jlG0R5mhLw2+ z*`RKh=N$Yt?%T^;xZ3*Iu2GkU#I=B_B3|cyqT|ZlEJmJe3@z zs$b&r^tP0rLI-iEw%K2~@_NoYyzty^#t0z}X@57(t70y~t!D1rK{B~th3RrnN&6d2 zm|dCMkx*AjdzR^5ydj2>VNm_3!Yau(=2M#n2jKt z*^$8Yn#cLxMR~QoX03780~!x_8#YBU$fe=B#dP-8zj|r`r3IMLW%&K7D#h zFj?^#Z<(^Kjd)2G?AWCAN;8;8*E5Fzv7a5cw=gXjgA0cDez=Cd7>-vrDfa6< zmXMYPMdo3^jZ0i|SJQAvoBr_d+?W?vT}x73V}Ohg^Z2OoR&Xb8v)t6p-%D+{UQTy= z`H%QQpRt^rd<2R&nKS;q>^%R=ox=3}t)@wZ&1lDybFjuFGWo(_F#sC$ay6y0N?7G-%eh$}7ppH=;T#Fh5eGVX@7xojrDPE*{&K z6d3wRBVR_qt^c|RUn zP1z5*YU$_gNGN&%b+!qb+xhH{Tv5TZZK+sSW2hN8${C;AmHVh?aB#32>Z;UQuN`nr zTNu7XS~0v2Qrugs5qa@yu}orG{`+QGrp!Gv29?lkQ>&4`w}Y(ztcgjVTJqga;lrvODGV{^=T zJu{xWnJedW%1pPrb;y0(%s$KA^QY3Z+Xl*>#Alu6bdYNu=rzR5>nlaLjJZ2Pd;-pO z{f6rbO%fORx<%uUv7ck)Rc6d|K#E;W#`K?}lM~_sHHu3zni%r!M}B*y$kbxkp9dqc zreLu4wr$(oPKbz@LFq<0m811WC) zc&pz}$`>oF6%Sl*iC;ew%CmnVhX^%QQ(AI?t0`UTRqriyhs$N2u{nBrIO|8h&{E^} z>C|tkDn%mWq*{l(s&Xh=Y!Mx(q#DRosP4_g%^wpK%yoba{#o^~>#f&6BYBwP=EY5! 
zNy09g62lG2yxOsEAml6z_+855?1P($Yzlvdb+rOPk&~0#gEQpQQoyuG&kr?uR(zNC z*SaX7J3f1IFuioM&}hjzZ^eEcZz=t=0%4xbsVXNYEW9yPjiR&+IPe+RejrHkrNK-sxntHh_?% ztz5zEqj|FdGNv?8%elv?XKfoA$DLGfJRh<_kt=xbcJf#Jf8(>c(mNmncovH@ z9}3^y4!uxC0o(3Hn{gye?}5a+PL=)qEGm00n|&g0Z!_Dk4XCCnX%iQ5aNY7XFj@J{ z4!utWwpKsJZ0y&)v9Ft1Ft^$-`+)Sfh*yGy1oGe8_m`qyBKd&%?n+A4`AL)6!%&AFlw4q<3$>n#!FP{ zi)J5GxEW&7td#wd>ig4rCoeyou6(+wPWL5FwAsEzCtG-|V*kh`_8=|(Qh9;d;l8NT zA?ABL#iWxxvlKOEJJ~rk=Sk{SW`Rx3X%&9RIiK!Nlm#bXhqicLRH9o5G@9#y8zdu?APC?;c#^Dv$^~&v7i{<+Fh7OB`T~u6iSpdw zMBl`SQD4$+PI7;SjuHCwy+fL>YPL+PoT#FX$rtphcmQIe`Q}$xwR5qd==h%ATF4)r z9ogdN`#R%7u9nUoa&eWLW2WF{)b+Sz{*GwfLUr`#VAtt32s@lUAr*;2;VPpqNxKJM z^B<XSghjr>TJ_St9X=7tYWS~XdKtMFGeM%!#r-+F;yv#>c2O^ehq18 zjmc?sxUFQFteCj_NnB!c#SCS&Ee_(Wgoc6OoQ~(oebIq4)O3~%^2Y>d>@zMOrzwTB zTSn4oI402QYq*8ehvGP_9lFQ@iDD1i=Qp?RMCv6MXvrC3>y$#{0szIc!xIxW*D2zm zgjp3Xm5oXfys zA{&EsJ@sd`NoR_fQa6p6gQ%f}xj4O?Gdh`SRl+gP!w0K>YZFBWmA#cFQh8xvH2| z?9rU}lsA*Y@FS?Tpe%D$hmqe0S43@^=k`iE=34wg#p|KiE_GxsE318or7X66_g&Xu z`PyDMl3ctFr}d&X9VAr($>-Hte;UE`X*aCayM;{*v|xvq^TuES!Po)b9wkoYs&Q|= zum6a>GN4ZVj$zbMg6=+87`Afx>p)>x;`~n!I0*;@xE$$ z|QB@c1X7RaSzOrN^j+TGNq#$2&u!Lx|uE^Jf|2 zxGGtJgQ|Q`8{F(rC;VeY@#Nu8?y4T{u!a@pY zIdh}E%B{|kf&%t$olHH8&RwJO(E`--Ud0C`m=#IHXF~_6-1ttU+)+QFQup-ux#?;d zQhpkN74;#U|JhH-oxB_OSSCW6MEOr03L&=v-O24bNPZ=VXHC%NcbZ#&k&4=3Qs@VY zq;B4rgG@kJ4RW(&ID%`)XkP;&Ne35;J^hH5ZJFf&<_xvTS+)~UL~Gix(58qv7le&I{F zD4n$3qLS7Z1r;gNnx&|~=S|D^(aD#7oryr6{yImebkoPW;VG~4s z40C{Ti&>C@*&n5zcTLZIIw!W}r2~hri`@NFf&S?H$(89m-}jq6D^`N*=;U)H>Gntg z1nxJBj}@V1uK4|rPjdVZj5_Cwq0YU(nc>;Ps&O1XfmE*tHrPQTB&sy}2a6I|@LbK0rU72_G*?{mZ2&>rl*vV^dO6umWBm-5U;0{O?8CEZjxd+7Fp4 zggOS0OT~aTs>>OWgxWbM^%Q%ZyI#F$oS7Bwyd6 z;lY_u4a~+142_aA*Phg~I)PyP0R}&i=9)6ydlE$Z{_^bA7tJL8$pKLt-Rg6~A&osivzo*W=n1ixC+_n||5BPu{4*m^$ z;44G0-GMLcr6Af_(zR}$2x3VMhp7g6+7di!qpWpI8Z9rEQJvT_;H${nN>PZKiyX0>4VvOqArFQLg1L23Y zQb$QB%(;-}YgoINA(GcmG=|OMiv+=2br#LT`LiIp+VqshNq62&_eoW;+UkaTJ8vS= zE#cC5?Um;>h9oJ_ebOqVWB^zU$Lwd^D*c?sZ34xoau81`xYwMvw^h#)^_o;c@wG8aC|*q 
zSPiFL6c1L6nDQj>=X(r!m~I!9!fy}*UvytLidrX5-}f&b9clR(q~sus#u$<;viwrHt9Fg_)O63>-Z(yf6fj>A2>RH%tNG zVxH`vUA9V5B=8y%ck5O&FRgpkH8m{1-|buSZYNCdtwjp*lkdt8?F~4KB2h9@Abhx& zzLHr2uB0$T^Vd4wXC+vWAqWUew_X=k{66J=92_B7c0a(yMqPhg0!XZz4L8fw2HUtU ztI+yj7r*?yistP@;6hQFaO5NY1B&PXQ)SBLu$Eg3P=T>1iO+%0rS9JFhkT(@iCSe zC}~e&YoZOdEG!{gCP0L5F@nl`Ez?16>)^y&tKt;Aje${c7z+l9rh5$}oU4O<&Fe5S z0Y`gi`nut5!%Po@dxZoa98SCA*^9xR8%i)7rpEC6J-m|^@}IEB{@jbX!RpZUWp^O1alfkn5#l%6lr6e0>EzGygA%x8^YuyHNUg7 zHa%OdwhqYerMxjZA;wG)j?*_&^3gS;lf>$NJDJ40Be}Oajg9upcY0f5`fS5Bpp6EY ze<$uRZ;CHV?k}gQ{U#|@o<*4pLeUMf5pffkmYWL0uN69_g=(eVHpb>fpLYHi!+(Da zu zvvjfeT#+i(lqPS&AQJotBiR#`?u(zBKw+!82Wye`m!HQGXDH~czUq#7$h9PO!4DkD zDTZLz-_C=A7TA%PIQIes;-Fj>vE1)O7oRRnp}@2c>Gzu>e|&vgq|(Qxm7jK8MEeQR zcRBhE4`F)d;8hYcO{SF#`N800c+a@up%LsXmeQFDcDFGaX!ajaJ6Daudqh-t=)SjU z!AxmlT7b!)rtN49CLTTAW$9WHL-vW}&rnyQ_6EU=Hpq)F5nlFK1(_zDlceabL$hX@ zT+mK$(f$@K40)+)<_Gt^4-?~rM-3){29#v$b32kVBxOo64Q!Q->Z+%_VlGhqEQRXV z=i1z}8c>)M(IjOk8J!$0cFaHIFab<`b|BZZ4|vx9j2?=1Z=MPedpx?ue$yiW9Qk*C z;;Rz_Yb5Z=R#&Hhh#67P1!k`v)EXge123dVZTuP2@m)cd?Jvvgig^QL_QHC(TQX|y z-sQHb_UO~~Wk(}Mr1>t!n_iO^7g*dro?Z`Cs>&4Dy@ETE`7SOqM<70n&%A4_xZdpO?Ci47Fn5mMcjsuMy)VZPSNe)}#(x%Q7x)c|U=p7rlf|3~KW&rknL@bTr# zZh>zwq<$AX-B`DT`-xRQh!MKPIXB_pknf-T`I+N=>i?L#70Uc8J(5Xmu(8>Y{;lko zQC7U?2p}!ejYUfpnpyl~LV|)ZhP^o;`RFak^qqM5zS#sinN?V9Ll#mJt!F^k2-y zHZGXsnZ|`5_c^nM-XH=ZH!rQPExf8^ylA2Ow*R z*2sU~`1If91h)4xoB^9~;G-dMFEYTy&`G`s$M;tbBW>++De5*hvQt4^QEN*K^1T>~ zk}{d>(WUE?YIXnFenGaJc4T@2Lys%wI^bJ@Qb~vcj}Py;PHfb0>vD$9RoV1;WgWe9 z8om5OT9GG#DG``CAbgnyXKoru*e;c*`E ziJa}ZZAZ*kw`1m}nQuW#n&l?^07A57bI`J!yd1D0w+2_z2p=KvviCKA0oT=y@a!7V%aV?BM-mv38o;2PD%ng|l%U;~Y)mYh{%c}yeQx4cF|``F$F9;4K)j!=SXi{K zXYfGEE?UhEk7I-M3VgDxkedC1J~S&`z}UHz_UcRB;`Unb6+pnGXONoq+=dOYjJwPuw8$%%1n^27->^5Hv;SKH4A1!E~9# zG7!b8(dgkVX*-b=h@$8NVRkn!Poc*LJ%=loI(|M!ZZBpAvWU>coQw&$wsQGA;{>P6WvimzL(}RbU9pG!Sp)p7YL(;|L>Jgf#mJtw`1f^6!?K`6am(38JO% zG}@uPx=nfM2m{_>=H9aV-*{i|lerxC8?6+-^YSb`Nr#+hxj%u8un*!~11AE@cyBc0 
zZ<$M5zlgwW0J&FUptW@)1GPy8NJjhCI)&7FWGT&JGZ6kSz|ku^-5TJBj~)Op97LWkgs`j;T~aq?sgkYxP7Y@YhZ9;f8i26F~6saq!-K2}Q}Y(n(I@*}%4 zMEQ0*`sj(uU>50y1SUzp)L`>qrL4(i;~+4#ou|$R9f6X(2-hjZ1s1^&lF9pr6}c-(PZ_v??}OV2uXW?H3+t?Q3F@hwD)ys=>2n#P7Y z>Ek)m$zRs6wddX9$1H(+VCEpmCw*F1VW2eduSiFIv1wY~#bp$4c!QgQ*~4E^MNKLi zMk_-O$RIGMB#PX2*|p3a7B76tD_yi5_>E>J!>rP>>yU1rd8=OW>?nso`k2G`#4AGs z8024w-^GldFN;HY&Tl%nfN;)$koEMbo9!Q20W+zQf@Rc;Mb7;jFV?Sg{MBnr39(%+Dq4~*4GHu4z{HO!_o3`r71^mow5 zJ>T?yULd1z64{-({l`SHb?P2Fy&NTZ)BMDQgjX_a4Xy zlAqC6mR$M$3bhk&HV8WH@$|uJkQs&^;;s_QQP%i7xSLzr_^rinN#u5Gaa?OCDdrj& zjLHp@@=!Wi>)kRTfvb-s+X=-=}YCn9wR?9vBQNoDjQ=q z)hY`oR?hT*_)P*>lsI-YfJG+@_~-?t%PzFOH@mJdL>Hs_0FO(pf5&BL@ty_qwZngZ zx#CQmM$(*fV^RLZep72NZB!p4sDRlZU_GQYa|xAa3!+1yrUo@}xbT5*X(h?y!xj`fDP8$JyLk3b`%xU3^{nb_IlxGOFK8A7|K$>fx;qFqvR^WF!3= z#dydHb8gxVuGMh?Kg;cz{@pGKmio_cxM;QHd^EEHcFIM#e(v0btS!8Ea^3aPgzFm zod1o6TTxc}!!pF`=FvchzgG&7i-Y5Q=E8pgS5r7BIK}{&!lLpZ9VHD-1bnyBT#&-(mU7qifQ4VIFQ~A@N@05|WuuR^BuZA#EvP zo&fBz+he1+-yNAj&8F&3AKSfO0a+Mpq~5gtt>V!Rdg5`lH~%yD>$)V_0y{6UTNF7J z9P8@(L!7T3Ue`QBo)gT?b}^?A;eGwwb-B}8&F)&V3RD6wt@uq8T* zeUYye_q7YV`@6oYG&Vgu>pb%<=c|L!%DdnNjtCfK9i!H)$B4{(=3UJ|&NxHi z`}e7T+z^&1e-k|Xxcw0`dtQ7)XtJO3h*u&;61gNO>@Aap6(SFwWt6&XAc zC%zlrvqNFI!uY8l>t(uI-^iKq%39f^vIdiqawC5%7+t$oowB!3!Q6k~FySeiZ?$g#1T#6dgjKjW!C)`Jxfx&BzOoSp(8 zG5ST>7%Db1PX)70cI_UDhc12I)eUw?lV9d8q4=V`JJ)orfF*txdIe^P z`M9*SWFM#MhjvpQTpplS=FJOy1SKi0zGC&s+^to~svDiETHrj+k4OVf-(~@^eZ;2VQzwt#@7!{r-d-zP0 zvcj39-|w%PXOyxxUbbR3(eJ~Z7tv%A8=6~?geueIe&Filw3=P$o5XM0xcK|`?;wcf zGA~D-FfpGa29iZn>{DOwi&l{!_yWX;qoBbfQQeFSUdAjXRM$pa`)znTc zbsn2e9kfK*)LOXFO0>REAx`{40`v7C+F|cU-rn9W_V!Daj=yMP;M@DsRBNU-$4T}` zc7-dWMDzd;$d~zJk3CJKqi_%Op=itRf!Qcg{(S1B@)5#f=v3sS_~&7`uf919S%HhMZ6-KRR9OWFAiHoc zaj>V_BLCF8I54xI0A-CVT-zF%6Uk}ny!)zSVar4x7}^d^msbiaHsuiEFyi?XHEicl`=$A?-%d{c9_S};28E}V`C@*&q5$; zNyB^<|ElX0PA8xjBHxazih`jq<&iW?ym@AZjjmWq(epyAtBva@UCfs;7dSx*SL~FY zoM!a2=UJ^YYkPmp;DXUgS&hKGLR%Ltfm|Mf5P`KiDRcPypv7A-gKCVXxKUz}_F>3r 
zKk9bMHOqf$$SC^Z_;Jsuua2C;KehIMesGUS_WkC5S}CP`ON#lOr=T|PNs)_3RrB_5 zhNgIXn(hh=q$`iC9Bic~`?yYc9w&dmBwx4zl)g408k= z7P}Bu3Jf~TUbt|fyTj3g*-oE|BH)E&Z+bCe6Uey@8>C0NTLHzY{7KV;RizicyOYUP zD4@y$;0=9QN*A}O@r5F*$`dUyF)@CeG*h(!w}e<}6T!TkbO5$Z$nja;*icvxqBeDE z5GVtBYcExJ*xdy!lPY#7ke=h;A9cecnnuJ*kV9jM7->+1WcAtVvXEbfLqyBjs_$^n8E%0zgn3Kg1X!U2oriO)lq1ORuC?6<)PH=~Q@;9e<|_F%W6i0@m(H+#(K}g0GK>m5+}G4f zLTveE(h<;4#W=yu^oRgWG-bbgQ)B{-*G9t^;#F{no`Fn4OY=KMUN1N^;U?9T1fRLK zU8%!iq-KBpo2R~C4t|bwH0r;JB#00Csjm3Sbso$ykhPoN?c2Xj#bS2n0+nD?#0Iik zW_e|0nlo*nEV{T?eC7pcwMHvZKg{DaHK23U1W{_Cqw~0QLgV_Z4$f)%h(jodKWF2} zXa%dd7)5pF_X#P>ZfYX2^dI!{W(Z`7HY1v+iOn11Rc5u$S-t&PWkm1Q-sfk}o>i|j zU`FP(jvQ&M-2}G=w6m)=WD#69HCmDWzFg%@g4$m7gz;X#*n&cAxi`#&^Yt~dmqq35BsOq#a2 zpRxvmnGtX%1J4dJ?EKUGJFIna5K4 z>hajjDC}y`yMEHX6c)GOf~p=GOI1CCH-@=A7AfKf6HPRPVu#Ct3EJ1t2+o1ScJ)uf z?QO4xRSUhr5lXFFZDZ(>Tc$t(t+)80L(!bOb4w@)?KwNdX(2l*2*+|k8jZo-(~pGa z;?`AJF&wCbQ8o!6%{JQ9rGDPqu5B_#k5qS$j2l{!CKS=U%K=ut-M8CL! zq*tV$;A0lF+{J$v6&v|>1;sxd>i_)UeH0n>gZDcN_LR%xRFc}DPHXO-o~}*0=TX=< zb+AYD%bA&4zLLL6T_>q##!g0ju1Aiv>+Bl1f0J2jqd7Ko(iDzV7rIk*jvL|@fhS3g>Uw^B+&Vwa67Kn`};L3XX0 zOP+;`?=II~J!U^Vj9QQNX}IRbh<>$Q-ZU?2vi2YUROfs=Ode)gaSH>vT4aP~l zL8t_$Xm3dboT%??|S`gUsa~Gq~!iaVKRMES1MH9w(Ys6=b>nSj7aZ3 z?ZpruPxm09GkHBQFOg3k04?SDsVWfUZI2KujpkwXm{h+MxD0a41=PX4y;lJ62nX9z z?LDM+8MQ34!wbbpe%6Z#LN`v<9#YsI`9n{Otj#l|O~${9Lzv>A8gW0s#lt7W6cg}W zuf#yi^0>PJBeK0Jgc>B{H_kqQE<~NXbYnGc%V|}7uug$P{f`4?dMnNI}=aCx>iwv5nO%qv|aKg+K%SQOiPW z{R7LNSGp|WCp)LE4LyFPb6)M|2*Q?PJEDiw!+rC1%$9m4VOF0f-*hk$To+W^?tc^w zt^@96Tmc5+#lm{v&>Gn17HhLNZeH1%;!Wy=#`q3)=D#7;dm4&|g%e>VQ4+8J>U;n9 zpTZHR-uPa56y6EWOP0qE!2x30MxNG)X)J20W+DvvN-QF#@ zU@~E+)Y&u9$QT@m&EJSCBkdZT#YPvXAV9XOMnT1-M6?9VJg6%qd5Z;enfkh)>t1Yrw*zIY zNtK82A0;;93U*B!pZ`b%S#99coX)$jk`z*3nd-NoLNuk0F%G~U+oehj|QvqRYx91eFJ#vtaE#er(KZZ0G6=FU|!8tHMK+zU28C)C{(E1Urn5`6C6(32CAnCf-TkeLGt)Y$04$jJD01|c1i{%^QOsrbG~*610-|JgGCVGi$SDLSR!%Sr@^FQZ869}mD+ zEX%RsPh;}MZ$8Ng#fbxd-j4hg1i!D1+(tSW@|iUo&|W+zR+NY 
zTn!drn_3yK-F7SIuw&mISq*(!y5*DZ7RnHq-1c`C01+$RqzM?8I%galYE1;KIW)Gh z??Z~+<3s8=+}6nJy6kH<6Ow@qLAvrF|29T!hVAd~*A{9Q(lzM+i)*&b@fzzlD0`UN~W1= z#R0vsyxe&A)$3(9Kv^z@zI_`(f4_z&uT>5@M0qFldEs&%GpBy~C^%)Cezyj>vL}xV z*=d{I)*9E9mX=mRZPeY|3_86> zP3d;}`bfzHoxL0b6VCwQ{f8Or4W?!e1e z|3zQ_U88>!+J8Os#f6M*3|2?m(5 zhv%Bc(1NdI;cse?H=`i`HcyBfSEd20_CF`4*-c zOnDTIufp0D$bHMo5DSl5XVPB(K^-|~htbMf@nN8TuVgKV1WG}lI~hES@37930f^8c zox$qD(Xo8e?u$%s^oTl^trdPa215t@C;q%G_a?YwKWavy{sKB zWwFw^GK+#;iCf=KL~4 zZi@20!)UNcQjA|ebqL&6OD56AH2vWV=k+i@@KW_n0ztb&uu{|6$&@deI8N`Z-@Y}H z1wl(F!dUz?RJHBct(>N1au%xgaU53k-m8|Czi(l82UMkt_x<>K#i@3`KT4TGl;#>i zOI-T%rx!fYa-CJBZSgm&+J-%6S8!yC`B7w7`QjE88~Zpc zvf~?L5IdK`BCiYI?v-WG@qI|Lhg=M>m;7<|Yuw)X^XC(3!@Gjsj6XBsFqx3%2jEYg z4)Cyy4(7d){uZiAiow$w6yF&I110Q?vgaBtE9?DF08ZMGF8MZfQ!jTOKUa=e$N<<# zlfFTM@dn%^^w{m+BQlvzv{b!v(kTE{>SQY;k%Q~5!I-~OEj?is0n(o`TzxO%3T@w< zADNz(3IY@Zz78px!NqH1*>k)V##sC^v{eFP5HdGJyxD&N{jC1<+pwg8K(T8zX(sTi zJ{niM`uo7QFz2zU&!X-)f+DvkzGJx7J-Xq6d=Mq536m_Y(WTJd*H?DBHrg7m3PDSo zzycH|2^3Hff+JD`k*3cTl=<=&>h-r_czRNJj~X11g^DiAO?!ZfpHj~<71Rz15?65h zA^w4H0l~d1^5KN|y2mz{xJdO?-Fz(Zqn(Z1t3;17^`|8LiY@5n7eG3_{%eX)$V#K) zq@j*bj$&Wcpb8r#qj^23({r|*gF$%kw5LjxaSev_^bWV$Hy7U&kmFN!%jFQZpGA2u}$=m8e5oHDFIqSe7m_b<3b#Aqpz?{wAX;&wM9Sd$hhI+!k zn_bo{voob6liF`Re*84xubTAi_tzaWnVy$18Zc8}{ks9an$iGj-BjD#?+=bY^J$IjIIpL{fNz+Bf@#5iqJtrX>^1j+Nrd*R;PL|eB zpTWBvj~IN%+r;tBziFjWw$yVya*5UZ>OJ}M6Q14AklLtUoCCju*qwU1}*Ih){aQo-cOkw*iJ-vMfM3U5`pDOmwY@8Pp5nL zIQ`iIKXUsf&oBMUYHaeYPhrl?r>%?)YZ@h0?16o<)4CBVxqiDL3*3t=V}k?_-kbED z7GXo*+Uj*+g6D&yfyYa<&R_!NtHDnmZ0e6~tDE!TBA(P=6@OkloU-NiVslnG>d5c2 zj7A8-y+InVccJ{C4?ici)FAl+(Uq@Ku-}0uH$Il$x9C;H*EiD_0rO0sbaW?Yhl-W8 zDMR&3<5j4YwkN^$snmenH-a@Wst>H^g7aw;x?ACZN=YI&QG?ZcmfHygK|7-DcIR(wlA?cQL!B6RM%&zy#1Nc=+f&kvShO>roKd&H;MfX_wf z$BbT)S2x^@9nuz&G68gAFK30*%CQ~~U0q#wLEf3m(XNegmTl+47OtWvKLfbCzU;w z#oX5Yp>d-uN?kwsY_oyjt2(dYmP_cH(qpVnV8?={wpwG>pG7GUTHm-MR z2}a?X<)E4FHkZO$fc`#6j~?QLh_!drzXtEd*zTdycl>1kP=a=J*Z5$+UsD;W`#dJW 
zpisZ}3TKGsh7of67Fw9M%jAjkan!TtuQB^CEBxWtYBYB4Nb?=ZOn3@B(bDgbPtaDp zw=Fzu(s7=29A;jOYDImbf7`O>vY3MgcbYZ6a_!nNR~ecNyH^6_ebqk*;_NRF;Uyge zF{hdO&5G@n3E8zqfTF@~(`9;C{=(|H69m=ub;a_=1Qly&6WCtmW-p2lPC=-7@*_O> zMMOBRyQFI3w!E)X<-vcHrjGJ0C+n)*rZO2~R4V1(L3e$wDjjxIsM7HWN5J(&f=gx} zp&zKhSeCg>qNPV1?Xr3$fNU-$w2|T#rEibCra{ldU^~6Tm9pnSB?s{CZMfCATU!&7 zL(``sfI(o?f>_)u&vO0lO_#oD`E9%sI zm}_#Xz`J<9Dbco7$BN0T_NmHYd%7v^q7~lMbmCCegjwdoKjQN%y<4ZU%}PqimjAOL z4gS{t5?~~#uKSkMpt>U2pWSq76GAd6;y(Kr;W?swWpC$v3C+3FATqAqr2QG;yc5;4 zd7@OIB4WOrwgP!zfQC(Vx<3<$_*>)Bz?;yg?k>MR=pVBVi4&op$N+-hJCLdTjk@>v zOA?)cS@UALgoo(Wt5>6xy11y86ZW&pP**)Qlr=R>9z(Kk3I$Dj&f<7Egs?j7FHI-~ zzf{;5-ceLiid^E?)lp&C&AL|rwgAI$soGixcIlqf7bTRT>N^hv)L$kP@2Iq^Ch@QF zT)*hiVZcE-i!pg5(SDw4%C#|WZfxOp%tEMYnXq{z;~y!}-lk%D;z>gwiFCY!FbPZV zyxg|Ax!KOxMGwDRzO=p`*!|_p7yOS`d;Wo1lw`;6RM{W`KqgJBudf(V9Sm@V2s zjt`S7ynYj4%fXj5H$rk9*j;8wS*Vc%mxz6kLnO-o6{0MHS|Gd!2zCms58tC1 z3u%XBY}J*SWw;1td^gs7GlKqhnmd)UBU4Bw6Ok=?z&}-ohGH5OQ9E|{^2Rbh6_X1wCJzg+kCTnkP-8h!HQsQWN#eDPxk0?fj zo&Pth)ujTi2(Qx;*}tCx1=dTR%=#j{>w8zC&0VUPZHRjY!6!=t!=&{;S{wMZItH+p zYWTRO5b#*4$+`0qX@;?BB4(-Z2=Y&!vpu0?eUNPITZ$`j7l&gpV8@GJui8Qt;kj!O8=&8rF++UeXWDzlZqbB!JqZ=IEUhZ(x`4u<2LvLkjTM~m7TaZI zW%~Ds{7~ze^8-N|4Go=PZL#FrIX9WhZ0;5SOfxopPe!J7q5T4@iMhj*ys_P|B^HbA zvG0x+fY|jP6jaOrB~AAJeamz#ujrr9Q+YPAl?#%!^}x0Cs*_NI$>4)Y=gw-p(ar-~ zQ4&sB4p@A^13%5q&JHVx&Wpc~wIT+%JQcd;+cvU$*rsqr)2qvu4XryQ{_+Z?)-B#7 zocIuJ40l=BxgsFme(awRBOg3xv~mqP&|SM0p1db5h&z4=W7*F@l$i2tUxeJ>jpi6Y zq#WIbOanvYK5&baxoFrSuA2`wxRk&DMOSCjIrO0GY&p2xH}92yJwY$HH{>PStm(I+ zWwH?cLDPw{VuMCXowISH5&9dG!jLfe?0W)vx%w@ANPQVS6^3jx6!x`-J+ci@#SnB+ zowkzxM+^iysHx?+HF_ zVN4bNt_HLOe&!^gPQhqLfMy|2C209xMS$-y-&=Lb(87&3y%v9_1q$Ic>9oLEWLFKk zV{(7A3q5T*q*&j_c>}=?T5o z{hr=Fd(Ae3zvGZDgLxE$xA*>GCZZ6~F`4di=AZd51^dO9EM6Pm%}adPp`!+sS@FsA zVHajRfr2&6r-`{x&Nx9=w8%BOLq(YMjJrY(+u#2k=&SIPzWV!2_5;dOuNY3)v~lvD zl8{knO?%{}OdCVGhB80r*v5Slscz7=T1=NxJ-r+y!>Jr4EC}q_hklRpWFWr+hlqo! 
zapl^j&eO71_=$;BuHrVtRsoeN_$`T2Df7t-Uuyy^fJO;)$Yc(+zOS{1^0tAusaiPE z?LT{)XJ=51m z!94&&{J;e#u`)F{Vifd1z0>%{g|t2Oi(S0q&t`5$=2}&zp6P=qv)3RPb5qB3Nc4of zmXR@foSR`$W?$9RYQfvkP|sA$zT@=+Y7e9h{!z!yfc4<7Z~^lS+X&lk$tVwvGgONe z7+>XT7;NmtqGxkV%I*^(^ar=k>DfTldqfoglHFXimIXZX43)?iJIc%M3}kYHyN4no zr;{;LwQINgA6!oqiu?_xG1tH=xpJj-PM3H443&iRfRB(>P~JzhWK`@7y=?XZFJ^($q!J^f$HH;0V5TVk-8r9s|jW5h^S?c+p_&O<}cTv?#c zevIT9M@2_7>AxYp$)_%+g@uvEB<#Y)}%R2zU}xabPzn{tPYg3lZX*RdcFW8 zh`Yh;5LhQtAn5~3hds?RihNO^$vtyl!M;KrYGx39;dMdu1QwgsXMNv{yt)I0(I5LP zlr^*g9O*R_K5Qu{7_&c9vzMTaj>PrEA6||&NAw&TJ5_cS1I^Sq`D+_rqM(a~z-PvxPQ(MS zL`)re8|6$Q2O>yv<51XAAvcxj2!y5ErxG!9gF2`Nvmab&qQZl-_BJ2c{0v_~EHUH! zyZd*eqoNqIhsQ$we1>xrroq;URZa-$KCy%v;g+BD^Bi)Ui+7xskR0dEA2&|`%{6;? zcvy{RpQ&4c{rx=`TiU;ZO2=~1|JKFAOyXKJ7v;gBU`%y96z2Q%RwmEg63P|?1sonraR2Tx`4#+1UAO5;70X(F`tw z=@Q=E6oh<&HQocU6pc5M3OYK-Zf|ejED{6>nLs@t!=wg0dsC)Gy8FS5U2=wfz$BH$ znEFLv20|UdI8iSG$XJ8z1g2;6*WN)o8+rfXq7?;NkdSKyMBj!m@cZJhJyJ1fd%{T! zK8cS|FCZm8fL>0K9v{heAXK&zSZjNVmKwO89U~0$$OQMajpmicu%Vy$14l#5UcXc4 zy-1gdcRfX!ggj|FC`Q*J+645q`qVa-@29x)uJTBDnd|VBQGM#K_g&sUs z$ctXE%*7#nG(X9nneIppng+GjlAbZmdE;vfCh&cdi61HTm@bBV%BidrtafF@+4OAZtAHwEA?TuaLZel3);by z5&z?rXZG}f!x!>|o#J-#rMrGHMB}MPi#IdD81m~?Md$6hqB=kuI~*s7TVKNq*DzwdRIe9%*!u>SLtl%&?Jcs;`!Q;5(x0`m2f1e)hOJ7_;(q23Q$d>_0~So zc7c7e+s28WK|Q;>50KlWl4nkDoE5(~k~M8R)!id%fFY+c1}3IUe%T5GxD))rE_{Q^ zl>2&_^TM7QkB!;Zvzd%6$3>4VKUikZWv-q@cfWhY;eIEwu*m0$(m|l(a}`JjB_~xp zdGdr_ZXWJgAEcJ&Y{u-yBXrzcSGRi&F3?+Oneuhn$#7BCC=;EQa;z&8?xYjlaz5LP zwu##GA6qh$<5c!d(KG)|YZCtBrS`H=p2sdj!rLC(>cO8sD}1a3{sbZY;;&j;TknFi z@U@ogLr-t-p&~+ufsl;D_5Y$6TsZ+YuRlWW0vK&fsi~5o0yQrrT9*kl?Z4<>W!P;1 zgBE^~{(saCZg=M!xBC0+(r7%`tmC=XVQ%dEm{e0UsH}Askl|B8Oo`i0V0IvK2~93m z>}VqwL)9uW5XN5ig-l+Wo$tUe?*YU;7Kn$a+<9cI&^4i;anIQ}Mwu5?r~5m)y5Avu z;A0=Y!0wR;&-!Dr$s7t(=N3P4RI0rTqh1~c^RkUMeV4MUp`8ydo9PO{CdbD;_W=vD z(!I^?%}7r_nPOLUiYXBDHF3lJB|Utr-zheS$w9W z(JILucK8h-6S|UGUEjPOuT|b94Oks*Ur-sO__-1>;;Y{ zW94VCvT0)So(7l_p7)tsE~1I{NLO2#)Vo#)AcbJtILHsf08-MD8COv+7u((10yi9J 
z8wNKbQHCa|6O2oq^~0HoXQn}+Ee-Bs`Q z`F+^;C$U132_u>Uq@9iEafqPAU*cv&5`|4xI8MkL8 zns%tl(#mQ$U2ERcpWoUNW}n5|X@*xI7h?>CB2hl`kH>>9+%{Yil`>{L!rI_>Ia2(D zEP?ZQ-X`L2D6yst6sa8({eIBq{%M-;6#yk`rQgI)9D=(eImc*god*RLN~5WQWXq9N zvY?Df^2zj84bA9aEzYAbnT=h1-$W!tY4r%&I~+yiIqbQTY3ZeICaOEIVq_ zpC#mA0WKzxZg><;jH`M$6X;cDW}%@7+}ogK5B40k9EA}1?2?j_l}d1~p~9rZc--bc zg%I(B88DEzf6!_>z{p;uvUvSE}z$S4+tR5#W1y*V%T*r!e43N}M z?66FjL0~<}jY?2msZTXHtjtYC$}xco%HaYWnx-e5Rwf*gcjnpCM4k7SzoKAS zkJH&6+!F$-Y>ffm`V%O6yaypWOj`Oo3vjWHEeCk757J%VFnX)ek&RoJ#C>|e*LoKC zj^ddFnMa+dn~ z6PJS74AA|ikXRdh#Gbj8OnW~K^#3vT-hov2@Bes7MMI7)BOO_ZjF2r3vP(r~=0V4v zA*8{v6XBSpB;^>{yOM((BrAkM#|lNr{#~bezsL9e`}}uqx1;kq&+B=O$Mv|b$2a@t zTTZzbk6j*K4t?ror+Y?I>(;mB<># z^GTXzkPLW&9yIGUlcc9IUimCeFGo!J$LtZ?IWlyzuMG`UisJ*NQsgCeiykBh- z&%jpg zTP5*AXLL=)Y_R{W?$hpe1&8sUnXUb93R~upPV7W|Yf_Ax8e{DAv6j^#7)nq~7Ud=U z0fOx>k%5W1ek=5!Ja+N0MIiq`FwO@Sq291H^0|yZgx9-lGf7DSyep*$--0atDpgSJ z)%|cy8aiRc3fm#+WLpmwy*q?2G4EI{I(6&M-{KM}g3FL%w8_HpBU;?^rl>HTRRwYc zJ-2tIHY%`SrgUIa7$0$QhLH|(#w%kgJmZBArej?+X-2Os?;Q=dG3t5K)P!|Y6{E4U zFS@Cukdblj7*kBavFTCdvjF)P@Onurm0VQLO95LMi^M3?G<7IVeG=2S>UmvW-Afc% z_ymIsjc?IX2AVb3u?lxS3`ckNcxyFpUob}3K82Z%0`T>UP86@&X(+%ZO@KSJJIwyn z#fG04-bmY6B)Y!Oj(5evkz6q(2~cPy9P29u;<^y>P*i2&`Ln|^2+^Qrv*GUU#kKCI zhAAG*Kk~7kEyAu_V5VL#uz%}>nbTb0nyrm`J~48l%3nUqr3Xy7^}rr`E?NyGfkkpS6#Vf%NFJ3<+nuLUe5bJn|S6apwB4Dp@2v%43iwj%(j0>&%NTq(Lrk& zUwQC$+YN*hWuHIGjlBqAvyn1rTgCMWxJ zP~Lo*<9G8*NR^&XdscOkM~h#20(-lSxPJd_zz6y6E^efhI0$GBHw;KRzrQlRx;T9( zpZCBjQGo))6zDr_E$^n(K$_HIVu>#L^tp3S?wK9rZ1+UUwF)&KFRz+gSOw!xEhmXu z+XDrWb}vr?+E14o41mOabGuw|H1)3wt;c;PaNxFEy{|9=591ax z$UWC4x2T{>l|k1Y4QY1hb6lFPudm6Be!K(JStgMVp@pBnr^Asc3 zn`TO4Jr`hRVMBD~po+DE>CFqsjPtde{c`|ZwCZpICzb#7>kcYk<7hNOp>~iI5k|@= zt(N1X;>yem!^Y^P>s()2x9BWeA?XQA2|e;?2AYq^E~tR1Qn(Qiv$} z4mG5hvo5E244`JaVG3(D-1VGe&RvwKkP_GQ$!Pgc#+X`lk{`xg+lV&S4IMat5R+Ge z(?&l8xMg&$b|w107Z^zc0=&>63u3E^`{F zE%;_VcGeP=RMGA=7$lL%qDGFL!P$^3*1W2gP@rc1P1Qb9h~57CQ)~;?<$+_)&XXhU z|LeDZeIgMJxbx%U3&WY%4^q27GygV?zi;;DhBsH?`AIu|y%_;mu3F^st=qTBREcd| 
zvA{WIiXC|Sj5c;UY2`qI?O;b|XOVmNlM8CELVGAF;G^HwALhw?P%i@GH4$@b%aYsa zL&zo=e1_r&K*7v=>I7sfLT)iy%6mlrSV@{T7>TZV0P&{>{AaX!PV34(&W?~4y-pwx zs@!1ums0Pi|0pJ-Zv74tqVP~ZkYn#lk|{GK$EGj2b|-gaJ^z59)I1QDX0ZDG+wI$y z_(i&HM_`_U3ccyZaq+zl3iK#vi0gd#MOO}Yr0P!cV*pB?h*M-KbtSIv>Gg7X`I1a$ zc&<%%Mw9h$5YlH5aDDQaTb;Pkal+qZDA<4xzQL#Yv{QdGo!?)t{gFSFC#slh#OLh| zAVZbLdT~kpmj^nL0uHg2H~K475B;&*1hW**CBHlWwbLxYwnjXzS}Iy}pcCPyIR5KQBn1%GJddco6)O#0`w6DKe#Cmq~l*z^}>uHkN7278JWvNyjTOCV1uCn zXcde}IwvriOs0DP4mX_ZJ~i>93Ae~~`+_EpS^H*fxAzp6@w71VXqUCc*groY`Epg| z9V6}hkIQxYCgeSOsAB(%$9@*^cgB?W{g;C!^1}Dj`lwqO4`0uIo)0J6&LZ7Eemj-4 z!j3SyWckI8zQZJp-*Eh;BwV1UFtp~?7U9XI1P_Q@z0kt7 z@{h{&e@rT{Kp0{~eH-ULhQ8|hv+?yJ825crY|F^VUwR&H$Kmyk$BP;iq`ksv;?{9%OAe3>pc zw>JzR?Ya$A3c21SkWuitznH>0FN zt+@ja1w8i}wB&Sn5jU+R8f_C0a^~@(| zQcPtY#{6;AKU&6r9@vQ-WlK%;wW%PB-ydu$TowZ}$=FVJ9aJ=s?6ZXoS@tA%Q_^8^ zdA-JUmksouMK|PMEX>3VR(dsxFBncGmG+;X2IV)Hu`xYy^5jW#r#s#=y{{qe+rG*- zL>1JX#{}3^KvEGf8s){%hBi2FUY)PKiwPaK@*Vl?1$wA_5L;I2Kta{Tw;| z>?UgNPk)*VT?uOp?kqjJ9q0`ttd{VG5yiL5j;CWRkun!B0X&GE`&liSc%44gq(Se_ z*2ti1aeWVd@naFKtR^^ZdwMYKI{J?Jo1LsZ#>X{SBL4I?x6}R)qiUuExf#VUdpF9P z!0_JfDR$()gspou4-iu&4?VsB#`qhOgl=Vv9*kpFQfv7gB~_qR*^1BRiY5> zyw|hpTXZ$q>$qZKi+tm=+|YAJB&{k}P{mH+3JxlBAYs8Uyd0EKYAs7Bn;$aJoPnUF z=2~Zf32>Fo+e-q*08G#FXK`|8?xcQNqlKwae|hmH{KG%P<29v5@)v5I$VvZ~@bc5b zf}?rv#ryFf25EOHp@yb>hhk)z=y?B=YX4@5%&lMJAy7h{m`OFZu!0BJtavB?3hG-C zXta~oHl0xC?SqrgD6`SQ!5C+kn7<1-m+znn2eTLG3pX_Gh%5}{O>cI*g1PWwI~d`S z#cdrz%;=%=HSTcR5}9HD%%iR3LsOrd5CKY#RUe4v1lk=+^N)qI& zb(-v1?i5iy#+RLU@`8atYd(cz8rZGn+~ zq^RZlBk0lbAKp76%-)N|5K&RTP+f!O_UYOOhr337%((b*Syk1*Nf|yT^5{aw2^P0M zx1Z}x4jj*P*WJeIKc)XKHwR6YlhNW_NK#+Y8-R7lixAq5+%)>ni8RkWkVDDXW{K+k z$ku=tH0Tb~l}x~cIMRgmFh%;^#3hEoMg>=Ap!>A#-@jizfs5%d*P9a*8n3h)tUAn# z&^mNyjXS5(Cxho#t}0Mnk1dzE&egP1TgNM$2YX*jow(a%;&H>;sEV5#FJDGBjezqo z!4Sm-tH7wu$!&6&#KP1+7h9;wo1PUDeDS~@K8uka)OK>n%Hs_vU0C#pLvr5#$Aia` zAH2!7@!@|wxHWW_YEPr?A1{%xfT^|41N5i5x&Ie6p#`u# zbS*ckc!#(wQf&B@h{q|^rV5G;J3)=;drl+;^#TI{1Mu3M{gT|Yy!nrDj2=bndGmWE 
z3YD)5ePN7o+^X;j7osKsyObvu<2EJtW}$lc{6J9Y66RBl)2t{|XJI0x#Pw^_l`UvN z|1Pk5kt%AqmH?D}Hw!mn$CuP{smdw;{Z{UAlO0lQOSHznpS_Yg-LGX7=|PANign)p zSd(pWf3w=(*DOhSo3gy)ZgSFhTJPFj{Zr?q(3_88^iwzEpuEz(LdrW>+-t_~$Pw`( zw?68P#&0*fYgI5CH*OXRv4T>{DO6DOQQe=!-6_#v&^Qx%SMD|%ff`dKR#&+VYQ-6% zmLOK;od`htJ3gJ8N!on^C`Y8KkRJh0hwt_eC*I?wu^UYyC(n}qn!}ue)&U18s74@R!hZHN~cYo&rVpREK<)`K3G)4`Fc z^Tt&up}cjIf>fGYU6%n!zd$`|7jW_EoKsepH6qiHY7xCb^NLc*Vu`N6X|05K)$1)Q zG(eamBO~9(o#6b}OFpu>;d+!A?x|Oz#n9hd@UwUIhKW*4GiNu}-mS|K$Qqes(B4VZ zTASi;Z2pt`Ac3YGI`gAlF-c8!t7ixI+`4{{jcws=w%P&O$9sdd6*A_8Y6_h{!eTKG z+)4$itC1*;lqnVFeyFX_&p?WbirSE}ElZ#_)Z93L2$0LX2e;V{5qqM*-eX1@qFjRO zKE=4^&pl{GZ(@H9K3+^w#)<8jH|-S9`z|W*v5ytEXLYfKO@u$hH8wVu1;8r1UcM-G z_TNEk84A();evxy@?+WYKPWj|W0kmm+VJuJaNT$kD?T!g&#W%)v5!mZ$ z2f7}w1AXJR13R^#BwFD$v^3ZxHosN>%4IVU9l>ta#t9N4GBPp;nR~-wp$sG&_#>HS zf^H#a@)>;&U8V?bJN&b1v3$uo3|MBY2$SVsOMKHo_N5b_8pr;6o}X#id}y|byKX`HQhap#C>pY*4a>!_CRNzwP^-WDpPtbX%jB=&u#_u%|F9Fc? z5Zz-CQ`qkWm98^SW{qHB@R@v0*@U_fRT1zcsEyo?>S` zn;FK&$+@X_=eruAcmR%Z@HA@emdFj23*1ul?(O`+Fu!yfm9B*Xz;nWqv0yI*s$4MMEVJfAK!PKi+^y9(}Gk7 zOoe>E$YAz=*a1J(WD?=W=KpgHr1;=3?2UWM4`jG@;|-*y{)HcPAQeYI~94Ah1Jj$3&D{{2=R_Szg3=)OO; zHL?qD@<7?}r?duMv9TFTe~gUR&p4ckuJv_J!>%V_odF?g0~}>g3JRW~^JjE8iO|sr zZ`z5dH_s^vw7$_Y5*!>n1Ffe^sAwt=^xmA0?)b%&`YA!#U4fBU8n_oarhWUr5`g+s z0!_PTA#PbcOUgAscV2<~H>GZ|$3(Fj zmJx@keHIvVL7sF}z|cUBm^mBhT$6sBYti~4G$w)@}w;~t61 zWDMe^s!H?U^ZFS6F`Ha)_xx{QjgcA@oJOS<{LB!#NYsEE z2ckE(SXw3#Ao;?zOzaS*Rk zh}Y%5+|gf;biZqBr08L`9PRDP4ohGCMv|_Td%j=nQOejrg2%Sc)TYe zrxGrmFJc9CUH8&kV>U}4m1O;?zN}Z-^c~a}_PH9;Qf7az-C8@knz{F{=R9!}3hwBdnnVDp;jR7UD-y?0C1 zyF0b&c;QBfPh~n*7X*oF+I;P7WqyJ6;momRuzn{lKCXr2UZGWGm$hvh*ES8*sq_Z!;-a2^B3I6@P49n1&NH?zD9|y1UvG(0I*P6?D(=bddGP2NReiO9=bmh&`_shqbP>!Y4)Gw-GfxKV)gJrgxZxM&To$!k zd78`do^#szE)T2|;8r?8TNY0<((eU+a%62Y+ zgw9a^J@a3eS!tB)7^AVNC%uj!Dk`$G$KQYpY&>{ryV_se@^}8eEx3o=1j6L*cmn@3 zNfnSKm{qA)(KII)>%>VinVq<9j85n8mbiF~X{!?5nj9L}1!*_TBTko5bB5PCZLY7Z z?T1kmzUCB30@epLhC~Mj^w*vn$=`SVgZA)p6xCyAJRp=2(Bv-KGxEagma=6TQ0ztv 
z&YgpsT!EyUI&b&B4VKoFzva@*@i}OzrL$9e+LUfThXLJwvkUF1MJEkm%}}k0TPu}R z>vH}2oF8(P86n?9!~o!c#irL22VjTAYb!bK*BH;LEFg<9lC=@=mjo8wkgFMwl~H3f z?F&m!)C)BTf-xRD5v-Ob1D|L3Me{1WVRUXN$qTs2t+pgRk%jCj({(c*^;#`IKR;pT zn;#x=%%*vf%)BZgG^)^25*l)igF&7C$LQ5m=26cR&NT_PIDZHE{5q)R_)gJYEgBk+ zdFUaP7frl+&DKau^bb4{cp~2kDNtePFRlpd!h)JQ-Y!;@%Y>jR8G+_CcudEgAK)Bls4JUJ@5@D!>k;U_d% zU0T=)L+DOu)VQ^oYPg>dFLd&UVRq&6)1i2VmE-L;Vuv_bS>4Wkquw8dK%0XGst10Q zpS(juO@y__xzjjh(N9gVYGZ>-S3=Q@h zXG;c31BAtcM9_hCke!xt?qhGPP=`Y4Hbw;!^&V)Pxuk#}UiC^c;TKo@07DX#rp8B) zKv(!2lY)@*dCL$TjcuI%l(Q&m@|nAmKCNEkQgisu&S9>9jgY20Rsq>sQ4w03$XLpf3-HqPRQeOV(pJLfQsol3!=UE zVan?q?AlpFnGT8yyXbk_k#1l%gmH2N={PKB*wdW#iu{t&0SA|X(v2dj-Z#))yK~bq zovD<(YbPouX6`w$1?MY^4lKWTY_kH^pyfx=?YrAMAA$1pl#np&R7VA(P}w&Jd+(4O zL;daER!GXUmCo!)T3J<#hLO-8L*5r=SfF`$1WCZ-&5!TE3fiV&Q$P3)eNq!-Vm56Z zCquu$t+`GJ*xW-aeBM!Zw(wt7W)I$+Xh_fq6zJ;(=QX*1%qdyCR*8Z%yrKQE! z((-yWuhOHQ9Ps^fV*MEqNoVQ^Nr(T#)Bam+`TIY$q2b55L`nYjhg`JF*Ax+Hk4rX< z!{aZ6g3!k))Nst{GaZah2q zY^I+1{>>EW5!latqGM{@onoz$p14_EC~K|RpQbbL@@^MrjcXqtr#xkm=VG~5Bz*nT zp5CXu*ro=EtjT`Uj3#tQPMBOOd%}3Cb;;kZ=44MYj8*lDE?g)fX}sCug$a!`o$l9G z`lFlZHcjykXTY#ofY&x9eWm&Cw?!U9{ZVDp{H2u%n^M7*K;sfOR1&lT<4Q}*zJZY=YhtJjheD~(__MBN< z^UZ5)huz+K&1qvPA7s)pGn_&4XHpzsW?*FeNPYA%R`6N5V0a7t`>)6MICD8uy;h0k z8wvSPzZ|I+s@;okJ59HIGZN?7yRzIm{laqMNoBX0e5d@v)Aq)gxxVFe|Jkz3m3;DE zQ^btUm^7tr6tvg2QF3kDPWAtMx%c9>pmnq7e1}Xzw^2UZ3mbG;#JBh@Nuq?LIh&Z& zfRMR0KCz-1k*;qRB=Lg{`#LY#WyQ}}_$V(i&f(lDMM>wJh}jU?YXC{lN9WYH!j|O3 z68*0G@z2bzY|1<2Bi*f7LY%yIp4aTV-tx-oI$Z6o>Pl-^;G>RG=P{M7@NRV&H^xm@ zXyzQE?T8zfiMhF0hUn-<&PN&Jmp0&%bL3Uz85KPWT3ectjU&Jm*hgUCKbJtiEsspl z*&Ee4KS9C_EN?{2thp08>8j9esSo0x4K6tD&fW1=8#|#xNLrSS)MB@asOID2JFe-k z=jg+cxDqvz(o1`GOe}CU$$axOv9eb;N>BXr^UAe@){>Qk3z{4ns7plsH=hp5j+Q%} zW#9mjs7=1n>PPi=^~ZZtQu{~VSI1yfUujLgSr)P;)_p2;ebDrM4dscouHJ*FDKf^J zTPltg_i&@WArXah*6ByiWCyS3l&Wj^gp*15ZBJyRSqod=MRaH49sO+e~+)2R8 zU6Kc_ms+i1@yia?G4I5S-a(cu{S?te)r*5yodmOn>>@}b`yxemCc?6IJw&XQ*|BVb z$b$F{u5!tiRMxq(rKbm(LK2OmI+E-n@REoQVYG0`yFRa@xAZ{O>64YD{5Tw#W2Zvm 
zWbIsCJWjD!xuBiC!)%e}c4^Z0kEn0&_lZVJ5KQAF1A)-uzi;2ZckQdLX~*^rJDNMP-3%X#0wCJHlT^{ubsPOe*B()uyyV- z$WlgYW{HVeYi(lgPgSe2> zbpBnJOKQkln8pp5aSD0@wtHEJQe9_?FLcGJM-S3kLphYCe4n=MlyIq5dJV7IGzS}S zfYi{Oc;?b|0?R^g^?Zl%At7Wh&#S8Hp@yjj>0G;@giS$}<%jBQ*3KB)LFd}ilAiJI zDuWts=Sw$k-0*jLSMVZzt-QY0u$nLD-Eqr2AtpbR;2;8ac&%F%+8awBijpopIrjkR z-c7W^ffH!}xKA%Ua{}<9JGpan$oW1HY~jvK#st=^_DxpS=|)2@{uIi&xgaE9&TlSL zHKJB#p0CuWEn!&E9-$=EPe7^*6>q4|G5x zAXUgwm|>5)1gF@Cero5VuK;h9qoT;R`h;6?!0X|eQvx1E1(e83#n{OLRs9OFHq6 z#F{MgBVMoHy-N~0UpZcIiz9t}Kj0VHeRUUyj%9= zIyAkyqrWE80HrmDcs0N{67U-NGUvfcd8gEt>DsxBdg7ce%Np96cuTmDw2%byeJ9!% z^dnS^PrOq`Eg2Vc%*-;xiXsGHu`S6cGBG}-XLTQ<;-vFF@eBlIuu!JM^l1}2zb|SZ zyL0fMwAfoZx0Y{BIzX5a%9)N6x;!s8Cx(={e}N5I=><7QDDjw*W<;;rZV9$k9zxRb zN`|+6&h|h}1gyD64MlUh1=ByBAw_tHA!HTw>rOYfx9#KPV0}^&Sb$yk2^r?jJIKy1 z$CLHF%R8@RaB5RZzyYm*q`J=5DfuYAKlN~(|I4r@A5f}wfcaM4TYW%MkIC)6oZTx+ z*}qlJIe;3&itKaBRv1^o)aau0muJ}v8iIWpra-Z0w_m!UH-A&f&XySX_wxujjX+?{ z>%I)kILqPC+S>=IcKkVHr8Wxwt`=5?kAHshzryV|?6y6Neq*S^I#f#cWTD92(m?5j zHIoin6AO#(PQtY0E|90$!W9fb5RHS>cA(5_+F;=8OaZ$`)ms+py+_43pp8NstHq~a zVsqt@%Ewzm5eudCxdL0~dUj+yNW*2p@1~f!Rf; zLX7D({&1T7^G8eR&0ExaXp%J=E(>dTE|lQopQQ~e;0j^_raAV_c)904_*(0$6B2z> zo|A8RzysCsWZtp9Doga9DVk%JJ(z}N?v*uGfK^jy zQUJ;(XTt?)JatCIr~Mh6{7P+5q0lzHKCMScc{V{jODe8|=VblPoA-Z*<}to&$~Ei9 z5Y+$wNfqrirQ_81&g3VyF}dH`MoBVN0x`w=HwJEj%3iy{9gvIQ@0M-j@d~PV$FeZx z7uy)r6)|c)bGw(|*IP3x-(Hk9r5_=d$!~0$bDo!dWJnxps;f0$My3TkoB?ZXj_b|8 zq&Rud-^Rwq3*?BRu?8~4iXIT5D|XY&3F6DtsZ>8CwK;@EYf@>)L)I&W~?5kZ7PZwJlC>=4tlcmpS&XB@1g z6Kt3F=BMkR63aa`1R3_!11I8Ce==AFt~6V>PC-uYsw;!4qL=CT27Hmy(S!1iSFT*y z;tx*_ObBmke<@OY;Na;hbf*kYo3=yBxNjp2#-}_R>vwc~P%!MBS;pA(sOrP3Y}nrp#esF9A8+ss7qZMfEH_5Q@2}+AX9`NOi`kz z7J*DZCk^6KgQ+r;I+vDiuKM?0$ZK-g86rON&iQfv-)YuqecxxMa~)iRJo#S5+98V4;*)$R44-p z6&1m|;ac~r>q~tOK#k9XoPS+R4PT3x(+pt;RbV31)32*#Pr9dY5D7F6xY!@m6j-E2 z$_%=3B({aLD)%8@r*sarIF{9LQ90hf-zj?HM6pld#bUEd){2c-`xMBcJld}6$nXT8 z`g7P^r{J*5<^E4j|ClzU_(H^mXfu^<6hc(F#wC~YK_aw{PdlJklvK*3Ty_GaIkLqT 
z7OCU%-6{%)m=;ve7CZYj=93d57&FRy_>u}T8J3U?;+^WbH7H~|s6A5m2#t`k87;I^ zepDYkpJvaTzk(AbjU8LdPynGhN<>A2+vXzJ_M}K6oV@~4bLz_2`>%Y7H!Dw^*0juP zt<07^a5PJjkFgXs6Mt~8a2(z(c8Il7RktCqLX;$qV@?*mpOQcGGY@e5Xx90_;OPns zz&&}#Ox}&q`26`b4vcKrr-1jDBYkr*Z$5tf_~|UehNzRAc#s^`b#ddF^1BRmAPUv9 zP@R$e=?vcBf^nh#@j&lKGgWcp( zc`(3bgQ0n1ys4AumlunfZHo^6mp^(9sD9Hz#a|S8P0RqPy~2BXn2p2W{SSx6uvnY- zzHT?mId2Sj!t648J+H22E_r_~xg7YgyBNFhP&lRvfct+^+3#O;hS^@K&qDcrH}B**(K$^byid_d%I`PB(loA|uTRKhK_D5@A#Fj13C|2<{Ch9(`Iu-qqREx-pqt1~k_eUlPNqQxBKrL1p#ZcuPmz2Y?t!NCk zAM2s+>|tcOx;q)}){O`J&l=Efjgr>HjWGV#R>MNpzY@fr=7(1!MQ<4_WtS+ZqJ+H$ z1;rIXaR_79$QLzD!pPzUO&{F&h#p=(Zc9p(_IsCSk@@;OW6 zMbb(&7uC?+mg`QGXEoC<<*Ps>p%$OpjMhcXcHg#|@|V9F+8ch9M1EsZ5M=f#LE{Xv*MT8%1d6pV z;(Aun7ef|n-KjTkb?RY3sxDfgtU&^p?hi)7zc~_N*~yHAwDijI&F{S|*3n*OmoOXq z5QSzJMxs-qMD;wQ5>M!YXC4USaoG36M;bOnoh7dVATV};Fn^_h1C#uHhr%I;tJuB+~QImj-030s>Heq`;&u(Tp_pb>;*6+rkQ zn=W?d3eFF9?JmEwj@h~hD}rkF2t$@}rkUv&4~HSEweD%No7i~$16-MPN1_;il%%$a zS7sv5YS!SeLn*ZNOmAlI7OUk4c$bdeZ8Xj5BE`204%j+*^(=?Wxu>wI2-qt24&A1$ zvWomFTe$QsDJm~X_FRu|h3F%P(9@y2cREI7+L!C+9N9GG#n0(q>vL2P&wvdxJb59& zQw?hpG;16`IlBZ!2>$$0dT9cR|9272bz1}M0#`1=Lb0HE&uF`Ox)eo)h0frycu8+k zD@tQDZVy#5*#siDG}kp zZxcb+utkZgpFAbI4=6L5s5{QZb^R>XRhuQA?_P4EOB)QA^Sdug*-h4VzP-+oG-feo zI?)yUt)FI6|LO(2!7}V$AC9#Y9r#N0CdeMGPX&BNU-o$g!Q-Thhqn`i-bN7RDk`rL zSzr}VOgwiEI{xBNfSoP67-iL5^_FP5R}>U6&%)TVrM_JI0_P>+MC;{*gox>Bft?Fg z^(@wOTLGPzjI=z-G?wwVzH!*R69l<@`f0nUocL8rM4zySTjL@jDcd$`;PsWP_zj!@ z4^jT<(A^RTLz}Fwto^5IVwfdx=QUPbkyqYD zeTN6Ir`fDGrw)ocr1q@JAzm_uN&n^o_^Q85dex?-nWIPFQO8GA9AvSM;7N;t=yI`r zA#hgiDo!fpva$cX7j?WQNRd__BGEt8|%otaICkPRGXq z8FJ-A`&|O`*|WOt0$Lhh!U;;II>riskvs*&WNsdNx7472nr5&BK9uNi;!>Sn;!>7z((IY<)ao3c_edRDm*&3M{@ug3YPv&DJ(IqyOxKKzADJ&r;E zmj2W@LObzTX>@$e)xU@-d|pO#kp4vx!@>Ujwv$#TIk+$#AG=H$4lTX1y?Nx1k&q#n zU}1)d^P;%H%R@VDCDB8ww&$`bhXXu6HiX5K(yna;HV4|>%=2HE?XpsVTTH>MbbpAV zZd*YBmC|!FAa!S;zQrf8@^w`enSJ8KZ^Y7Y7$yt3L+W*L%_T5#9}K=6$;6BhmX?=~ zAB2gsNfyCo7auCbiUmtZzE4=9BIPX}f`8l@h>`^?|5v>;wbS$7&J7OnHjA6;XY49`N648tE;RSBmu;}3l&&M4dQ@Nb(PF=y 
zoh*8$P3TuF5pWY0Ask_BC`$=hxEhKi8VPiHOmI1FopwJ>OR0#fsQ6W=%-+-~`?ey- zm_MYmsED2DZhPFiI~|Z3q!4JObosqlwxc7|oD9$0T9TVHMamo&GVh;8&YOWkh*q$# zvGzY;)qVuPsLlxiH&Dh>^xZr&BqzyGLG4ik^A=ZUwVb z@E?(WOz=(fNWmn|JM)a=BUJXD#z#G;wj0&A^rrCT2o<#suz1Hw|M?w=5QS25L#`w#lFqeQc_K8Y??>Q5;n{T@tz=N;ut zGAw&h+-4s2va;LA12j{Turv;m$HOxlAqTo-6D4gp3Ncq{rkQ$rd}zOO>GIUtCTw5# zd|#Q@@c3soS)A5)XV2r=l1OZ|;J*5W`5}*=#dkzgZ~wama@($)=rcJ1GJYb2hqZhF zYmpmz4ucRlT?M#2o8_G&^jg*lnb?E&*YeYNr2Rn0@^;ku6xNpIjl<}n_yPhMur!e_TWp`; zp3dlyDi-r0q`I(LZwd)T>rc_6JpJC6LxaQc=w0fyRag(C35?mBI46W<=Wq#qPf=Yq zK#>a#&gC?nre9=!C+K_CpMUbtnLOyLaL?f*zjZe4MDFwR!JN+P)4AfAD)*Qv`-MzX zD;nkYu#Uc;=XceX;VarGdt+X%aGU$*pRf?7L_bXubZ{v;JHc2Mn7#LT^7^Ltchgk6 zTts)JK_2RW6HCwm-DdAHLrFr8!;?6;fFUuE&qh=CBpo*1B?S6MR^9p%q@ z!rCiK1SmEq72Ng-kZy>+Nu7@$c66>PI>HG<<}(A8Ueh-H~v?eET3H9H4p`S*^ z?Yv&Ho1h8|6Ov%=#Z2tv$#NmfHb^H+FBbAe7dv}En^a_9MmK#*n1&c0dmUT)`F(}Q zC!lazSINR2=HcO?^IXW?Hia<0@B(y_^Nx9#5`M&kRUhPPaf?E7q;G~ONyv!wi1A^! zi&zyoM|2P^=~0hp^3oVhTDb=!RCk%4b^~av#}wN&8;zV#64@7oE6~-0OmFJ$sGOdF zSBBLy+}DNg#5QibQ0Fk+ooGDi8Bnw~X9miGYKHDEu?mRhIe1FdwiiGpv+;m`)#?3J z^d=~_w`9UIF9C8Ph~BiVFfBb0If09=F>2=^0Lt>C;CoX=VN|yl_GZteV{8=a zD9@xy)wS_?@7PZD4Ke`LJ4UK9M(1%7%)MXJgnK+&AM`ff=P%h1f45&1H8Yo4#Z9!0 zJ)bUCWaiqfJ9`DIPWDE>fB+>L3jY3Sw=o#;@e;yj6Nz0Q{!LB=7g_DOwwZMdl;9?$P;KcLI>LDvPCC>xHODBhb`&_Iw~8~Q*|>8GyndqF zYi2L)94_}Z;XLl@_l(&^o9QXO>FxFD0MN*hdGg&2oF3GeTUn_w?}V$Oe09^V!@b$D z_ttf$*fiMm!D?0)ShH~B5NXQzRpoCWKmqZ5Qa)0 z{LWQRM-Apl>5e}7G*n5775eRs{=6CN%ii$$;>V+T2TnE5e~5h;iqYN*%X<7&Q_wrx5V1mj1DXsJ7F~^9PUP<1}*4%RFDGHjtq)K#kq9WZ7)8&F_Tf zlt!tCcBiiJxzSy_cQ5xBSgLdiyV7usd~RK;J8<1%t9&E>7yce0Fz8jfkA@~MIVq_? 
zRZfGWNWnVN*H%HP3?F<_fcErWSZD}`u zWU?mS_rTdzVK8`g)R9H@h^A&s7IVn}kS0Z#;SK6u5_j9Rs-VniLp~-JwRx}(LldxP z^#<}z1w9kOcGW1=-Fty4me0rU8i5rZmUgHJNr!(t&*RCnv|Q8Y|>E*l5{CxLmS|C38wN2EQD1 zbCx3H>V!klauktvyC9LiUPSxz)z7!ZL;fTb!Fi03iu9%*JhCg;pOWZd)F4a&SQ$C ztl*D-x8M>PR+H~ryKZW6h-@u>XTe%(uJqg%r{ofON)h$)z`&r5?aP0XPKEyO;;5`u53txq^XYOjuHie zr+i81>vqsw+@BRM^m6U|OXGnAVYAhH#j}PE{^@?O zusY`92=m$roMzViCDxHX<-EjKQF>G66O{zVC^nE%sk(UVkr@UG+_g3X-85r$NR=%P zcwmE0l^EB+UP{$~65|;ha-KSLBP4y#<8Lckzkl`d2W4fcI0-rbL}cH&AobG`)^L^Mx=c5rpLp0nYzG? z(OKBNA6=Xw8~U;t>fs&O$E1nSzvQ2L*orC-M+(T~l;HD#R%FpxLb6+Md*{{RvRmHch>ppkv4YlxYU|ETH;`uKMwm_9?z})q zdgshbvycO2h(H$py9Oai@Q#&0R&*}2={vTi0U6X79)_4|Pv-3f!CD$^yNISW4kk^Z zVtvh;B&@<5Ee6&lXrjt!JLXs8G7CvwSeL`>pBKh9oEu^M{$l9+G8XHE3X){+=+w>&rToin#d4YaMjeEKmI@gdRl*MW@>rx=O@v(1o4x%gf6ON2k1^e9+djj*^ni`liYTmI} z4_|u5SxWyTV6J$s%4pBQc2PoguWcdNf@j~k?8e+*+_E|yL7AB5dM20tylN?t1t$Q< zDJIHiYpVgImS+OB13SEd3CtG)f2bytzjC}zzTE@_+yxR_mOfJJ90=8Qtr`~v{A|d7 zE~Z$2-8o1qSJCF<3ea(3<*;=64Iyx&2lV_uJcmV?Bfx_5(Ih^VYxno9Fx@mMa(ilk z@5jJe}ZbL<{P^+S%4730gVVN$ZY#8{}O0rH*U7JaL`9sKgiIQ1TrTUvIj>0 z8pDDvIFE(dcq^3!6xVb+I>AZr_-n1TKt(On$XiOuN_^pavU@yd4D1^pl#9%0fAsnT zpZx?^YIts!G~vIxsEmic%zhv9j$!X`#kU@b;xo^jK$L#= zbju(1ayTE{dMHX_6jCm^U&a-GvZQ{^@pABSO#8Qc z7wcmyjp=EcvzLwOUxkj7d%+JwwTDzJ)&PCy{@(F@d!3fa4@@ z<7Z3$>PKFord>FLiO_SUTa#vRsiWYlr51d)G0N!aT`Bpl_?;DNAG1I2Q8nkySNc;f zFWOBX9cNVe+Z@rpsX{U-BDtio_G%p7NLtA=j8aaaFV~M_;9%VGaoKcvR#hAF$g{~I zl2e`>vU15J6vp_>10s`D z=ZMZ%3_vGud3RIOQxgU z6B8co=p1<=Pk1!WGRK@LQB!l=I8S7`Ew^}dypEQv&3+{&F0M%M3vFZ(;??az4>L2f ztD1^q-zrpENn6L4pLZES=T4|79e-j=sT4~|*z5!8ezjNz-VDqlCujkmDxpp@pB~Hz z^z49VRlQUbvqLwKoRAV7ln9Vt5Mk(?ul~b{xpVl?J$g1@XFO$CRTJcAVKB>~OninL zmAEMD>cWQ_mwABqeY2h1a`&r?-7(u|=n9?FB{qi#4HRF$uY71WbAKSC&*Tu+h}o0v zD@ah_la*!pn6}wJ8NJA$wKQh2CMQG0@QtnurRvvwJZH7zP_e09E~0Iw!1OdU*0q>V zJ~|^|I~oBgvas>N?nW4&F6ejp`Ee@YQ0-NwsVqVVl9n`d)3!$3Ke_G)P^g&jSsgW(ICwAD8*us2Lf3c61br zLpNplm_NWLsJ;NXa>GWlCtn3OGAh{F*oR*W-E6f%H(TfOB_J)S5tiVL^rIqq`o*KM 
z8e%=Vb>pqA=$HHuSRoS~7be2VN1}H^!NafK&X9W@jT)}1VCz33aPE2c?wvJy$3b*? z+1j3N_C)WIS4v7Uv$J-p%8snbB+H4Sep;eXwY1ZFZ|0mVS)r#FCvGv*8esy>xGud} z+fNmne$7XCb=~%@i!pWic+bbWRvM#=#R~JIWd?qARVU7UY!%qNkC%TIT*y@%tV?PX z$b^AL3dK8>KkP=x@G!0Cq@T@FobkXN`RXXY@wl&tZa2W`vFS{HWZvC-ay31#=(c(7 zxQXCR-{jt05bmU8oj8`DvREkd{gZ253Drt@V{{(>%balMGlXs_GtD~+MOYI}fXBo# zrl>4>?QWME#PqZdHqN1pt?@Oj*qw`ND_8hXTizM2z=U;>+Y@;(sgFkij)btopQ~rq z&^2BySG#-w7?+n~Ntg@+eZKsDkCJrgthG#%j(ZCuh*EF4Hh=g^T7snclh20LCCohM zp<$x$sp?OdUNT~$Jb5eE-k&)A{wwu-UoPpCUqs4z6;JzskA@ameUn8C*25fkU;6ua zoN!o>^(}%|pFs9@LaXzBsTlFn#y-iDGmx8CqoEpvm~AsaH4n95DK%c|rwZ4iYQ_Fz z?XKGTiO2NsYZN!{54vr1E!6g{hE#HxJCI_fnyR7R$<12 zDg@y4TD9#&=tifT;Bi%?zyZDVjUfdPx z^t-IoFdstAkz1G#+Lglo3@$-~xz5&dX8H;NO9JuzLFkg{Az^u^L83Bo1l+qzQTd!m zpm5=Kp;mjG%uI6q_SsL@HC5KPel7fRE;oWlOJ=$p3c@!LEt_uHQQ{(6qe!N764(%J zB`7+bq?wZm)eg5|Ay#THeosZU{khgd#9e0MPCquio0`7}WP3VHsMkkq*d;o=uT$(G z?>AjeSFv$=#xsl8(fD9K({G7RYxb1OSh&dFK-Qd&+4hOn$g}~z(JQ7 z)&hf#Jy>WN)v3P<=6N*T@yZAPBmyfyaXlDpq(mCXnLBNh8|l!w2cBk9{A7cnl@rNj zfq;0q(ARA(7ITFIjN*+ysxQe3KxvRS*k+Y)M-ofq-7kzWP1aS$#=u_O^U{9{ku|N-1MPW6dMueioG@ zRS$N>&D37dVJ~+y_la)7hJXk^?Q=wY`FA4wcMUevHoRSffo9cI1Fl2A-PQb*Kl1>L zxi&^m6it3A8yY2DqVD~$4=ng9G!n*Jnp9k)1C&*A2Opl<6jGbA0b(?5%*LhCT4fe0 zlxHi`eP4>T&g!y8&;VeVNR8o&?&cA81b_yK8KLp;D^~p{OJZFj$=aL~u=@y~u$s{V z76X4qPzsn0_8VtgRF&|JChZU++clgh0njy;*53ji03z2En$Q5G6)bI2HC1Q8$N?|J zyw@GkVyK~^S#c2o9Tq!GkbCG?E=J-I=okhH3$wUS@$G39>SP;L&hnyw8&+Ce=rOUC zhcwh8Pu$b5TzIPg&|~0{FyL(ye$DRIPnPoB01_l?tcH=gSAIegq8~`{VFaTP=zU$< z=i&q!Nng-3w{O0}?|LJ#{pESl4m6Vk0Kaoc3#$8@9hXGndHU$=DD^n`liI!_j38lx zX!`EeAdWj&&Slm_QKUa@1_;vMIHTK<1M%z??H6i?{CKn)UKQp?Ss-5b$f}H0IlRZ4EO@988g+5TVreFyHo4LF) z%LdBO$E1CKoemp8LI#bYHJR@VWdo4oR;XmSS}txlH@&83*u8(yh`kpQA{rb3`P-dT zm6(T2gTUbiq_dy_r>VR#zaa>cf6H-@MjYd`4Oj6!_p!qoxioglLuXC=VP##B8*^&* zjIOf>^{z`%LnRasJr&{dhv z?7OTU=P4*Owb@D6{Mp`;gjxdgFS~^3(dlmU0sFc+RR+zlw>}*0`)}y559iAi=Je%R zw5kqjt%*#EhsS}>Wy7L4Mh(N7-n$-doGV84!tbX&X(0)6?b2}=E9Osk6u712YCl^x 
zF?uxzQSICXym3CiW_yDXrY)?=6Yg2HwMR~z4fg=91ic(Fh8HgDreWXv`B0Qk=-%Q} zJluQuNch0I6QASl8c)+Hsjp$|IFN+0Ge}&Iwk-#4ono*UNFQ8Cjxy9J9Y4Ot3i^+S zNlzv?5mk=m=6z@3Sj|GxFB;5&CZ=5DVsdmw=%&vyO+uQZ-0!>!iq%_H?1qv{KoT zXpirG`zZi#vU)p;g}|Q08JzDFE);8aOcWX4VZ1pH9F{Z7!^LeirM0}N$9QfIX5^~@ ztrO-lCPvVNa4nqA-04daBGjARXRaInbmUfdU&Q*t8QdCV9#EuPgeRB|h83imSEg4S#m~C{GcSDNOCw_CQ zDdql5QI*-NFB5+Ai3|mzAsEU4VT4ZoRk02eHt(*fJnA)-qqHTfczobQ6pFW~)5$0C z{S1~f-ofJg!~cTEp-YqQYK#@pICcXy=KiJep5S(H6HW|}N;ugtsM*GxwW(V?C*X={@)3veTQ9pY*7J+`E zJg4i@bfzf;2Wu1j%;=xDS!emV#>8+fy{%4%X*k1#=A5n7s^46Ic7MN3(oi0)Y~rV{ zR^s|P^()B|uF6;@o|lp7JX7~|Yy~q>9MJYeUetT1h6~9X%Ea;N8F$^|Aqe0Xuhs0W z7DBga#P3b7nYE$#MN_q8jR~2Tc*BX+U^3x;qHhr2D@r-!^=Gx9rQc;}pPuc&QmlHa zfn#W3pftM=BUH8YBP1JZdriEkQp43}F^M#o?;W(pvf4yP`tslvsdeB6#yA-eU~V2{ zr6SQhYt0AUCRk=qXU;Mt9NXJ!G)I*>+*z<$xD?vH z??~uGML1+(gx_-ewn4zU*_@14;D!-ley!XXsA}>x4tnjqiJzuuxG%spLiou(&rKe0 z-K=Sna=FWz0JGLy-#v{5-2@3}p3_=E7*7Bj_&YSCqK&`ooh`v<*$&xt61Hu;IRY9L zPqizgAuDk;_VB1(nWo&+_-Gg$lm7ZP7vK3yzoG=)b2Jox{w+5#hB@|w1C5uOxLD5S zNl4be(v|SW4zyd@SP zAPIqUPB4$c2?-u>1O*ar#oKFtOU&YdDFMgZ`RcAPQfz`p9lgT_)blF_uCA_f2;m_H zn8q+`Y7z~`gB)GtTRT@sie}KOaA$JDj^*1L`m=K#TMX6ghG%sWa{+>R$!JJ}tlSv3 zAf_F$2)eo<{YndH{C4s4jeYIhjSogT<3(uWjI86_)<8BeXU?g{k2t3M;gOb%=*y@% zr?3r0*OJ14G7c*Qc4A@<1cP*U*sQU4 z_&r^sT3g19X^ zCF*iwhseyAoRH;m_aThu`JgG%>jLYZ_<%G{7Ie)Lf=N znxjD02$1d76}#!$O>8Aj~`b`X56KCFzfd0p)NoG z{Qi1dCdAbi-$e6Nvifd83+@$`t@8GK(RnS$G9U&QE;fXcT~TQE=q_RQS9}9QK9#g* z|2m;=9dXISQweDO~G9}$)S+8!FUv`7B7;X_!A$<5wZ8Y=uek*lS@a^16Yh&%`dIe6hgK)Fna3 zYVa)dP%JU8UAv|_!r;Mh#V}H!_@n*#z~uRF(3ptQ&?IX6Z2@l|L*Y>RSV_EO+}e}W zLi&BsC29#a^*^BY-#A&cxhH82n|W-qTek)Q=8~a^M>cx?Pfr~32utxqo#iudIFeD= zFiXwA=fOfB?@Nhv?_pfvY`(0Tqsh-LENpBUEx~tOuiDSGM4cTWCLln{ex2ED7lEO< zR46;0j2YPlTz2s&v$B#o6s5kgn<<=%dAlaztE9iL#P4VRZS|6Kn9CCvWTs&cVpaUT+t~xb zG_&X?Ls=fwZdUhxe7j^5^bI`3`89(E^ib5|^c@S2aKPsi5t+NoYxZ(+tT z%$G&PcNFXAE-3@s29(-nzUgQn^+f-2%guMdZ#$VVkYXMx(cKKCA$%1;!%6j*asGag zpVwDX?>+W|{llURJpT>S_8G4rc$PvsI1vrW#&hU1AgzE9UoP{=Anq!$EIzb4N_ud> 
z(v#$18c*)r$1^w!09l`Y^+Y)&?sf{ABm#4K>Za9ke%F>gWFF`;Xv)g$5hxe0Cw8<` zT~ofBe*|*3!1f!aUbukw*Jt{Q<__0I9P@O=(zfK4BCB>CBShQ8uh00OhBZ&^8CS9N zh5li+ZEGmY59pJ*cWE;0NiW*o+R2XRY=BLeTJI5t-qR-^7uMYja06F_Uf3ir!*p?9 zyOs}VKxAH6n}3cTMRPuu7SqNjXjXc+G1r2svV$;`wy%2a;$I*2U&jZ~3e)G^&xPLc zuhjFl6h(NS0*ZSJUVx6h^mDjTP+2pB5-38ZhNe`|5CQbuLDx}{O-%SQ_s?r8k+!_@ z1uEtCrURtXACK&L@%X@p*}iMq z*8FJr@u3pv6_|J77yd}`-bqQKIo3Ohqqm6{h3W2NV^&;;to9m11vh=_2#mW`G8?2| z{Ne1Q`Hv4wCTDoxej{j_eiCov`Qx@OLcR%N%=Z;E!=x%OviJM0|LfCmvtewCs1ky1 z=2=SvDdMH#uhdfk5V_>j#8*MRD7^x}?`A}er>c4Ue;-M5tz$S(7`9{FZoZ2dL z*B`I3-RPv0)eV{e`ag#ek_IrfaRbxVaG(=x|1{Jx9ZTE&>Ftz(g~G?%e;ntp2YcsA z(5-p&HsZdR>nRc1PVr5&;Y7d%o|UIrLf>X}NesxO>XeCt8q76hRDJXv=P(%FtI~HD zg2#C+J6(}-2E?mi2*_2f%i`R#-xQj^jboU=iiJ0phQ`O)To1Weh2HfL?NwfpVNrS5TOpg>lq@{`OsX9_ zL-{ZbuL6M=GP0+#)0V%;AO{ba20+|_fdSgLmMSvtYB{PNKTwk}QwD+m_S&x#S-D=< zEacu+6gWechd!EUdi1UO;ZEqlAmmwYsLx+Vw&NQz60(D>6|r;k@+O-WYYH>Ivkl_W zHFYAq<^6|6eJ^{A%>O7>DI-@+Bk|?Bgy3=C^eaEYaqyZ9K06Mp?VA6M69N*KECf3<#Y)C402lA8v>b-6I{!=<^acP-8 zQHM{=-7aK&wBll%d`C5^Sz@KY0KL++k#_$}?6NoOeY9Qtr*+S!OJ+4V!+J8ds5Ze4 zC6Z-aw5L8q6C@{y34fe@(zD0?&QJW72TuFObYXjaMm~N54gwI^4SR+w67+565QBv+ z?k=pN{FZ-P!0%s@R0uz5DVr;~OX;{Tr3|TaC_>8g;2fb78)43)D%=vk<}B#Mzkspc zlsR$>eSEEQo0$Wc+baf54UVdFp++C8(YjP@@t?&2WECY8YVqz5?7q5OhSHg3S=5U|;@%D#dhvX1bR00jrBuxGj zjbMjlz$^Ft9q$cdt^z{%NQlH6#9{vRJ|7js531PxPFK!JWK~XIPhO~Zlu&-Z!CwVP zjRHBxn!gkwDxq~gWm&mW(mToMtBz-LeRM)dW_p_YG?&(MW}S+KpWW>hhqDVa-`qCQ zAB(`)8=W)OLh(O2UYj4lSO|Jc6Er0__EVhYR4Y+*+zU2LO;3; z-X^}5&=2AjXikG`R36RXGFnYz6 zSnlwX2*)84W-*3pjbOW1ta;|sH{>kZ-m>dBwsw9+A7+@)en?9z~N*=ze5%b z$Jh0YKSOpQ0|bX@P^$}jJ4gZqV?)OCXb1b?+*h|F6+>}GW1wcEK+@!H3o-eVKVt*x zHr*0!$-3N%lOv(t%?u1LPbD8{*z2)p&)%O zbnU7B_9MwZeL9__H^eF|s2{A#*Fm@cM+WHgu-tiLQ4IDKg1Sxu&z?j-xvhd%&;jQ- zDf3wRoAypAwd(#{)`_jI`?L5RaUDO?6;oxNtUK_5PYrtM5(+1^P(Tay2DDHW!^^3F zy+RQk7gy!!ch>aWSJEz&Vn^jm#@rTdMQSg2ssDDvk}{kVLyEqd%@Y%oUg*pH8BjPe zGRDB4oP2Gx{-giSg_+gucJyBB+)e`?v=6WWp}^`;Kh9gYPZP(q-w^&Tdj9eZv(LaM 
z&9*$N7r4Uk*|Dz4%blU{mkdeVW3($Pi-VyTwP*9*G02#mGg1WF)8|eg4;dk)g~`cy zV+MviaHvFY7sb&c@IQDWi6=cU3CC(fzi`0n=PIbT=+**%4_@}jKA`4X1NnTY$B^v< zi%O|cRReYWF{R9tf5rpSlE?kKz6>^H0bXvBDo{6SH{M6vgr^TPylo%H{(K{}{o7uI z4+Bi#^QEG|amMyK@r*eiHBOZpVnF})Gv@EX+~;LZ=$0ynyI4En#!nhdklzz z)v$v5(0wl6!NNLjfUnMVLLxM#w)f(foyBqO7D90`XkuVr%t)nd7V_ogk*R~)z%w3izmqXe6r3T-3a zaI8L(KnEz~hXsD35-c}gF$>{j49 zhyw!k9Yb-)+3SV~BFy)eJr_n@>VE|y!%m`54`9V=X-pA@uEcGbF;qMU<&e&!W=KT3 zek3BnQ!CtB*xMKk&YpgwAu+W&?4-2-`(F#AZ(Mx^^Ctm&yStqhu3b({ggOgO;{%YA zNZh%<4!tg3LmEB9A$Tm?=$$7D>`G_rz7}LV6^pq4WO0i``60qd+YZNk{5pZJU4iwz zl+1-zMjy1ppTKI3hWGE^n*iUY9B{+>TIEy?0jDvoibomMjieh}TVz>%Ca(aK7UH*n z>?uSzcc^dR0iJ#IpM~9HNiy6r`#$9C%SC)Y%>f@RNqO`un$-r2v319Nj?QAxDd67W zzzUIlm_4=Z$Bz)VT<~t5tn`uGLht237u5!h?MnDwpDV|r#CXv)%YT+N?LlZS(@33V zUwW)!AvKXUXXAVJs2zazGR>6eg6;{+ zpyc@*K)(85kh`{L1RP1jz#R10#!(=aM@qOe5n;8ZGdtmq8Tr%n#+hXF%+X-F=i1Z@ zE<*DQ43(uf3_-r}C@}e<+|*IY(cW3Yp@#M&x@6t+NV8KlvB+hYGcc<+L<3kX88AIL z+G|q*PIG|>XNzY$=Pdtz>)=B4jtql}Oc4{|in2s$fZxL#)sBzeuGgl{IEnBj(qg_Z zHa_vPIyW3Y6#q&w=x#5Ku)F}VfC3V^_CIqn$s`x7g8iTqIvW*Rq6~a+ z%Pqj_1xpXr(KFfayokCrX}K75m4=1*YU!~t_IwSg;LB9F4|60gG8i$Y0x{w$z&~rV z+zRYx{_^R+4}YH3lij1fNiHycI{7~2LV8p&|R6B1R!_XN&h^c@fG?Q_spe}WOiuZM5|=v;|~-zDCf|C)Kpck zfz2&zdL9%#u=?s58+*xtNhDvX^gbTilwziu>~njEA~8euSxe3D+tVST%d?#~y>nBh|0WNMkbRw}xS1dkh5|gSj1IxgSmv z@4uep9#;3gyuB_$UZ@-^&|~joZtl;i(bl`eeEY6^5TP=QzOA_!y`y-tz7iT$&DqG3 znSPN{{F+&FdH3RS`QVBEKa0-x60lrDEvhBE%Ev15pKN5_vkrAyqKj-pb@T{pV*j%? 
znALBPc|^fSD{9M5ctFOd3!A4c(-EeRj_u6^<_l6H6Qb=A%b*x05fPK;aQ>yhQ}hHz zk_OLYbsEqSdM@o1`Z)i%J;qD7Jb;E>IXIU7?zW7oxBg)ElUHJ7CCuw@*q0tqxuo*U zI%&*azQTFv@RZZz#us1?hXwJ}w!Em*4h|9tF`s^fo|@5^1>z1Z9HSDPo4}?fvYk&W zZX{yhL6*yV(LdZGoR%*5U*WG~7F&r*4*(KtuR%Za1Qpv4Aml3qm_pf$8OZSs7*OVL zWIiUT7YtqBAK0Jq-9Vm0s`O_wf98@pjL&EA2L-J62`jS{X$?>w43Zx%sSVJo{}#}Y%rPs!l;Tk7w=c+- zcb&y*1ZeEogt9kG*;3u5WI~L2XVG}Vr*8>|&f0QfgibQ#Pme~}BiHuG_)zCH`dfqq zmg;a$j!}j+!6G*+Rj=izIUKd_qEwEucwMqR?<-=?YL>!MQisX9tKC^*aC!lJ69t zT-!F~`-SXkc7@d=I}G0jxldb~lV$U!Qb%v9ZQ(iR-1OW4UsA2yL(P* zEKkfZJ)8Gf!qS>>I50L9-Tune(HPL&1>MV!fMr`+uY>N4UvsMUTz+k>ZcuRW>Z1-L z&L)0$#b8rW%ulhKJcVKsn@8hr32C$&ws!*9E_Z0M3yWF6y!t(6Vl$)b{>g7gYdstx2*pyxm`eq^+U_*pKm>*)$x&3zo39uha)yj6A4DS6#1%E+OClJkx7 zYw;r)qSrU7biHu>t>rLfk0;8#&LSU-xzZV_x=Vz z5>D|J6E$viude!f4a6QW#=@}lL~LSW*Qj6KP_uQMFFDfYs;jgg?6;vEyK$Yx&UO*i zd=2Fo@vh*5d zwDNb8VPMleKK9Q)PS-wGX?^sPQL5TPx+?e>M%X3E+m@f~<2D%J`>tNHntaA|`03lX zZ}#-{A=(26htCy47U*C}7p!#1VTVN^}y9(I+9? za97&f3S7gkAa~_MFqhit(^k|UVn;QX+8BM)rZX%cDl!sLT2|($1~Lv>r8LEmj~99z z9VXdmExigKE>ziH@<`uh&Q_cg>g);Pp=}irObzFPb$XVS@uk$6ll=XC3y!J1Im&%S zGKCiRj-hso%v}=0Z3GTt*Y-q~Hg-wI`aimkS;tWZ@!(%C)z#Eg2?z*447od-sojx3 zxM*KkxBbNugh(=FJnn$JAgT29(|Y!{;i9*1zXqH;2z)>^QX;{U>MGrNn5dFBaP)dn z7IxfxYCXeZ*}AZGLURc(lgcu_vxPNEQumLO;KqRM%gcO?55#o?l-y=erNRSt4uS7% zq8Z>#yuE~hxzX<(BSj#2>^SCs|3mK_ru@lpO=8TAOJB01tN0_!+IZ=!2aQ9?%3#cA z991Sw~EkF1FAW4pKBshi=yr6jzPkWqZ7mkv|m zwr`pytGFiHOitF8;(_dy)Vo$;byZ>&ZpOw7+m6@yjHo|Z#}z(7z0u=(ay7SCGf|^o zk9dS7)r)&pL)expo4GvJF4x_?iG{s*COoQBPBLlQ<#7_ij0=S! 
z2tp!YwjO#WCX1clOSm{Vs4L6Lwx_mHCFZr7^5?w^VngWK?e2Gj3O<=UOV7QpihY{8 z@vAPtLEF+rAZk)S-(8|82%r`;7}-3P67DhT+DVRTfd+|*8h$!2F4?8_1e;i5RbIF)Y=SYJUb#?V<%2x(F_rTKh^uY2?Uc1@!e;gqC zWVwwo7_Z86kgoEI7#lTj=atIvi@kSB6vfAF?<{3sGMU(#yJMuFV`|=%D6)_e7A#^j zeLd|{9M7x+*cqGzUP|~8fdsU+BIlfRZzYBXPl>jWhf(T{6dCupSyZhzj<^U1n2(HC zofD{>J6QAZ*m_E7(R`*db(ifl9S;3!dhy{6t|?f+&E_q{G#3rZ`n0S)12{}^U)f+{ zj4;rQCcU`Cn;`9$M0Lr+?K7(pnoFua^=J=bxBc}CUEt;|dYRiTZf*yudZa6RBc>g%8HdZ_?Ne&d;IPp zPIyM9+-=LE&2S(nz0*S*S%1_x;s3b3UYOX63f+cg^4)TmD%Q9PPc~VXkR#7d7gfVrv_xLF z?>*O8>*J70@a{nSJCW0S`zn3?9p~jEJI$rBC%ELBLQ$RqYVufpD>Sh*^;-w0 z29`Few%Rjir-V9R2l5;x*w$2!+Kp9bEsENIb&e0a*OBP{htJacj#)95q#!RmSFsxW zQR_S(Y{S9KB%^zyR?Q;zzizBIZun`|;zw`T1zs_=MsrqQ-UKPQQdo-osd)SzQ<`blxvd7SU8vP!K+KmZ}@93nVtS){(z1&w@2|(+n+vs>i^pj^6j?T+r2TIBS%4oJY-nhp9Vt0rWtpLzY`V@;TAsUi@V!j-3pcWT z3iLQi`GscEpLre7H&;S(xmpn=P%#Ce`kp_a1#;B4gcnwyOc26=Gs~U zi>#NFeer1zWrsF(hWmU7xISunxSq?LPaUV6Ad)t_348T6T=4Ml22`g<78_xm1^}!b z3tL=8{cG89-Gtz}iTt&R$$N0&fXq0iyOlae(84nF{$cB_63~(bw1ij z57aYKW$f~CB=j2}?l5+_c{yi@smY7o#$F%$oaG4-pR@2fwmF2 zk{$@0-UI944&BOJ7vq?ka1|-;Z@Y0NG2IVh4qJ9S(@n#w&P$KBaDH4mre3^o~DxX)qh*s-)H5CL&|(!)dYvI_vPm>gZ9YWaNe=V z`ke$4SS2PH$g^j=j7|Y=OFdjU(M032oH%M#cSa7Dj&>EygZE{k!aC)CMhA@rqgbW`gNtWAtboicQ3&4P{=v| zaS{JMp5vI9Qeui;H;9>?#_uR_%}u{GcPR}2rnk$8Y{!F1`;aN234zAA7q;^&(te3} z^ubG3oo?Oag>v#SXG`6rOx^5v%g(-%vW5^eWl=s8dDYa~-2r=1tO}9m@j6PF_+BrO zv^MB0*~Zjd@lLj(Zu_p07No+8tzbVq;V78M*XU{m&tCwSw6gWP1dcgyM2MwD*{fA2 zwjBDGHrO%+V4>AB5Y(Lh?vmfX8*wkF6fyVU9x9#oj|7rj!80Z%Mq^@Yi=-nVY5d_T zTj8~1o{JE*?pex*{IAIW3WM>)O%7_-K!y2dyoPOQv4|r|=jHq+nrh>oG`dGo?p2DW z)onjReD@Ld5XdlaZHRK+@qvoSmjdav=n~WP?OTD{*U#^rBT&qXSrO`vpx7W6rSV8)QSX8^p)nr5V(x#9?0alcXI!s|qV67w~!9h)9AUu0gG%1M$F z5??A-Lw3(2M?;iBcb1%2D;gev}WexG+fk9_&Kl(x<_OA+Q=(?=H@%rm+8y`5cGyYOOU zeK^77O_zjdr}%1_W1cPN=1F_;E|<;nXv|?|$pQ=RxR-O>`TudAk0oUx-D34hc^OK` zjG76I$n#ZJ$a9oSRbkz2)k*$n1oaS^^ey3F+pALdsvqUicHpI^QvRwRkL-7Mk{8{T zEBT80bu|NsV7C&wkZC@@6r;I>zsY;*S9P*W&N5@FGvMY1hJ{I9TYPSF7?Bp@J`DRI 
z#aU)-ZGsQpK&udB366%8YQr7{s<6I^bx`d*lxVRVQ&KhmBGjpfY@h2npg@k#IgO(- zDJ-iyTQX;rB>wSWw_@K7Syu2?;Iyd_WSH((5`HU6d%|;Q#;!)z*5z}fQne;G6f~Ct zdDgWNl~TPmZg<(rqn1nK=Wjlu!Q7`nnbYodOib6(G4i-_WA+VdF3 z6bsaq;>cTHI#Qwd%76RzU6<$jsJp7qF$_j9*$D6C^M821f23&L3XtXt(j#aEJ$dIu zbvx&)mas^s7d1^qgm`ghFz;2QiF|)F(%aj60$lf@BDOWc*PgalQ=)Jlh9r-mvX4jY zzAWm`Tin>cCod+Ay)M8q!*w26Q(aW_{Zm7OY~CFSkVeF%r-`(o^qPfj4U?R*Z)FhFxPuiq{$j#8tA06OO9-ZO}Ynbh+Y5T zeyXM&K4g2LH$iE(H~moqstSMWqtZA%)KJtAl`6t{-w>38zG{yB!BhWh3;C3(y|p;> zhulqHGZN;o`bgQ_220%1!oT4OPhs^d<(a(wGil|{#aNL{h>qa5j>RD|E6o37AH<}2 zyzG(;EB3%4G7S^q=(_L@@yM0^E!jzoFwg4E%b&WKnz*o+Q(3nf><7Hxi_stCT|QjU zvbD`)su#?ilURzS0#gqw9LOdOmv2ku#s#!A!;T~}gbJ%})^IY#pjoaSStG;KcJC|6+ zgR0zc=cfb=cci!nEeGsqnZ2>)dBWYULpneY>A>P+6bwmojod&~&d1?l_}~ZCS_GEw zH-}#6s+ft?I-8Sr)JK&Sj+=RF7E7&bK-1$PzuO$O)?Iz*__B1N64d`&KB(7=73(Rg zK~(8gOn%{mzvDuyZGN0K1Kj}T^;7f1si<=@EuRs4sn;0A&eN9mSx@PDt%gPHzEHZz zIy)2oy>0e5QmUr0ECbiW$Fq(uJL(v2LNrk$*|pd_b?Nqw>O;{?%b;D#Cp$`kk{{=2 zc^->F%1?CSgEW*bo>a#<4{bF6ah3k8t4VMOdb6-!SX{dXjrLttRl`MBp4V*_tf`#D z(p5ZT?c^xOE|izNbB-KA_s)8VN7X&-bxJm=BB)ztUB;FW6Jxa=9&FbQM>g?N!#!Ru zTGobtJD0PGaPERjl+&xG` z_k4No+__UuFUT^3(jIWrZQ`POd6@VQ_8VPeGGx%Z+mcZ|7qLT_Ghj3Rp;(Aafy?$i6K+1zQhPE)ExqCrzm zEpF5B$* zBk#Y_p?A-5>z_E)@aU&jIUdX5S>twAB{vJ`SG}vx#a^vOWoG*6L2yVd-Rt$K;00W_ zXPY03rruZFYN=vYk7}oG#1B+WH0f@giy4j6ohQLxHaLl`tJEduFKk?|mg_v_ba$5G z(-vM&thT43przIhl?giI49lu(MPIJ3NG7cDteTM&Ic>DUQB!_Spl5rJ=rj+TH`ie&W2LOMr!yj6TOVS zWH&jAE7IU#MQqhN7f)IXW~{gE=<9_6{c^~BIAE^OMC-AiPbp|ol`|r$ zhH}o65*sN=WKG>4NahLRu?5fhL1e88RVnqD+eEM!T?hj)1~3$L`ug)7Q-Ia}sC&%O zJ^1n?K!-V~hvPk<6!5r=Q093k!pDcQn%FwJoENKI#l6c#=bBX)=WiSIK(9sKIbRR)LFC@~ygsc8T|%rSTrZ~>43`Xlry*Q-^9-x%j~Dj1 zUddfz9yFV;+h&a$CQU+=UqtI{d~2!ak{ z<&DcC>-9u^pFVwZczJI&kY@mNAYI0GhYJh>3ahHBx^r~CI}`M>-D}`{s4GAR_e_@( zxfvOEwX37G^Fsf6xXNOiwBMbZr&Ym}^@{_GMfE<;J9IT!m@ zRrATCTaH=b^^a!ao*$~^UV1B7w8n8s;5Ye)7>tw4y_^gMCOy}#|Dluu^a=ie*%VT&;a`!VZ z@a1BPg@aL9jnVHW%Xn8@9@=L1?=I2|GsRkrSM5Z@Z@M}b420A7Q16P24WF5WWHd3G 
z*meKNX7_C7&6>jYXl1((c~Qod!vZjPSD{oWZxB<{zZA#LCpi3%2jmXKkP_mGjtUK0 zbSjK#waf6gUU*?OG^#D~mTa~~@sAh(FXu+^MmuQ)Yw`PI?YFtEc=J|4n5}V3aT7Cw$lsx!cFyM= zY;6RsIj0#HBL1Jnp**!M8WfPbsds3qOjPTioe=;y;2!K|Jhsr)-S2yi0hFSbEeV(8=cW z9sB)c`x0{x75N?Svj?tHrqz6T-E}5SoVHa{LQ23LWKLhvHO`qpq~XJ!`*0QaSGPv` zN*4IvkOe})J#8}3y8aJa%TASi?TjGkM{V?sl1qYT^|3Ck}Vb%`AP2nAf~f z@pbzycTRDMI%9RcGv%qG3wE**-|v%5sc3&;GhkgPz7&1^t87yf+o{Hj6v8#h?+{Zv zNxVsBYG>KAPY-GvR!HmG9GK^E8%MKLMP+o8jeypMm;6&Hb<0%w$+An_-vAv)-bf~CIkJdhN>$@_FWlDw&Tz&^ zol#o8oN32cyUR#COOv&QIu2>Piq+tRIcyg2A3R$9+C#%NsiySEM;pj*Hu zU&Lhsii!#J-j440p4Alu4A>%vut;%Wrd#|s7a;atV7$vbyyuXZ*w|fD`ua6I`c;F# z9Zv%dS{t3MZ<$St#S&056JuVm4Q8WuJri8~BSDvy_FH(9rzi2Gsq3`|4XHF$qzdPG zWMu5oa!;2x!O_g#VJY;~J3{XnTs_sd3?eI6y$+&O zXt+{G^#XTjSqum^cb^eV)^fCI#w8}!<`+oGWkabBNa=WJf;|_>#x=8+poQE4VPSvO z_P<*kdi$6Z_r9ClkK(>BP%5pJc}^=UTx;Tem5lG1Zq;MCEhC(vWYsi%eK+q7n(}iw z)@az|Qv0^4VxKk&bFC2zs?tMbf6!%6kFL$FKi`utt%q@0+rj?omAlwxQG=!fU)u7l zqRE6Vz7=#|GgrOr3dj4@QYo4cnk^wvQhgc1GxYk?gZ>p?FK|N3xnA)3Lex2C+k`tD z1;hpx&>YZ(E`r06*n{9Is7n_G@u`H8*p|a`3CNv~Lyf6}!&P(9D0$8_N8<8#JUy@9 zFBkoLy2B)s{Sj6Vh)>@Z zQzsdZ>K96?%y8$|1d|``+4l&I7GzI=2PrG@ubDsLNqFuJODd^1LC=ElO%~dfiq$B& zha9&^{#mD1V8`wCm%DwyKXIXDP(A#5+bnAGM)9?^XRT&fB%3enCSM#cfBQmbIrQuc zeqGkn%?5@vG=z9mm-a%uyn`wpb;vtwJT_}6|9^~q1z40@)GpwFLxXfFsHk*_(gF%f z3(}pUG!ES@Afdp}B}g|2NH+)!AQD5P(k(SK+&#w=p7Y=PKbHr628Mz8zPYH(g#4!>04+t69>hIf5&fe7C(c-++B>Og8q!@|pez(ZWWW(?4KCB%Sp z0;pmI9)|a+my`T2P({<$_X=iqJ2~!Z(Ln@{*8?oQ#sE?WnWrV!#^!lIU5<*zWTqyu zIE+UL$UWRBSn)R@rIzvq&b5N0w^qbDkcc+Jb39NCfldqe+3c`PBlr z7a>gzay$G;Vyx%odd-~CKE04BZHnoPIlh(#)wdjjVvT;N!w|$;CJvBz7}kY&5gg9rQ_O?a-C}$gV+p-*;4FztWUb1*Ud;*Ecml zHY2iee^h6S{wS~<^;T|db(<69k>k~?Fu&q6rwQ&VAotJO%hRAi@w zu+vysE{x;(Sue2mOU;>Md3PHcCzUW5%0)R5RaNf#OkZ+8O8x#~yvJ)}DSB-~R5QVJ z{c7Re&RP2n<|VSxxg2~f3aCRZWo!5pD@HaxgoOGfNRDnac*e_kghBuEC_4ekvb z7)?MkxWK}#lcKtjoUXSj{WZmPCL|_9=6CE`ac`gY3=hZfc1}H$J%RyF3OJ1K+!BSKYd;S zWDLzlC@jR0`Cgp|H`h|BdPjqkQ9~3QGqb-(RfHgE@iI|CzRMF$TY4|7(jX&}W7AIyCvS3GI6*GB+>E>)?l%+#hMAaj|s$NanO* 
zLkq`vreEY(DH>fW2w!t6B8jKIPh0vdu=&wSLq7%992|peNzeTZX3#WGGi93JbD!B$ z;RD!{AyZD2;s3u4k@QEwEQ5igiI)7+PhCB8A+pJHAnsXo#S?&L3%96fgA8<=5!6LN zk)5ML76`tgn?uUk9M-?zMneJY{sxCx8>abUL#HE(i42Hlce6gP!Dz%`;Msk`ub8zz zkJL#@N7x-bejF8=b6?Eg5NVWOsIi+V3eX!?aPYX5PL0oA<;r`&joSt9_Aw#2Q&Vg8 zjXPP%*aym*Z0=OE!f0|=Qu!Nq5`y61J-?MjP#|WHk5<2@=hii{QM&$W!3tVZwE31R z%adV3Mx}h-G_AGvr~hVDe+TM6zsDVc#VjcwVn#Mh*HvLyj~w2hUrN5Zb!P4eEqv`W zCm}QJI1Fs!t*ea{ z$uvydv}k8-k`~X51uaA)m3@12)!%LMKN$q|Lth)F?7b*l{_yWI+>3Ir{v?+qBoCQ@ zog)Gn)oiZX=#pW1(2!^ZNR*(=8#S`mQuh13EIr36vT#!_En=|k#X+?ms@ogdlEC+P zB}$W<^$V`(`&H=4a2+`a#UnlV8;Zxa1$`Tyf8l(_RNA3l9;Gi4*b)JL*zqKh`Dcc# zOOa>KtrP832G*6p>mA_xh$2+c*fpbueQ!%Xe^Ls@-1`}HT^~V!RsaFI2$#?Qzf1WW zNN&Cf1<*M_FLj-tHH+6E(jxQ;%aSf~nrB?H5csux@~5(TOaajRmHpjM)6~1B1G|l5 z11?&jSATZ}fAEln->1_sKwb z@j``#{V({_1Wn9E+>2Q-g$QBqt2cUAJ+^G1y9W#*9nTltG^w8hw5H7*E|6_|%r+f< zYw_Rriddrl-BPGIeWm$DZ#ZGQmN7cRaih#&FO1sSGhy6ZL$+-3x0hP&{d-`~@9Ah3 zGaxJpkc#EZj|UL9-2|U8aUXe9MDI{wO+LPxJ+#A0w79@s8TD#oKN}P?7>!XLxDAQ$ zwM6c2Lwy%j`-Qf{-tE-HHCg&JU8lMXNL6R&;h9H809HT76cU4c--DIE@1WBf@VmSH@3pIf8e8~kKhG8~8hwC1s%Wh52SO}p3?-m}Fo9FT zahduaoZrb zdqb$(C~3HxUf%-?Axa_(W_vFCW&f%`SW&^lz-APCFAjUOOfL$R&34K^5SrZMUDHcE zDPx4vJsCCMrypjA*?htAwI>Ic7*W@g2yBoQ#3wbGum6*t^^ZG~uNSXod`25=IMklT zdjtE+%4zH`VD*8r9Vj3Ccioq<`)f>R#aP;{gq=p*xl77EI|TCI2w?6=9ccOy*3+sL zzUn5qZb1;o@tsP$%g+_}RQ2Q$cS(Ay?yFrDuvhUTI<3!dh<|(08#{$Yk^_WiNq)H3 zxgl67er~!4)P97u5b>3^=lFd4mNhiuAYKZ?igx<`)ImUC6TkNn*j zf7t^BYia%L)r}T7$J;m}U8$DB3|Y4okA&?5UHk7^aPL_AMX9t&>kn1|`+kqRWZjg# zxCuUWEMA*D20z!b+uh@nXQ*9(VPxM?V|reqTguT`JWrFD?5$i{?&xu=lyO<`OPN(c-<|XKcFHVje4%!?fomh1@aw&3C~D5)k6lZz~Y)t z!+f#J`(5T1P6BM7lhskF<@OF|z7d_}D2RARxq_vPoU5^qwzhvF)Nw_c7`#_t1-ozW3;zwif!xn6FjQD%DvZ1kE5`p z(kSH65t3nS02wuuiKfqh#XIFKD?TMav*G?R&VN6eLo@7?ouVs0fa|nZ{P&aKRn6LX zcm3R2!M`*07nazA6Eb)0JJ<&34-aw%J}twfKmTYpv}ATMSW zNe;zWPi!&1!9fSwU78%EvkcTP!|K9gjQE>#uj`d7>hby@`GeyXP|UG!3v=V{m?2+v z3o|Nfa^5dReqJzRTXoDQiSI)$Y#La8l9y_i$Lw$-;Yqa12KB4Y@jd*~SljzlYEW`% zgy)>~zz5o~^jXHe2Rm2{Vr(;+l~)f3#@<~v#RQ4`DAKA7G|L>bohB; 
zCFQb%mt|4S=BH7*R+s-SeOOJipnDU2ZyGjyir2L$f@0~%M$%wWWo|ZdDt5>}->vU) z_7B}XNmd_Z{s-jyAG$NO6VvmmU*BA&*4dRgmrDqR)#T`aDwgb5=0tP0OWB=DY6M{# zk_6SF@OrBSdjB5(ZA1N6XB#$r-F=;qa$nIKjYn1Zhsl-*@2b-J5q!||AD||~$%`iT zb1PWNx(rfQdrYr))Z2un$A7)q=3FR<2lHng9;(^1Ln5BDY4MdP$iulmqA*qDAKZnh4(pr(RPM9`{k4C8g+b-t!2{+4J>~6a6m6+1@=-FZ7z(Drterqv~zI zMHP~B`?L1oi-(Ovv8o-30gu`Fi1*TMGyhvZzvw|5SZSZ;T)ZbN$c0M|CJ2quiNYR| zJKuXMXGA;yWcm8qlLDv3Dcmiz0|{;PAU$p91q_5-rhi@O9~Z8Miw>BDT!=>QE4SKw zbk+xz0;?(kB9`CZd&8MTA-VZx;DNd^m#zZ4^O#OE$HF_JPiA;)S^l=;b8R2AyL(^> zvOax%GB_+`m^B&AM#k3Zu{%EP64>1dfG2W!Jt@9bE&3EG) z8qmZ8h_Bb$Oc`)k-T{jYWn97eBar)k6sM4=V0~!jPw<8mGI#p;R}Asx#Zn8pb(u6k zSK-S$Lz4tPpL@R+PCf{{x_fL%MvvK`F@8A3eEL5N3N369CmH4oPuD-ZgSW`cI>4Mw zgI!~cSyf52iajKFmN9VIsBQewb*`QzNBl)8d|TZ1IAs42yhtRDNa$W&pV+m5!$v5DE{>K+@{)h7}Aay~^ zc+WOO{g@a*`0}5yUK{I$^r6sER3gptSO8r2CYdw>r7u&W$HSQ*EEoIDa_RdH+aj}{ zeOW{gWHQRPKA3~jN{;gL+SBb)P)B=pX;2bdVB)cLa_-wjhu8Z zQ04K+bWM|21K5T`Y;GN_aH|&$vr? z=B%!8cnGD;^F9agByNLJvI^^Uk&kZv6}@K<>Sk0iIFp`Q7TtgLo{K9UNqZ+}Hd)qz z@bjarx?F$Nj(XHbZDxWb!e=Lg`Y&N=cx)?DaB@DuCScP$0l}RcT%};l=abh}?xgQ% zlb6oHm?x{cOWs{{w{(xor@q2D&#BpSjP3_92TEAvL0+Cz~ zm_K0Vq{Xel>kWyqE8O#yz;Azw!?#QKbgM3Kq&SxfGWOwZ#+}cjpTn84%)0TcjT`*j z+&L#XO!@5M73#slUB*v>C`&aENwh!W=5p0+W6{|zK9jG@(6@if;VLBk_z~{s3_M=| z3s32ks&0gaKY$a`R|yeB4j5>-XM@)(k_=rH5X#-4KZt5w2YsWohD8eMBU0d+(8(M; z#W)p!i>qj4_%MZsRP&*)#Oo=)tUX$XOqEI6R15;Z;{AQ6a&KdqT?%Rl!0%P%obNpz zNpq7*+nt{-y>nQ9t08CG)5JS}_L|$HbWqxhgc&7cAMSJrY;gjmSOB}@Q&VSD3r14G zC~hN|?>VH+idh28u^O*Y>q*pemIV#KYw3(=(t~cNY8Rn4kB(fc)4T+K`I}!G?UVRZ zl+{IxWRp>7gS2SPQ5<#1*aEfpasUgr7M{g3{xWjIA5x~z6_ zc&*H=Gt4(U-e~}85pIGjKjm1j>dqT$(*#E2>Sm)KDWp2K_kzc4v~C5*3y3+U(l9L+ zyPseSyJ!GYGAH}w*= zt-2j~y1m_9aNd_#{dyk@8kQ-ZFzfAp)bJ6p<)?Nw!xMZKe8EX}>ip?wwp!xiAghL0 z!Lo?oIhdgxXK1M%U0jJTO8p#oWjN`xwfP{h&kVGE=78ay63Jj|S2^xq1B<8%k;y8q zw<_lkUz?<%r$rxl!hUbLEBH!FiLyAEu`ToK?5W%TnObY_4m2sv!p1@4430TjVbBEl zSc3EFvC$czsiXOdd%gG(XlI2qy}cf-gMfu}a8KgawYJnmT8CNrLYsdrapiAj-T&i(r<#r_o!wh>=sX-4D 
zwa`)jy?6pw8_IH|H}1s=O(ufM_GIP0CxYACwK{Zv@G{ywo0swTb`EmAG;~l~24|?* zwb{wg&hJ~WvrUXHCLi&4;t5TB`#a8Q^-W#v{w^&dL3`%osfIQ!r$@q9wx}RIU@_Tj zrFsXieCwsktTgBwXU*DV%kuC8YE5`djkg019}n!>Tg9OdbMQzGBCV1R2(t#VELSo%$`$Mxjb91 zyj6eQBd_#_?hd;=&hkWs%;mL=aTG}B4^?z(Ul|I2rYc!Sejap$u6Ij6Si$pGF6BEL z1*DafVy4`#NG1p#wXYnCKNyp7~UIh4hJaRJ5^QjmX#KMP1co`gD$!}|T<&$HbHg148kF(Hz^ z-?nB{r{}E1)9XO++m!wYrdkx2?m5d`?H5JUdqvUQ1aB}95#0CFi76nn^1R!wzqU!s zdc1t$ytlU(3|5FpK(|RiGfGNSD|Vj_%%MG4w!Il4X}F9H&WgGu8N4fp*lPf~Trn@= zUS`DLwKl~8qGB`Px?EEfuL>Yn_y<(-VhJC3xu)F(buwix`4`v{h1_pPMDBuWzrjl} zE+ljA!7@^>N(ss{Nkl-)lzDWUYPy&tzOGmkqvYL-H_Rk$53>DUu*5kPgBC-`ce>OB z0Yk;6@hdLxUIC9HI501kwRlZ!UQRK$#y!uqlFY4WnlLU6Z&4~`K?7=)CLrb8_U&lx zVsYGKAT><^IuEw&+($He5W@e74>5nD;k!sh{&=2qRuB2`66p0BR$^9;1P$dwKah&k zRaZpq{{w!*{KcDmJ4SHGahW69#@lFbJxN!#SF&ty_Zvm1kS?RY9n870|JxJ%o-JX+$*PZbL z^kN<^TdY_PznZ71*8>D9-wbNpx0!P3RJcFwaBxS9SL5%D%XgnjztA-kKPl+_#QsyN#S^+#0{dJVX6knsH#^>bkZ`R8w?+{&>d^gU`U^-EhXMc@8beRH79;AqkGStT+DkBz|>0Zvtl&_k-8Hyp=gEXVe!g>uy}Ld=Brrl$#l>F5E=!@I@QqkbE6PQ zJAgsAd!jIwg=jtGS^IL)9UhaHlc<7P>?pj^Ly4k&zQ-PC@(&&Z;aR8&0zDiB&VlsL z%E~I?;N+a)XVLs)n0j~HZ~y9Bl}0Ntbh6{taK0O=U;&KYv<* z{r#&%`MJM!1NK;kYZlvH$I`Wdi>y@J64v|xkPRADH|zf(AAn#-@5(nZ?69pcq7JSt20o|_%_?wc(hh6qF)?uy^wYR~fFSzBytB*C)u2!L`oK=%H z?ttbl=-K}b%}(BTWq(chUYz`k`&rLcXGqc)9Qa&pT-*SdVw|OEeW@Xo1%~Pj0B7^W zH3f$#)=cyjpwExWUhZ>J2Wk=xBtQJ>?f)DkzLa`t`X8E4*gJHR{c(U=P)Aj!MEQ_I z`G`Z!s1G8+lYsZRyM{~GWig`C*g|#a4eM|v{^t(X73N!43O*n;HL`CSW;zxa>+Oz# z%sh&Jp0>-J%-5)iuteReh@s;RHi{SX^VsR`LEf}U_75YA^Km@)?)~aGEzq!X@i?!I zI;uZ)q_Ll#B^lN!IKZ0=iv^x^aDwYO$RjMdnx!f#?;9XV{V%vxI7%ismT!L>mHSkb zW=QwG%Hkk?<{$6(=c9C5-}3LRtA3aoo8EPvBM>gy8+tRtw zNT7Grr$Uhu;!m=;H)Xw)cddJh-yg|RoIQQ0Mo2k{Lmq8j+0~kDrJ8W!n z#-QkM)5KILAO~a7uXz@G8IDCt!2njeeMwC`;FSaA;W@U?8=yP7Ckx1M<5$ zm|}l5F(t;&Z+1<!Tc}_dci7F;(-A8{_=*j{mL&2dSjX$Cp~Y<<>ORkGK1h zHpK7m-a?pud{)SIbw;PgrooaTNy^Zc!i8Qa5&Uv#*lfI`GiH|R>ZIX@Np3E3+E!E> zO#l5c+ip@4ka!g+8fqX-6bl%Jel-F0GSbc@gNbQP7Q=b)E43Bsis@9zcID{hqDilS 
z_QGUm{4*hiaq8W>fCMTc*}M8OzsmZ7yzEh5J(zksZUFiTmhP)|@bH|Rocn;^$QU5N znNy&ry@o7$1tt13l?MnRp2Rl=7(OV$Pl`TK*LCHT7&$e4n25G`RCB?q=&@s5Wwb+C zNV~t`_o2KV`@ClDwJ)0AxNC}YUnM_LYnQu>(x!k8UEBW1HehzF0NG6}sk(JVlvG&k zb}16Hqpn?Z`@nXs>e6eD6s$Pu4iOsl{@&VlHpsmkIvk>C%+IIIGyAH;NpI8cu~%pS zvup?*1pUHWXew0v=3(jU-5(P5j1I^`TI3eh>rMwcEz18oAT&0l#(!j#%33f+@p1!K ze>rDDzrJnJ!<}#wI}KZw<>2S|EOi=!or4L0Au?lKbk}C4f1>Klm1UtnIpG2vdKjdv zGYD9ZTY_Me=T>+H0}aiK6*nHtq|t|U{-FfttE zmF}9+-B)mmrPheAZmtx>lXFyC&&HH2(~_pM7Y#INO7jwB9I9jwva*wiXsf!E?%n-y z>?{(uHBP*$AlY<-1ElUBsw2B^_%zc(f8w{(GM0>xv{Ck325m!s6phd{3;Z53|0+>s za&j;K)TbChVabXP^v?UThE#R2AxMrerijay$<-cYruZSNj%JtKjchx78y0Jey}MwCsb??F#tf@0Z+=GIHv7R*d+m;W(0Lju}y{Vx|DlY)shFlb1BLkbeo|EQ_P$E#qQ;)i(!Be^lMLrC5e}3=xf$+XlZaz zQb!wMaL|0V2n7|4aB~&-q+e36=k|#3$+=a%U{5AjEqftGJSbo9u0D0;RQ}E(xB;l8 z`yOmRk#1rXR+czxLJ$<5=$t$Q>i~sIkWx+xpKJSF)fz+nf~39L46>WB)OiTeGqNy9 z=I+gVO01wmuS7BW?4Jq6Kd>%z(<;)yMQb0=pKIkQi|t65!(Ti8UtXYD9D+kKHAyD_ zarTS;_yCvx0D~SNc94(@Nf|Prn{7A59umu^PYyWwxn0UMQ0tP-+X4ihEa5WtwMKM8 zr>C}L>GAVVQn^sfeV_Rp(nXRUhb(CMh7`|K!_#mwPGA1aG?aHM+`Y}uOLAGK6D#cPYTrN6t0t0X! z+8Y3ms~%9OS~}|hx{R_92L(b7GWW_b)6XferM@4&xAQ7`!_ch2M_-Y-L7uz9@oK=^ z1P|j*Ha!|{*X!7Qwk}Jsqtr9V8Nw)U0o)pRbq2^nWYW5uE``n zd`N!q7VUSDNimMv$XfD#1(WhGW6c)y2AUKt&x9>`Lpc5KWa)!D{7kW;`bCie4*|@S z=%2US##Xw^p>dCO7&Jf2K4=oR)OWJdNfe>BrQqX!&8b|4HCzo&=HCy@m712zwQkH8 z-}y-ugbM?ZIZY|tfMm>aX&x*D^U|!ohWj;qwtjx0*zq2JQS4pQx>f^=b3mY!Fl!m! 
zF2m4wpD9xD35n)|E@oM>R*f16#Z&RiCrfuw5SMsg1|--s+!n;>GXNFbR{8FMqM9Fo zv!r8LHdGG^G$nrq#>4Tc3~x?iP@ZNDlm+|;m~a1tFbpvR&odOB^3%*wxCD2Va0xdfW(Q5d9)DS6DH*(I zKooi2Yqf4}N)|YfS`vEU`dQLz7SXGs3a_TcXCH z# zvqUp#Ol9W+2VS={9Ny4kDmk<8TBui->Ll9D$(VV15p4Ks?GAtx4BIrcm=I}6un;7+ zL+J&}*xyB1%;qA-JaC@}D?ze@{{EYgU6RwRPcq6T_+05s{mDZOs4D9hjxRA+!+f4)ojTlO1iuJ$eY#V9+y~dtbm@mZ-(c zmoIxky=qHWSC_yqf#I6OPg@HMi#IVlO(4}(u11*|q@|`3p)Q<1#7O*Hb?GgS;OU$K z$OxZhL1OlRDv(6OHPu-Gz9cn%q&Bh%=5P<;+C^1-%eLskg8yaWbB&zFOH?ci;0%>% zFsZnhk1R$H9k(9|!UTXw#ryoxhD5KwG=Q$0viFF59BnZ;6FxsM<2Qk^{{x|x@!Q!} z0}EvIBJunj11Tn2d#oA!4Z{ufHH|m{eKJ)}Bn@_{*AL&KsK;^KYm(|GFU1dS*Ya}s z(heJf-_O-`Ep#-2ptLeNWMd}7TkbMVQtEiS;V0qm3mwa~W?lfU-Dz_5)HwL ze=_!8o0wbESCi3J{RydhD_OQ0X31n=Q8FE@s`zf@CHD@h^;a6m1wXyr_;rTclO=c6 zl~?`rtZ+i`-<9N)JjnKz{Ap7LEk>rUM|TKH~%Yu5AXUc5hk}u{qZnGl_^?#$4w* zpkHuS_yVXCN{IjDmkc&{5Z0HaK`g0(z~@CFk({WK&!b_2cnOB}7s5`IXAcZSf!M~v zi5JJAL6N%UsrygvPKZQ$=R?_=Po;F{?zhpd=Vz`7TEDB7APBp^W@CeE1`w*RzaJrb}t+B$=oYpK)YcL%%v^u?M`a1 z(Wy^*t97wuo!?$sip~8cUAYu=m`|JZn@*jy4r5`i`ST>_Yy@Og1^y)zgo%=9Gld@2 zWc)~0qS8wJ2*GglUb6xLbs}a7%7(W`XL%HUs^l}^@ne-C2&aCw+W|p9jre->4H{V zdkZ26X3JL&5KO{B)Fv2LC&~}KirK(4`6ByYWL1Du%dnO<8(uldK#h)!N1wQA&$%UX zW#qel56ndbepqCX)InZ+rCOYz3Zw11eG%{edT@&HGh{bIy=;k$YLs^An{?tnUuRn_LJuA$~^nJJd=Z*dDZ}>ax@OieV?hQR*|vb;s-14}VmD(RTK{ z07&;)e48|yIRC_&Dw+ed;!19+cJ%r6IL4YafNYCO)XB4%dKWmX6Ieu@f~Dt${&C#} z5OOTZ1S6kUnA+3!U+E5`5L+TH zm6;kCz*WQm<>4r5BM=NGj#33+c^g3Ot@MAgb+JRa&i7ecGr~DD^X=|0gdO1)?MMG5 z1n81N@`HjIxw`y907JQ}ZzO|3;cDCS;ydtu zbPsuCZUMofKhX+z43^PTu1`UuwQVLQsYV!_1YuH2kB%yr-BkJd)v0gd%QjWA74uUW z+$xRxRQ4Ph3M)!&dr3JpR^AX+j=XTI<1u&A>uxrxtNs9UN3AS9^KDUspoBC;5LTvl zv+n$iD-MXya~ZRmVcm}t728tGQRInr%3e-QJ4A%55mG@W#|7z{l?r`ZY9%a=NX zf%S+Pp88x~yyP|~?SGxw(;9KUcnz{jy~dZSEuXvdvM$-=2|unXz6K}!WEM$mTbdyA zC2)wvorw*`^wD*iZwpI+ihtN|P zY3LcS6rwlqUFPdC^gRne$*Q%n9SOe3YwUEstvP-Nyh*x+X_HM?tdvHFL93F=#YB^i zoB_~GD4{bEi+TpPth_N&IqEd7Xf-**o<+dA2QP|`MLLquiwy17C-sKa0SJGB?kA{W zW)sapD6qF~2S7K|cw>mORG(UNl7+^oqymwpau8m})c1hik5KNb+dvxZb<9qO9_6@| zLCrxTVB2Z`Y 
z{E)WqEGVt0q;wHsrbW!%ImWiM7OCNdR^HrVCbZ}3^yFeV#P@e%U^~j*Q4pq+yG+Bfl zc^Bj+QeIoEO2-3*3HRyF)Pw}rPg^GjTcgFZ$iJgoQy1~EK-Tc(KoW_L+IApvQp}54 z1Dt_@KH!jd%QCKb$HiN6QBVRs%rpwgWVMXhas?Yu`X^omRU;vW(jM3k7&r){SMg3s zipuIhAj#HeSB_k5=O<8mi*YG$&7719+Qd*}G#nW}A}QPB@Ug9@#;nzf**d56s&wf@2CDk6= zttfQkdXGJv)^GE8_qm34)0ojjnFoO%e|&S68cwXyyw4k0#@E!a?^hvks}>AOR$pD})fD4OHsLLeBgCsx|7wv!(C=8o z^Bn7Xw8ooR!Mm#u_*!ybqiUn-QmoP%Qpa`8`U!Xh`Znx&B-V7FqHCHj zm6#g$IsM%H-Cqyw{_XJrEr=2n+eRg848wS-?)550*2%@eGk&!r#UJCO{lx|NE1rV< zZ3G$s-#yj3Q8W~Q%XuY;>eH2$+V&zwGhw?pw?>Mmy)#0*_3{*uwMVOmWAIO-01174 z<>!jsUEdk@hV#*L8)zb(076gm&s_S3-FS%gDh15x2Gw^^I9Pckl8of^igz%^#yw;OqGN3}p^vF((Li9Xt^w_qx$eFqD zU=Ut9<2pUDP~W&}m#xD3>APG4Uf|02OVNx0&K9p-(|~@~(~`Av>ytc{9`8QE%}njJ z2dij910v8A*ZpPRg@6{^A4lb#x}h#yU*68Q5#_iQmo*{CG+KGah_O#E7!Es9}O=LoaKJe>^o zJiJR1cQw3AtURYCHx_VOR-unul%qb&iW>8^~+-g77ECgBD)1yz_u=u5Gyg^Bn#37 z1a(ME@*FMWae{O=4DzdbTo`Zu@&Ztc| z4@97_S-35Y-jYyFJ&_z@#WnVkNZs|cE5Wz9-T#x`Rov-s$ zuiqZ7k+W3cNjaiKy*=jkFp2J78kIj&j#8DRwV3$ww6=F+j&PH63vPqS0&|K$i~+9 zBfToW9UE*C3JXtZ@;SL~gQG|KF0t3Gy3JNP@f*4XE19h)S137Nq1eaT@(T`@q%!rp z>n|XJ3B$gINnR5vBSXGygT;H(SC`Nqij|S;Lds)t<+`dp1;Oq&=9w2ykoq}a<%u+4 zCE{Nzkwv1#E7d6d6U5SR^|{X`Z1m@e=jg=kk^Q2rt;b6g76OG34P@{O+TDEjFJF$H z)**vJo9ZaBYm=6gqs>rUO2MQ`xRA&XbA6#mUi1 zBSEdnGR`Z`U3Jcqjf|W-ycAXj6Djaagn8YRiQ?klpzPY7qc-uL;9h(v8T|Nu(cspB zcOdLs)*>us=R8LF%hk!4-p$QTQS`K!_o}3|r{|f@*SlvzJWB;Yl+anra}b4hGn%2{iI_9z?evP)JkXL1u0BN35#T6I2p=3A z&nzWB5R!=^uCx!IJM<4LdptfTutcmBHJ@p{J2|s;aB^92srt&?`4i~&GwWTQ)B;4n z;m271M%FBkMLU6;;n9u7uabA0zzf`KJE^+1cd1QEZZDqIlKi;Xs-3ypb#QV96I?=$ zy8})3ptkRi>ddQYQo)ce4bkiQrOCTAhSnq)n16lx4nmVJ^4DOQ`cGcpGef|N^u$sS z{OdJ=9Fqj@9gS*U#fs|nb5N^=0@Rw3kXA#|++8+&6WM}q=5|Yh1*O?*0qj3ejj0~&fbQ{!@4v+;(aW>%Z>>pw0`^#`b@9@px(6vv1XUb<;i zh}iuaAm7hiyB2=(#qhlG=9j_J$BFwBqrn+3E(P|Y7WTDkKh&*aa0_rulWx_4Iq$08 zGGPL{$gUfXT)xI&C9y;$BI$Bec3^u?G z{w_}6)zn3Oqfc3xJf*6k;Vj(8F^nm-&IJf4ZR#Pl=eNNLqA5%LyX}V~LyE#1g=orl z)%qEzlg{in`(C=3ypI~rd-hr}Rpe-E{KTF&l?;MxqqUH&i 
zMCJs}u}s!a*2|R1875|CW{9*hU&=SlK8S{S@$oFCNCw}F)|*3NE_SEfRq%dcZvL7V zZ_)Caki(18x;nWxMG)_VX`xDS7 zG|kCDc24G1q1a}KAuEp{ujh{Cdz|-(Y7R9c-)$$Bz}67>M6^_Q6zQ z&PwV!W#PV2q-wg$x%bauVHE1o7-4P&uEn46AmWCv(5{NUcNAV5y4&tI&sF--CJgic zVm%mGDY&LM4Wh&vGZF*uLhiQdSIZ{KUhi4Bq0}Lh#dgGICDbXz(MICii*aSxUa{|U zxOD)Dmbv+o8qOnJ-6Q&>9Og%>Ydcd^^=i8C=s#Ku-Y*8-@|id_`mThRi_z^T)hR-bt77n#UcNM99Dks%Q1X+e zj@kr?&(zNlXhGj~Ax#6-*nkZqU5gKMqCJl71Uz1Ma)r*h#Qk`gv`kx@*kCL`-x#Bv znI$|9w9+4}Rt2RiA)URP8+TKh78a9i&lm4FOQw-^*+0iZAbdKhmQKTEqD0J-_kl$T zn5;9U^4sX>=(v9lA-ip3F?l0mu7taD(=0MtB~^@`NU5r?ddHLB1i>>e!@|Wt^qOm} zeDq^iaY@O=)5L7=hwCsvPeI6R&VKRao4utlv^6n4{+=Kun&6Tp*!ezxeaZ)mTMrkE z;dZzx_}MGHMulhaga4vT3xbmYQhfl%(x_LeV0WAkSk}b%K3SBZc@t zK2hhIYhSbxUgA_{ARLA38}R_&G3+wOt(<=8Kul*vV{ z+}$V5hneFd<_`D6%9coNIj0VK38foK@R&9kT8n$*1tJVOsO!d3+XTHmqi%97ms587 z{beJ-SP*gIHp9_7rNyroNL}&8Ms2BmUoV^Hb1%(1@OoqRUayypRN+mc@eWpep87Oi zQN6DlRpOdx_@;@UnlxN8d*3g+|M_Cp_5B8YXj$O9uYCpnigJr)lI2M$ay>bUNp7h}(mgc9o0+)C1XpM*~!G@>cq_)&~n$YS!fS>R$5C345GAmSMR z{X~iS-YA)zFwH~MuG9c)&Uj8#n74Ct(6x9tLD#pypQmIjt~hxGPjjKnB4sVizY4;o z5zi)#^m|_vsTR~SQB7@uD9Csk)Rve-X&6khLW_!drqrG2upqA;tcq#ZfA9QwM5%M& zIqEFu5{3ga$uUeh|CxR?3q-1~HWku2kOF zO(L{pWsf$$+Z_%U^Dv0G{Je@5OivVzgq>do`@V)L!ji^hiFC_=J-|g_|5(dB#W8Nu zwn>za{_J+A=5t=FGf;;x1umuBG2Ps!q(Cb%d$y*cVs{|3xbDTS$8%1Wg&P3-Jm2GO z6q&tGvjkV5Cct^Xo^=X+SAzTuP}em8h(bR-_D;LmeGpLIi%3=`KO^Hgmu6}|cwIbn zx0tEP#63;9t82UsN%)or#mV#tMF8>S|LLI;b#n_}`vX0&`{K7MD%llCC=-3Oc3kCc z(W=?42Y#?~BjafXrbpgiPs|>$&=BAd%MI;uFP)OA^xO^2IJhOVgDT9MAxE`E{T?2k zU4QC4T(QTkah1<8Zz}#VO=F@1ZzfUJOP7MLYuV9(R8c690Gl9|y!KOMt<_MV-| zc-C!<%$cv(n63ca70(NdE1Tf|(Dl`EQEuJ$il`thf;7_74Ba3S(h|}=bO=ZZ5+W%Q zDh&fjH$w^14N^n5v@kT%4Zp*^?|WbG_ud~L|9Hf~d6?&%z1LcM?S1(7V^xHWS@gcv zFfKOl{(e*cx#r+Qkztz>)^iNre z|8%1Ut4-m$dJIc@0O_Ia&GJ0HRlW^kVY2iu#Mxeqp3&UX+_nfaJyB@p;+x@RC-?Jo zxYt@qmHu;EOcZ&AFD-q~)>DFb(LJL|?BDM^+9#j|w|kFbO;iaTq$+Z6vJRhSqBkfd zi(tHPca3yueZ6-Iw_<~2j}15BN%bhC9TA?kf?<`A+)MO^%xuQ`{mzm^@Wz={_J)k* zOGh%Fw-DEGkxKw&b~+;&%EB#}cb2%oQ7l`nL{8O%D$*i|6Ryv+?upnLAky4Gp5yao 
z1hPSe!xw$bLcN%8R;a~RHyJrRlXJ(|v9BQo!LEhg(ZR22PDNENyM+>Y9vewpWeXGW zl%nNF-i@S9P3C%4!yoRG1Ub}u*Wv_TPg>JT)UlIK3}vTZ)&o`%@7S+d*UKiQTc9C) z97#yk8b35cx$gH{v|(ZQ-7|36q>tPvUdLAxjOFx*rz0;bXu;4u4RaR|K^lUl5DIAG zY@-IOtyXZAkut%~wk*_z0-8Jo2v2lX^YU6!k#`({+z3;AjJR;pK6lb+l;L_;DoNk6 zl4o1}NSXL*c}xi>`wG*9#mp~-^W=+NZ~sTTjQ8J$^GSWvA_3@z4sDJ!n2`b+WGJ4R z_Ey$D#KJH0$v*;)xH!Yd_mY{3#(h6#@s`Pwx_LLW_ttmzw>JjcYb6{=3nP>7lga-gc(T~!u=h?I zCxs03BKTs9F&FxkJhDvh_eXl8MDZ4rSlnvf2<_4CPu-%T}U$0A}#Jx6E^J zDc$iR7j#~E*ujdiK@Sg-&szkf~3qrxHT-!dvX*y9yepBJ4Va|WDmM8ocwhCBXIm^t`FZj} z&ac0}zqH8jjRD=Dej|@K@%v1B(&N97e8vfgl>R)0+1&+5Kw+EXhV#TCtl_cjXXbq# zkGh$s>5r>@=Z;QkWetoPX4VfVaHR&2j~}Uam4Vps@Tu{EN8Oo>%D%hF`5DkzG4->p zei(6l6PJ?}VZH+8Y0~s9$J5npALQiZq}csEdwhu>b=}qA9gp~GI@&ddoK|(k!WRp; z3`bM@E>DAT7i6kic{WQxfRa*`%gdh+^2d!xKD^YT{0%A?xuctsA7q#Ytz#C*CCVw? zV?nh-H!VC(Va{1f{Z^WkMM)yfwReyg2^&1h>p}B_$%yT(nTq-)x}FCnuR=<-uXGqM zb+V1uzORV)`>Jfr-X@`MlSBq@3g>dPu7I@U(^Qs)z6O@1L1xoYAO^S(2=pDl z@)uym&3|S7^0+KOSHpF|$ywKIT&)>aViXOga_AjKVF^D6cbJZcwow$Dc#zPZHaB5; z4zW4CUO}SJ4Gtj`s+0+U;FpCL_`QXo8&-$bu)Pl;aoCLMkB*IP$7Ze0;_c1Q`5!9} zdmL5ud2P8Pg=SCk^^Z6&Z&RCPIbFuiJH!!Zk=>f$w*eYSw*ZozmPkZc_H#sd*&%f= z5My-O$K72OB!Yz|6t=y#-&`5GRdwrMK5Y64QVb+i1q34)hTJMLZgAK-zL zxQ#1uF8{c)$>yk)vrT~=q@1)oqec4r zGO9v1l@`e&KJdKFav{y3dzAayvO1c|Fg}2_Q_1_}eh~z&HNpEX+9RsIaG9*^C%0x_ z=|PD{615W2P68*tWzDFEdEGbWlq@q6&)w=I5&7ida>h&Zth?3Z+v;g@EoU44Nrlw}9wN~uD#QB4-GyLQS z{7C!B>1nSa1OhSZ8ph-7N!G#MT`%^iJ*8~y$p-t=G3|ZF`mK=J^JTv6!)UnYlF$0i zSx+=<4@ockDq6H+Y+ZyiggVb?Bi|pl!Nh6x=qv&WYe-oDC$2UUO<|c;)z@D>wMw|e zf?QgI12&(IS&{Go2B#+%@8VG25Waa)8B@*oWD5I^-*&X>%rnd_+u`-2GqEm#EBjU* zKG5?k_V~-I=kT$!rpF@mnHyS{YqoHp|-e>kzWR)09;| z7LApc1++q|(YqOgz83+~`F3J^(ts2#4&6{;A-SeUo07OVfF7mc0Wq(c;Xbw2q^?9G z>=%`hz<;I1-?AmzBZr?My*;Kb+}m1wuR5w!DPARcNu$<>J5fpCGSWupBiHXsWY`!D z(MMhKj`3$p6y0tK40ob8(S1}FW;bF<$G;4JjrA%W<5Qq z<7OX8;M=zm;*r7Gy#mP>67C6E@g)hTUA>-PFi}28f`}(2h}Fu;=8HRxAvpFe>ca^L z-sHUoiUo&CRQ^L&aJw<@1|J7F>1<)^H!-kCc6vdMVh$~N^R>{h{tBtu<+i!CW#F~@ 
zv(pC$#)c`2f-&G*v7ohwsnI5$})OwDPuHz4Hh=3g4HYuF)$93DjzZh?k><$M< zlG3~}NCL;D;0Lu(S(R>-cF?v{0G?L_cpe94)?_vC-+8;;KO48%_o%yDP*^w(=5fn2 zUP;NF3?wBbUDZEJx|98hZxr>fXXpEZ=A+ASq_&Jm1ck=R8~=d92XoDWh)>Ph;e!XU zxy>=cDdwAo-DaEQ$l*I>oBWy^y3C8(>w+@S4lCo>?xZI*Y!ZgGkBb zY@DE$$kTQ3bp#0Sc5sO@$R!Q%YnX$GNLwMH?n?&;wM@Z*di^fU}r9=0Fj|;{B0e;yN|}+S}99)9h+z8CD)+ zV6DL2-gAXRcY|)h_G0N|Us(Z8#RE9i9|SJ>z(*2o=acOAG$^e2I$ho|nQjntm}~sw zAWNU5{bVhi-?Nw?D#B}tr-R=jqCi7ANi6QH`czwlhBBw#w<1hm`0ruDC>waZQ2uE9JLAT$698c?=KKbpC|G>M=zx2n zQ*SZVTj(yvKBIqqLkkGb{~%u|=w$BGqx9X(4wLP*uYAR!2G5WJs&O`ujd*E}O4ICy zT~j- zo`v?!cm9ZfGqacMCbU^T^m4D@FcKX4G-;O2AD+@pbmn4$04MfvdD$GCT~sNETpq;x#}8`0CujnK{N`!lwtgzh-cQ= z4J!cwi@*wMDd0*GJp@OdrQn@=yYQ1;CS05vaU4S!mnhX(_>mEcpZz&61X_Sd*eA;v zw&i{LMfSqYfrCg`TJGebr1u|KpXfk=w0aIg)vD8hZFhHpzuqr|?oXw-s!ARPv;zRR zaH)|;;N2Kpv~6^tUVU~Ifb-9BV>W<){RD9hL~dQ0e5Q?hZ?A(^%=R}H)DzYip?7{3 zMErqw9pUuyb`Ldg*;e*YS5f~&f>57T*a~IfNJ8C7XkT!#$j8>c?L4bmNt&u6k~`m zwqFkfgALHWc@v?aJn9#J)QY)~D4J{dtNo@g7x$!VAWwy-!GE0S6E{=k4mVwFSU=5Xm zBiz8Zaiq0^z$#*JLd?9s0Us&Krmc2~9_(W(I=RB%yF}-v+Wq%cDkA^(Q0i+~dRRGQ1&bT#&c$lx{-6GGMtD`{3G3z<`|sR8!x&w4ipy8I~WAwSZF&QN|Bl+leuT^CBk10w1HM ze^dekvIbZ9H8ltKod=yT^e3g859Jcf7Ap4Om_dXadHm$l;Tx|lqefZKC!1mJxJ>Le zK8nG%TN#>mk$tn(^C2m|+VJj{;F3q;xM&dn=W}U~C&z)jjiQdLg^HrT4GH!5;9wox zB=hImsXANaN{bdcFTh=Jn{-`2@W|f0aNB87k1e#`%2mtWa=*q%8KWVK#}*)qmCjDB z|2jMZMF=tIhPp21mI~U#tsUf{Ac1Hj1w-a} z0HB5OlkXY;tt5kPDCeTX_JSB>cL(O$vbZ?bA078mq?>=7D6`Q5(pGH;INqy%lwgUsxp)bnB zcaml&EB1Ct_d@q*F@wxdFs19vYEOdQ39sqyZL^J=K<4SWzHVF%>sQ2~uS0j}xc|5H z`e(VSKl<&EqVjsbIZ3Gkjjle`t0-(cm<6J-M;c5nvfYX1q^G%jMnBXtVaR&Xp#t-i zh_UHm9vc%xC3K%?Z#IE#ALtl4-C|xId0nlZte*V>x{v1kvfljMdaS=T%4mF)s(v>)$Cpuh&( zv!IpTt!NBQ2ZC=^)b2^^@in}~(aWX)5rMKvK=Z0d6@!dGf?xkeO;N++#YKKZ_0+{@ z@;;!=TsbYYzVBOw18SeL47(D=JHtVv5;Y5>3-eIzip30S(NsO*b9Oh4zsV*0neX+t z9iS11L0lbgjHLh`sM7eYoeC_#pv)*rsyq4&{`@|}2i?fe+_YI%r;Xqa``omJ!WVm3 zPEZ%lCdTMwC0si8*nIvY43St>FTol_Wj{Z3es4DjJ=*G!|&BmXAj8nvTtgnIZFV~<1>Xm{vG=t)7`IWAu>VLS z9Gz_FZ*TS|gfYTIktrG%uE-W7rxPl-Y7U=8%W|Y{>G5 
z;MI#Ua;Q}oEmB7RDwfqjK&k0npgC>X1Ng#b9=_oLLmD+*VWQQE4)cf)oe-*q$~Xbf z>TwIRHB;kD11bWZ-V1=JN$$41$V3SCg0t*jep`@m9wA*MZd1q2L36&Svf;(iKXC9N z3=j`4k&Pe#WKBSz^$A|^;`nJVi`#>8c;^f_b&cI5w63i(FKQPU?tS92RX}_|JrVT+^{JLh8LQ%I|JPT%h?A#N)hXar*Yx{vX8JouR$oC0vf@bj8%RN?nh&IKre-yQhRGwQFv*-+cn6%~iNZ}f>$?!Nvs>K-m=G4x#o_ zu9`DMr=ox$7C`1DJ+PH!>;pQd_E_T_9K2&6(HjcL{Ix9L_mJuCso*}O{pE+W6$7Bm z&8}20XS8=Xp1Zb>59J&vB?%Iw2)n=H&#lXILGUkz+6%d?4PV)6xn+v7)A!=6|6W!v zAoTr_({W6%0^wHX)?&yn=07z#k?v@Q@}?HGK770ujd@UktWcZkR=u zZy2w~YV*g>A4;*^qu#bT`57b%^u;z;!E3WP?vIiVG&~eZ3U^Al*~>`Ya?$mak z@NoWoZz&oB3bSbVX_czdF*3Tt0pD4D{H9{ydD0gOixT#w;qtJ+5sn7m&oX5C2koz= zAq1$-DEhrh93gX;Ecbki+}s|Nb*23b;nzqJ55^Acn|(j)SAah;0$f@s8kc$SGlL;L z#7QF1jBn4zfuowI-vF-Ojb)?vy0T_^Qv`JBqz{Fx5QHto?!Q?{t|%E6ZEB9cw_FS=&PaO7U)+8yiGf?LY|xN3I*kCU*4Q zGD<&xNGuTLL?O?Z`zEPvWmB?>519{Cu5AI3k{ixUKpmD(ma!p`Kk|xQ_H&Og>G_h7 zA>$Gjz;nh;7}2-9)}!sOuCsb|k-+<%39sO^$LqLqUSW30wx{R8i~L+`UilV10TRKp zNs{4OxBiA;2$RGIR3hxcCmmltfF>eZ;|=Sy*W68wU)bGz>c(w!cf|)QKzbeV&P8~c zGRj!Fw6A`30%(a+4J*O+Y}Y9c!`1d4NOzLgaf>L3N8*KrJx?>Fxl#RZ$@M>;HPA41 zT^FSn5VBt5?+VVuw9K6EzW)wI5MihPW=8<(rk3C1;Q<5EQc)vMVz^Tmjtuc0i4bDR zit))u&Sktnk?yzI=2mA9nUM{Z@q5&Q6cres;%^Sw^K&yqbbV@3-nhSF3`7~CA7D(MNr99( z2goU)3_lJsZy7AWx#s}y6JL8aq%LQRBN?vL;a}4@%T&;1M8CHnC z%fmmDR>QMck`floYaMaJ4*EMV3qjt9Lm`An|l5lyfh^VjC^9}F%z~z1( zICinoy451dMnHJemyWKVc3*`tHGk+c#ePTtR{Bf``HHAAgCs_a#kMnBrx6D~ZRUCR zASz!M{Wef-aN5W7gF6eQ>j0Pmd(9%-i0~X#&<*+MxFt%4pNqsU3Sv zQr}Rgh~{|2X7d6H%wB5kKc zVtB?__D+t0E7JOb)b<1*WbGM;je>tm2*oR?`{5o^1zAeVz!iFb(D_LjTNY0P1!dk< z{B8c4b)PN{OF}Ikj1Q-K)XVtMI?+Av0eAB5z?(h4-d19VqxQS?pQ!EysJLu$~XRO9F^Eoo(-k^zjErDrw0P0Tr4-6G><0fNF^h zyYWT5*>Cf^7`cY;lKR8iIUg1Kpv_s7oFNM|>C}i0G?r*T$>eC0NY|}SM;DaM01`BV z5b#mx(4PTD18K63*i?7>(c0076p zRDu3!%m%Eamz{j;wr<4G4R`%2VEoUnm}9E08S!)CGLPDdZ9`+HCh73UNs8A{+0L88vvUE7Q*IC(9JsFE z;7k0CdS9ZQK!0`Ikr-*dIy?a=$=@Q;pHWF6r15sF&*P(-Dv6IXkfK404c|O2U1L~& z`egOtf>Y_*hT2J&(7I#QwfV|5;jI~}5%C%3%`URIt6n=xB*|W|RTq(FofVpCLeE&g zY@aZWp+Ql}AjgjhQEAm;HDZ5tG`pYL>k;Mi976DCS)_b=C(gjhvukQJaPjO*H~WT+ 
zN_k0LT^Q0{%pmG^C$V(hZ2>fY2(Rz+^qCoZy(f1`6J7UEz!pboABW2_QuddQ*unO? zKht(ruOxM~14FrE`}osx_ldY5U{`XZG6n|SKfDdI6=Pv*^Hhtu5Hr57$ZKHn@w%*i zEA8>A-uH1fD1a+-pm2q(>51`-OkDea47 zP`*q*5Cmn8Zr4~z#0fii$H0$zptep3-RPx)c0n4!e0uq#e1 zrvZ^;JuShbpus46&`L3NY%@di5s#tUy#Nnq(XL;2L0yuaF~(2_I6YZ62dEqXdL_Ot zDWLZ2do!Xg{6u>Tx&1F`g`N_}m|%B<~1gCO2KtFMNbXm34X1 zDtAlfmD)A_9_|mXbudo=!hUenE(_z*TPN>_?c>ko1VG+)UK`F8Ltv_$-T|Mf5R_%T z+SRaq{BW${@<8lza;*NaLGTV9*s@%qB^It68=D;MGf6Ii_!6EOi{32j%b-~Pfe1R= zAAFNvH)v>*dEr6f;T*Gyd5Le@`R}_AL z{O(FHB?IcxOO!9P;D|7cq3z5}29u5YmJF88_~aP-KoLSch>!E4$*7nrv(|IlmU6fe znGb9n=v1Xb#kygjxM{x#yMl@Q1#;f&z(K7%%qe+khuvlUp<&CYa`uR~^`-|*APoM~S;{#uaU5^m3ZnHVxjty)p zwKxIs(SGKv{_An2C={pyVl=X!`$9dC06zJV2ojo;vy>{iYRj~q3QJFS^Yx8km69~L z-MQ|RfW!9d|6ZjK{sVWyMm%J-o#Qe?cY?)Abs*7JhaD|zH3l_{S{Ks>J3*@aj`!$w z+8>WGEEVZKK$J7{#CXIcm1d<7-=;>tP_W6#^5U~!H zA>_-|WdO;+&sf86@nXz-XR2)&bXN|<(w8I++e?pa`5TKi+5 zs2}x1J#fcmZiF)aEo=X&CjpnV4BOcpr1N+HmnvVwvYg18N=|b;L)aS}{H}_|8hduU?n5Z{Cb8|-JCQ>yU+h4D6P-ojQbX4-x-fyqWm{iY_pF(<^2aWmbw?W5q{Spv%9;I-n{%pw3J@fzvj2>6 zDL%BMK5DmH<-v3m@R0y$U0hbCM!dSl3flBvM}T)1W5TkIpJ(IJsh37`+{tn5c{Ou- zdg?_4T{pZs9lJVYHaxjEL0&8`3`)IC0#_A|m>#Fzneo3SupoLME;@du)wbw-PkeaI z+mFF1Yu_yUXiq)|*4dvd^EI5jm33HeCW8Jh)hM6@FtW41ea&5_B^yuDPF=}i!edB0 z5u!;ZqkfX7jNm-L@1-&;T1JyudRg>EU8|MMFEKar#HDE=srF+=s%8K&UjDI-0S~>2 zH*3bKMd~4+(sw47JX5BcJ-zm_;Mwy*%pi4xKuBNCzP7;D+TNE-qZL#CegNEA(rNlS z3GTIQr?|668<2Qh6}*`?;@6{2H`wb)`q@!-^SnN`Pjaf=VtWnWa%f{NMe9Q7Ehf14 z1oXMs0Ku7$*lDH}oc^8Ro)EHsiMCl!=H7MleP9Z#XzFpPj;@Z>cOQH}R%TXI_xf~t zpNSA@aEj~zdebVxDa#oCam2R8fbsYfJdr@@gNPkf$%FI#lZ0Z+&XVw3a9qK}+fNsN zkW&phV#l=D8Gq9W;9S;z2kq=+Y?WP#o+X8vKIx<((mV?7c3uo*P6=m<9(tWqvRn_0 z(|g)P2Zx&k+QF@0g9Jmxaq%DFp2lN{*rF7dprUPXp=6f)(#0m@FFyMNS=R@_(Q#L< z6>l#}n2XZuXWg&jWp|IpMn?@?f$#_qtv&$7FS_<|E?~wWFK+iC>L4O177hHJGQCDK zT^8<~7&y0M^b2)=d9&Nvu@`N$-x#skYTgUKS|faG3zXbEp49eQlczdjYI87bJM-}w zY&&T|e@LG$+Ta*^|Cl>suy0MtyXd(lMP=d3s#D_W;v!)S5g#TveNvBRv06=bI>{a^ zJgik9V9AV>y!%bs(pOGtk&C^XSt930;M88gxNk_ZTno&D;ClUaExDxZ9lqBx--xjP z1!AjFJ2aUkCVn!x{$yR0xA 
zRia3mD(5=dt?KjZUf7cAG?SH}_Y(Q_TQg@LZG-AB2txt zqI;b8Rfj<4EcY6F5v~PRrBDXXim9pJHx(ho3u9-#iV<(#h>~7bjCj(=dsyUCap{@^ zg4xF7T*hOdM;Q`HRw}AnwwDDiNHfoIm_x_EWsglcfQ+{9B`ghbt4&(5rBeNq@y3S`f>``8k%fL~m71no??J)vh(`Pd$QQwqSfJgUW3on_IeHJ=~xMRAft>P`hvWnu)3%Rk%qCtF|NpidS5(dDl#>Gvv7R z81heuq+121#auX)PV1GznazgF3cs}Pp|s4HiiI6@SSu#qvvfXVX)OElUoXHS_9J<4 z_hL8C)1C)S4LtcM$ybN12YK*s5lV5cB-ZPB^f>qa;~IS3!u|)D`D4HsHk$j|g4r-rKjaX!`^u5Q`dcp5+ ztcT!=>tkOB132mf1KPewO8$JAAr0Oc$=5_NFDFE!+eSYg!bptISdUnL&-(Kw z!vDtm-iy;KWhuN2pGs9s@!4!c_MBTVSr~d87`Gprvt_#AQum@HXBqd}_b_9(5NNev zy(w>ef#?%_z)P~H(A1-W<}_b85GT_Ee_|Qf;1PG9`m3?&lA01R_SZd*BFBOEj_Xjk z&t!X45ns%0UoZJb8Tk}QPiEQs!aiq*F+J=%*g6svw4ok*Ld+XKKre-UAX{KJXg9lo zf+6<>Kw6;tQn!yQYg5g{WUcyv=BrEN;r>Pb`WT37vS^CaB^d0??2o$*@HdHzT@P?K z{?(g{M)T3F|HHAlu5ptNcCJo?yE9?sO}*Td;ZjT$A{D;I+^{aO326=+Z=S9pceWuv z1sbi*{w|BW5z7`TyB-^M+f_bl9YNhBxY_I7<)wkn;Xqb*#-t*OoU5i*Xw+qo`;u^B zbDV~8w=`5=J5DX%S;sVw_-R<8O_EKj&}Zy=58}OLn>FyYeInke*!!f2l>&dfB|XQUA(QU%Evr+UkmQGw2WFyXt^Z6b z#n!0Tdx-C#_PROcUTRcHn7#g+P5~}HZ(wL;_ zW>eCZb9O`R$#PA=XrD-LXIKy~=1r5` z(m4rCPcb_sW5+W0$f>xgnrOdbnpskmUZQ$3Zwb{c3>rN-7FyPh=EB$r-?Ch`UyJE> zy2@d&7YOsHaEu<+jvo5zVMxkQw&l?fzU8q=*>UE98tQR&di)7&8gUGurl7!|eP{QI zl7AA4X&PeuzwrFuzQ;!#7@&MzH<)8_)~m%xBzVcG`)KIyuR3F$xxPp|wy<4?a;)_(sB}=AMTzS=etjImTTxkAdB{kSA;N${-lOR55B`1i zOR*iZFSZ-;q!zOx=3jf_e;2-;z%tRc3~4_e&{5tYAmk>AjSsA3xJ;OnTL9%XW1fo-B!b8+n|Tv$~1d)Gev_DT*?eGN3Uz6=uAjbpqtAU!Ln>4z&mtNDU76?RsmWRc zWdvPN1ggbYyA(nRQAoQV&JEqhwvj zdA`LVC)yB9K}SLZvVIO0jJZ;(%;HQy>_=^QWlDf(s7RC?SC zN9kPvWFjaJxeNNmaKl~m}uO`vz*&1vyPk73x9X0{_+5Q#@V1~?s$heB@leNyQv)p zVDLrcCsgS=FE?^R>Ur$tEMWss+E^IJ1#>Y$Mi31o{Uam%pLE5mx zp4e}=AK3B>+ZTK8^M{6Fx+VMJJv(M2D?R1zFH6iu6Lm(}c5k^)tguCu>si~MTU4RX ziY*Wu$fX%jGn zGX|`hKv`hw_~$iF+8OC^eN=3ZR(^5L87di5oepZZ)x$ccp&hz-B3HmWd1dDAh`Hv_eL;w-< zkxj?bwM)i}kKcSwTrBxCcwK}s-Ax}(=47wuW<-`ve9!UUd#mKBZBgfE|5ls*=io*< z1YrC07U?|jjFZrN&SSd-Wxn0OPq=)V7IYWlbrTx{j9~i*dU-2#4K#ldMMgI%y2Nk= zS&<{xG>y5)tztjR+b1RB$L29&sq50Jm?@5cl-{=FRjO 
z?p3H=i1=M6HBs_Ol&yiZ`zwDW!{MirdjDsUg1^cuHm+ujWvHP#!@BQ@kG|>*CPylG zWt>hq3n3WwEh@;7)&?#KF%&Nz#yK+u#FTzEqb+l09|~RpeiaN?rL09go15~622fXF zSnCB%ksTHyXKx~N(YPhMRf6C_y z>X?bw3uv=T8o{TrOGx#u)wFT3Anns!HHKKXS}u8A!tSYP=5?1S@%-0p8cRc*H+MY< zPu2_*@cQ}{MNlAuYrUr>9o}aj#UtbWo$8}$(V6PDTHZk5Q9^NvySShtJSJ;A%Zoz3 z#mQaD8~6J6Mz8-mh6V5UD|%3!%9qsUsyXw03s^NZ&*ox<)D)-6Smc@WpMJ=Ea}sK6 ztvwm{^Q8?*1T5TR6t(47rlTFtH=b3~OcryJUfV^=Lave1q{X}CeN!Yxbc4yUWluhN zy}gu~Za9S{HN!?Md-?NbXJ45-Lq1(Bd3Ddm8pFl&GH?FFb*@6sgfg8*i|V=LsXyL0 zThQ%zeZenw5!NBy`re9J8ssSgq30R5fYh!6KI*S?p^N-yfeV(TF6w44|38~v6xbza z44l56&mKx=whNODIV^K+RK25_t={_VWMO{#xO#Oby`oX4Ed?Xq8NHKQUe`U}%iK2! zskc<5r^$)$mUKQHPNT(r9G{_hT6zw@2{ zTpN|5hS$Edi3HipV1!3uLb_0nEe*?XXsutP&1#LE$-~TnW}mjJahZ$k^qyAr%?H(r zSq#LSQJHEJceq>LXXgfwt{=E6rFQHlWg&tlQ%R8Xs}7bkzN$!0XHIAn^>&_9T7`=P^2VQMx zVwwU*`^$jmPFEl05?-3AsuiobS84W-$->KUJ5Smf;GU-nTV2Bxv@D=cT>MMta}@l} z`($(X0*I_%=BWnO7fS>+nB@A;n0dArjW^Md|8rMXU9bCcrFm{dy20ZYkjciyQ$)FA zpkp}i*5jw@o5!4S?pSJ7qDaOY#6KLv$+Ujj1lT-i?8zPw)Gm}+-Tkusm@6zWa94dx z*EO)rK3rqGH10*wPiOBvQf7TlWkz7)qy&zp`GI$K8(>O5iz&tQ&t)WSG+w1QxBj0&6(k|BVwfJy!1?ZFraI6)M z`%;77!}tCDMxV#J4!jpKl8GLAXucZt_;m?dnExUcfXRRBISlc_f^r>I?f1f9b|Cki zsBPzI#_v0W#-yd(VyiCyibiYnz`!(X0@V!EqO5zV9v#(8ma4(dcdMKA>cmqQZ=;2` z7v>IC?C}K9s~Nd$>jy*NaZ7mA11k2MI9wVxeF;$%^$(1wn-1dXe4jh15#WRF5@GU< zkpS@({dNo6hj~O8{{nTqQKkh~b2S{hyhnODq#i1$@7yNM%44j!+3SKDLnGf3UC5^` zP21wIv|@bO98*Mz_^JA^eb|M3Fiw-foJ{BCF&FbY!fhLmIG)tkHM=~#M2BeFMWZPX z*d`sc+*X{L*>?G`Y|H0ARwtD!RrDOOpQ~A>hI*`_hMj3#O35<~f0GI2VlP7P@)WIn zy{)J=*4d(bj|O?&Kkoj#BN?ppZV7N56L}A^_C(S!W<8w$pYY<4=4NHOfjpRRkw%wk z!7nkogLA$H&C;nw4s^grZJoV+M+$Rt7q(wIl!e+FP<-IKi=uUb-k%~H-GpPY%QrBvK z)+!5C0G@n6c18HAM@ZkV7x49atOxr#M^suvrg}wK36*SdYUY4C`dd25d3IlzcinDl$dBhk zs^F0*`Se6@gicnQlVl#h8X7?4N(Iy2%O#1Hx1oWKVf$i9d;`(fx1aZp9CrFlMzTf0 zEfhOc&#e3BWR+oLGbyy!Ti`2$f>LBg#8-ORNBeAx*vg*;NB@cW#v@QUBPoyrDjnZP zhf{kwrBx83EyXef3guq>F~%&lPZL!Y_V=7XWvYfX3Hj_oN|(PiI43l;z4e@Z-u?EP zsf&&))w9aCW{_dExFMTzCW{G)gheNsdp0DeFg2>O;H;OpD=GHEv5HAJUdZ8t%cWDY 
zLZ{KBM@cp^#KawElJVdP)UYQvXEHrKJ^yi(h*6wyTFN`>f^EMuW1l9z@LJP0*8;%q z;f|m=@F{00fc@ab~9(f#{uivQrqMmhZNp&pD-nJGa80j@Mx zh1Z-WCD%?$i7CAZ0iS-!^A~~TQ8~LVg(x8~0B*3D5#(|L-1GyEvnq zOP7HI&uhezF!FVdDiN{uK8v2Od)jjNb`Y;87FAuBX>fu)>O)A)ygzyDInMZCcn5jI z}`I>(4qf>4;$Guda?`tbKsOWZb)M+QWQn zAMHa_KUCuvw_M98a(Hy7m}p$l!P$QvR)bQYk&YD>IEx3IYwA7D!L^V1S5Ff zX?v(n7x&tabB)l08sk^Bs?-28Zm2;SUP&8@-Iy}%ZwNPX!%VcGAz zN_2fDnEEH`#wwhOUp^H+fMHUVkLUh)e)7|DUMWc}{iDvJQ&@1;z`)WEgOjM;TQO%D zs{r{($kdfknzP(M@kJ$3Q0)l7rSXxz`!d4#QKQjt3mqUNRD#+u%spB8uNJ=l-41(= zH;qP7!Q=#YTlmPxKo^rVNz)I(!0<9c^^wb&rp(A9l`nt04oDn` zA;%NKbn}Wb#n=~#i(!GH7s~n%CvS{;Uj@I(6;PboPT4AlZKw&WrO%8f?R!cKey|Vw zK5YDdkDt(+AO5d9TjgcnrfTus?9X6wlDCw}V%{%y{L&S9Xyj!TE-Xy&2 zCy~szUE--%x1JYuYcUwt;l4B~TUgU;1argYjP6(x^#{vGElq*$n8)I>D?U6vdpH{W zh}KumA0TigE)F>M5uBGZ;T?>DJkuf}fJ-qwu4SnDJI!nZMD&Ka_3)nS;2se;YlIHb zL;w?V9^~?0y|ZZGweZUCXf~wC1N!5u84Qpe&SVmkJD}V~0K{A246lYBW2_(*dksE< zvwx8i8iP?0J|hPm|BtV?49n_m)`y9k1|_6H1f;vWySux)LqI}W;RcbL?(XhR>FyFl zI+X5w7tgczyZ3(l9e=-Y$QRaH*IYB_%$#!$-b9Gv3{*h~OXC|Fhl?|fX?MFoThXmk zvfu89Oh$LyVrkT)-pS%UUu){v_u_c~j$=*Rwi~L+aAG2S8T~i>SS`U34OzDVvQ?Q% zuXeyAn`F_=SDMnD4KGl@<+(PMQ)BhF`55#}S)bK3Ig>E~-9}}@_e^spUvqQUJ}WU5 z1a?`2QX;C5NKe>Kr>aaf5dh2b_!OFr@*;c~D)<(J@Rt=;ZM|F<3WHp&Aq z;~pAm1le^LHG%+{79jraNJ4eSt&5PD&h|2SjQ&|OHTw(gL8Bv50weKq6nxA2Q?V7| z`<`jFZ`yS$bKqKsPj(Kn5`=eXy+YS<0SY1dRoRKH8PTba*+8@Iz$wZ%2W-(u1_v|4 z+XFxsm1B%aeBS5VK$Vo|^bFX$zprLPU9`}^Tm8^W8vm8aEQ-G&3p$8WqKoR4v7tA%8KPGo;F;4^iZ8y{lgWUIsrR?S^(pgC@QItr9wl0z@tds%S3suv`RN#y z9y(y+q7^OQG|ZyL-L+3E=9L}pG<}l3p5V!PfLjpWUlUThs&(fb5awy7 z@-LATa6%A>gxqCOGflqX-K-}#%R8E^hr-?MzIO_xGI7;%qR|AJM>Nq$B7gwR=ohG` z+&yR#vOJ*cT&Nf6@DH#v3l@4#MM=RHr>1V)ss5EIilCy+h^u+_XC7=0zc8l497{^= zJHAmG()w3rZ_G=c)nYB5kVydnBf9|Q-m+T_P0s=T$jBVEai@|!+cX0kZFf08Om7=3 zhF{ZE=&9b#iT&m3cKtwC_XEtVHx@ArLoh|+OFU>n?9X~N!4MYI)dcbt`njFvL?IJY zQn3ls{~!rNdKc6Ig__r-94QOfxP$=y_x~U;0*k$ZN9UD$IOoiuB@$)AuqojW z`E;tq6OD3R-e$09ED#vX+Hh29%x`%onx7bFoYjAq?z=aZC{L8Q?Bk?SR}F!J2cr6`7WDK{-vr8BNIgDHDvBG< 
zu8?B)l#IWi{6LYEK;x}KZ8Z60%r5`ehHmwhiRSA&ZPS25tbqqRY@>QXp>YH!0Zy4G zZM$kDb{o;t;J6TRHRzWj_KU0iQwI&&AH{~;glCsX#ArNgO)3p<#z z{MD^xGEL`f6Hm3mJAv)7;F6{CO)SAEHU|Kr4S(VQUdt=T z^7Ay8lHsa_dD1F|%8jJ`8v~Oj{(ync*2w0v^9`YFhfmr;F6m^v2T_-(W~_+dmlRT^ z$YeYZ@#Z3Ql$ti5-~JdDNE-&t(q*s^Cx_1%PUUbRlwvbhq5sVV_dmg@$bBI-Hz$#94*XxL^?d9PG(+q^mA_Dwe~Hu^Z8VJ`Ji=Q7=m{3a(j@{yQaGcyh80}+FV3gmgarGx=qiW$Dh|>{G5NH z-QtcmXHC>Y-$sd3F1l1m3)Mm8*>tf*#jCd~w?gF&1^kW!$Q}?*r(Xg}a!TQnT6+V=J zq=27LOHN741OM{+W22+OxqQ{t#!GJOG7j?%h`01_0e}0MO~=ScD6L@Ma>hl(UB7Jv z{P}M~68=Qy{8U;Y)ZauuN|1t%Kc75C1)2&nV03SSV2UnF^NB{ZEtz1>;6;{9%R=h~~weUdSk#NO!#B z#Wr=>Qu}n5vhZyA!IO7UhmC4?$uy0NJ{mforY~ye?f`X53BI>%{&Jk;eETYiy)-vA z&{u?u!)~U!hLl612ClHBlsLzF#Lsa=R)DTw#dGp)icf~_Snl)K#?!Yjm42^^0iTOy zB*9-0mILfU?SOh`H6OdvA4SD)$wptzS4 zFPF6QYRu;O+O3ArSEB27{6Z|&Obe^n#i&6J9KbxvqlaTeshL>c!F#svE__A*92_}? z^CXF-+72jr+{+*qOTbVj{(fTjKopm1?+VB66$X~Irht8JZjV4BW;C%trKAb3^u_mR z@W^ZdY`Xt=o+|1W%`3H(Pz*ch-dGZLQfZVmEyNm3M)}^vshSG?pOuhic-~9d&rhaw zR6Z7RnZHJyoaxqvJcx6J3(BoDT2&?Am#>=&fRmK=%r&+rjLGv~SRPTjybCJcb2ikh zmwl5P>=#0Rc9M3t*6M(Ms&KvjwNuK(wKhQZGxR#v>HGLbYz9_J(63j)n_EbdW}r|>MCWx4=RxO{(T>IA zUoITtHvyTf^4oEi#OnotlRMp*UG}~KFZ$dJwOh6-u?!`qiu2h^vt}>i>!u&;e!$>} z7Xz-P&HoI*!Tjq0;tHOf<9dN>+%*D@`A65H>^^H2s#1AJ8)Xm{vyZWJM$<4;#@^5j zlgRKXf$)yj#wP@4;e#dN714I}auH(Vc&$v*x%xCat(Wa->0PpgmFPkemo3G@QP8S6 zIKIy0>rv7!w4K3gHK>ZS4S7f3vyw<<9!wIIcg$um9hv7jxohCprX-HQXoVpE1-1e( z)`Bgdo%c7sVa5fB<3HaY5a{;5?}4(!JF+Js(9u@T$Ia=)V+NFHi?Np0gZRd8&uT7e zTiCM5oq!bs#Ej{@WHgL4tv*qc#4_pWF{mqlF+$ldD7}Ive2iKK=>^h7l$}?WC+{xi-$?O(d zW$fzPQ7oSWbb|)3hv+DswAWWjOn@8T86jx}Dr)qcFE0JuhV4M&-}u_7LicxT)ZRaQ z_jgpl;Ni^_CuvdjW?E&eB}I6Qq#%WPJ+i(aX|7=>@92?eUNJ0&WKBzMx~EABZ8JRb zH%kjuHh(GusVtAUoe1%#=L$yiT?-UhO6gsL)|>-8AQea-C7uMjvy$(aNnp?scYBFj zS>bcvI(C=5#2pPqOz*$*ywOzQIRz%RhGybfbvOx)0_B-t_uDXfsA5fZSM2iISvXv~ zCCW19oS%rAZDPCFIgt2sLS#MH+^)P*gj4%~fp>+dF;VFkKZcX@N7dW~-O}RC2gFpl z>1N5aVoHpd&RB0t*~>nP;+@aJyPGH;j90EbranPdSBnbwLm2KV%c(GNE956BONwn8$GC)z+Pz9hUQkvGdI-HKZFR>a){lUH2g>ovK-&5Oq5McKQD3#+|jyyAc} 
z?Q||fs*4gW84@xwR8zG1rH!G?Tacc({BuhKv8>|At~2A59#9!6ZKGOpre*rWUlT>< zKHEzLc#%+GHN(JFhI?(UEk062_1L!b)ff<1|D@&`-iohW)F4B-*T#1uOQZjVZ;j<) zaP>Qyhw%8yXb3}>O@}eZQ9P(Pq$*T-;qr~tc_UX&D+tOjs>sj-!g**t7EUyxQ(R5H z$Gi%}Vi=X#Gm$7LP%J1~uEYux@#_eSV&o>U<}QQHiqeh>6Uh~(%!kB>^;QtW42fn# zngh%PqtgpMupY>GYwxGtaS8?#l?%uDmBze_?)m zofd5^Xr`x8Jz3rU4Vxp~;v6~Ca-fygU<@%qsTM9xHfUc}WhPE#8TF&1gqGR|c90kq zzwYO<+S))pnDe53{(SQL(7}ebGdzKK($$`lE1u+CEu{fu&=>x0;-&9K89tK9rN8qEv-L=6p!M7X@HW6oYphj zCf7Og)br@0)E{5%!-%>4MIUnm<5*Vn^7Y5m?(?5RMFZwG{SNaoW4m%bQ8hFBO@#tK zqa=kp4Rwtdj_@vuQ8w5^S*}I`crFir3Gni%aQbk4rObgOV|J=caj>f?LVckU! zTxuT8yC$A@ruLfI#8Tx@RmOlEu2O(E_s(jiXP7!=5SH0sRjp%)Y6B=TTLz}aKQakx3jRqEh>#K z=m3=)Rc%!)z`v0^;@%}dH^?j&hoD4)Y%jBxdHnl-|}UmV#6sA8~@y zp_g!tdi`}nbvvH;PN+vWr^lU@@GdP83H1O+Lr)C-AT`O$3TV0=y&^aM3mX7j)%QT6 z>J}=bu;vWb6@=BT)&QEWL^cmb*wzNWW>~Z4j#nyEd1zh zPZ6y`9;rAf(cr&C{I;g(+5BWRkSo32cG;=w&7r~XaVfEip<{@mdy*kO&}$EicX>hz z2Xjt7X`Tf17U%SLKd{ravw?l+{Mznmc>oME=Ryu>ss0D21t2RigMAXv^R`IFPV)OO za@43vZ9R#1t<&X^SS<}9huno=x46I}=HJPz@fM?{9_-7`O{2uG%OkS3>y_`aUE1Yh zD3t(W#h~kDU`jf))raSTi=x;Oc*4t$2N)fE&0aLm7O0fR=3JKv=UAUnuiKgo#k5)f zyzI${8J6!p;Ey(SFHkX5-pbp=stp(H!v-Jy2e9RY2mt!dyVT=f{uf)F>y@0x7{=cM z)Ls6R09b)oNVUg?ScI@$fWBgku4I?bmZyXYEhEm)XDfJQ92%RCsMSS%Y!Fs;b`G75 z_{#9(Hp+h4^OA$G{mcem>r;#_rLf^2l~DEGL7*s>9+XdjyZJV$&180EA`?&=T6<}F zTqFIScm#fvx!Vz;6OIR{x#}?sR&MN*Q~mu_WiXo7&lGeBo$Z_ZSSN4L9P=F^w+^U6 z@fPB>7ff@`mDT$;+_*W!!UaN8YdEFupQxd84^5YwxxRz&9(Cr$2hDoEJ0ZSe}p*r;5Mb30>Pb zJQnT#j&ZbFBM;nH9Yti9mulII<`2##cl6gke$Tpf_*5Ms=*FC1QzHcap=@R>4il>K zm6+jzPcO+WP3xAixypi8q3=jHREC_WkUW!e(VKTPdEXkw&asxzz`1Ob-7c)84KW&0 zh4gf^%ouQj@+0ruAZcg=n!yjSH=k)4{M1m)b9JqZQriktEGqxP_=Llc+8$+!1l zhZcWpeMZ%1lznvTJ`Ivwzqi{wyy;`U-a5BlfH={0??ygtNW$)v>vP$u=Rl*%u~B=e z8Fi^wT!Z{q43{txD7}{v^kA^%*~r8K?}3hfRLJFC>3LeL3sW7$aD<00*X4W{V-6#x zvJ<7#O{f=z@31_pdn|?f3Lx%lQ^MT5&hpR&-gV|uqU`So_HI&f{Xxsnjefh0dUzAd zmw;B$hi`v>L<0o@+0&6qU}4cxy{zscO?QwzB>gnXp#t3oxt|vG{*vrzif_O8HfnbZ zqb345V=P`fJvv;IVnP4x98vS=(mB+mB-YMjoMx9Wv;82a{Awiv1NiF@N~mwn)8f^J 
zhyctLr8XI5eSwwuibyPfi+$6x29g6r#H6i zWTz}Tf9Yf{MJrfQ(sP<_TcOjWa&gKk3qbLVUA4rHFCHElpyjY-NcK#Xmb=Xek)-Xa zlGq6zr?@q?P*e8YW%1wZOf-IU{{G0yL|^|CUg)BAQW|%iRf!yA?L3ne!2uY%X6TDY zlHtUEx%o1&APp()c$6Uie~7!rOGs~s|1BWmBTk`UL?zHDT(FW|*2soiecn7Gz%kyL^up=#~81OT-MQiHW?1H}Rl^ z$~CkyPx_C#!N30dZ!M*AcSU0H;YG83)Rghq5n{n!!&R3G6aVto+44pCcnaLI&zK3i zR04KE`P0^-5gN?Z3RG(F+`3eDY&+OV^y#*F(Qe39x??Wl356R(McorDRr?Pzj}uWw z+xg0L?GRRdY~I91w(RW0@%1MggrAn9bQc0OAqT#KI!*k5y4cf0b%$xkjyK#&z>oQ#dOnO+Y;KPjT$Zzi(`3dTd| zUEx`M)%m$=E~Mg=e z$4)a)fxJ;HiLwjGmMu>^9yE&XE`M`nZ%_=HUi7ZOA(FzBEC^-&f5Z749TMv!)*%a(cdJU6{rTy>v9c4WX*eBPmZ`AUonG*K@YNqfb5;fbO zSDCXuFPVSnO)O@+K{b-?x0$Co_%QHiQqmo!jBM=r=joqG`1e}<(;#)l+}3Em+YzVl@FA2TH4=gr6mO_-l# zVx8geJZ_z%Ge`i4+js7J;UJd+BrAbKqSWQ8BfmqG+%?-DAI<-vNc9!i{x^IWEmOgJ zpdy<*eM$sZPPAs|T_bGEcQdS|7lPZ#QmGsNhI8~)pg4`HlcDamnwk^~oLXj#4We5J z`evS;hO;|JOs>h*gP!f#+04*;3W-!hsN%q75cT6PY=ipnqP{9zmxj}!Qby5*3criy zV0B!ehNJRY=-435O}Abe!29)BtDU#N*hIj@+vLBAE?}h>afW=sxpJ}8VzneVP~lv5 z`MeBKp-y#xb2w+?v<1u0DWe`B1l99`Y!#b*4B|hzIN$mjes}UOZYx8Q6l=_xB;Q{z zZEiZGwJMk)Rh)r5c-!ylLb5tY&<+N(%TgaFPtZxm?h?WvqSXk zLIe;cbJwZ`Qg^0&uqqjeY`sdE>RiCX)>5lhxf(eMOeRfgFs!HgdupZ4(Dxj8*Ee^^ z-KEUSX!9qEnp-1go>1Q+BuRxK>4=p8=Bka=F+85h!6N2c=lX%&FCC!V9|W7+scPUiOwxdnV3;w zkS^0Bd=e?zmW8pO0Q=s&YXl?uv$Hw~(-A)EQ{tw3F3iyTuBjxDqVk&?VPzF)*pSIF zh7AgCafu+Y0$Z)I5OeWI`N0^OYeS=BU=MWrRfR$0QK7%kp!;0Q(?V;B)j{#D-*#0M zV!y6Mm%NomRJ>?_Os0BNkOWNfJG{a%yj5*puhnhlgSoceJ$RVlmvDpbv!|B`xMz={ z?4NlrQXJEYpu@lU+ewr#_pg4m5MibqsLf#t90(F*cbfGzC&l;0n90p1;dP<(Fs^er=~tBz-Q!8gvu~U>LO8`Tnpq3bF&@_BEI4XU+bDdlK4wD{HQl*t2=Fcm zx<88m0jJ?*by!?51153sJE%$qAB-x7zcEj_-1-LB&o;a;^J?dC6Kh|ckDRu<7^0(N z=PaU_lw+>ld0;1A;|r3n=p(KdUl(l)s_RnnQbGKCzs+ zKzolw=*;qa2&R`%8yX z(#5oj-e z8^9q4{D`~|bYZ8s{4?k32dLZ(KDxU9QMLbHxz^%a5Tnl{!kcYY3{Ce(`Ehg|dUQG; zBf%Wjit@8;NK8t`S6Y66qv9F8-%$t3Xc4AQZ_|)g&}Dgzec3Re(9K8{512Q^hgI)` z0rP>wYVE2cAHYp-P3OM)n~^*DK2YwpZwf{HB5D{*>@XvaKu*OkI;Ezhk+(I!MkrWv zMuiOn+P{A$XZDr3$M!@=(Vz$u8Oo&8dL`#Y88#q(GDIcoKl|Vg-xm!bpnVR$#xt6dMX4CkljXL;C#tvNf}vie{y{e~@8Jia>u 
z4q-1iejexPU*-bwi791vKSsjFHWn;T)K+9f9GuZxHK*wu8YL(EJF(<&*R0uiUec_z z+AdkZ#3{c5?>n3fh;w<1L}33zNe>{yC3d~;_yt>?le_m-l~TcMXGZ^+<>mndn~^7{ z3-*?TDaKM>z`!O|d!uX-%KTRIkjWe|p7n=iBzlN+uQNcw3jo+JUgUVSP9I?Ycz%p$ zDbUc&kE`%#HTc}SA|M=A6#yJE6t-CO3-B}Rj+Xg5<7t2j^h0nou#7v}JrnoqeY-#- zsgCcR1glI?#1vGa zfqm4xthHHzM$8}}dsdjmh&g$hAN8$|g7R-pD^Mv&dI#eoAs{kXjla^QSc=TmJWUSH zN!;2L%}d#5Nl&MNu@$tR&bVfMUja616kV%w;-))k4dsB=o@`1%~Rz89f5Gbw|nkKqN1nO&jbr&v#leicC{n?62&G#WMw$1xaLYb}RP_X&h` zt)xL=5_=Wq1GN6v?SPFma6&(9sEs$ivuJV^_-`%%A9YB7xgTy7-#@%36f_F7H<@`tEMg>r_h$xH@`1=llgSzq&G_{aS zB9D+j7TtmOPQQVlA`<(V&qV}qbed^us9w##D!Y;r_sCN|^;lpvi%c_L*B0Mvcx1Zk zD_mFqrh_ZKO-8*R#-+x;QZ*y`>D2(W_M-sOH&KfHV80u;=a5o8V=4?@kvU&(YND(v zdmCU8Cbr#mZpKOYPRfO@bj;->Y$UD2hzb9a9l=hlzn%ZnA;C53^|0?JRo|7U@ZrZD zMPoQuPN#>TKp;ejnXS{sQ}ne1obc1#P_k4Dh(h1N+4PPIZ`n zy?EdIeD%l_7AA;8S^TbQZ%V?{oXR^zw2z_5aulmtEM=~meL>uMOuhLrNAnSj*SWt&96^u-NP5=Tx~6*eSRK=xRv!M=sHC_6 zi)jb5Md~tW=&(dY@rPDKCYA5(ne;Fcb$-!Ki|Scjj+>50O>X-VYH48zr zn`3LCeXM0@wiF!mpbd_pvEUvV^~P}PJAyrELfo0xJ)dGwoc~m z^RyB34vh^$6!tQ8e)Ts6AN7*S1jec$7N)U*r14S|UYQ<=eDq8kqtg}-yKF~exoy(1 zDk|8WTph5(Qp*K6>dV7>0VedZa5j9p+dJsmOM=R&%-*Z>i=Hhu~VH8N5&LM0Cf@C{1bPgZjeG`DBJj1n(6MFT$Enl%Pm`UAX|M1h;-!~L! 
zUnL$ZB#-KLCrH%$`7ovZZE@0e5yV^|Cb|zHOv`VJvv@~4>dM*%j3MU)4R9zVNEx4+N z&MnK;c#6BAC^c#zjlJI66C?MPR`zTTy{7hrSsQ8J<#ala&ebmSkjlL9MjU&;bOtgl z#UOBd^;#&lK=YlI6p8-uU&^9ncB{xL)CUQ`7FT0Z4JZ#7Q}HNZL(=a$EZB%z8ss!( zuOF(h7D!2$!kl7&$c)zW{bi+_UNn^dK9ZE~MJO6zR{Foq#Qwn%K*2=+%M(z*Nsi+pKBL^p2XPg@%(Bd1r4*L-t|@6K5~ZQ6oUAK;I=6JZ^SM6VD} zbnXzR#ZSMJ&!Z~kO;I|gk)N{mO{;i(_P6oYv+HhI4bC~w$hAz1u}nZ@HoKq7D|NS> zsn|;$us04YV*o5Q%aKZp_{0+w@{W4s;1@?gSXP|mH4Z?NWqzqnAR&BWvIp}Jun7Ex z)PaHGE^*AybO#i}Uk!PbMiEoX*JytXJvK91;Ot$1D}u%s||B>WZj@g}q%H?sDZ+%!36I@tn`c{jP1TdYtLSXjL>E}=s0 z`gt&{(%evlN(IcZM`4XZX($Y&hS73 z)wsNc+Ijo3gDor!2bK^p^YE%gS^!(D&{BQz`v}duafM?OvLh_>_3*M=rWv|JP&Mec!1|~gI@qO>^SA~1>`UNH-sv~zE$_ZJn-guh#W;8LGl_7`qZh5R$C6-AF6qeP~1a7}O zEd-Bny<^BEDJ2NVZTp73Q=+NhA3)r|euzH7Tz45Up>m>5_P1@V|GtmyHP|jzDEc0#qwV@d zNM=xdo@u{ZlyafwhCCYka&c9Q5vwG^$E^LvE@W!LoG#1C0QS_rY#?w5;pKLDXEe_4 zyBjMLgBb5@fkP8hwtpFJcZi347yLVpOG>P7z&6tRjOx{EOho z(l#i!d4uSu5Xl%l_cF+=7M%u^TE#>IiobLV*26Y4B*T-tiTA2Gmcu}$2q!k$AKY;A ziSjAFthe~&7DQT3}kPK1r&6|z=0s(WJrk`ULs4wu#73m-%p-P_U}L(*LX&& zr4}-Q9`KpfH%6o&3)4*U(ox} z$3hrCuRs(jey>C$4NImZ{vyP>=D=L``9aD;=v7!dYdU>oTv}>fn`y9iyKM9;Bz3tf zj*@$An+)B#3nDaI2}#~R>me+1w3_p*$T$R025A00?IW=2<|e;Hjb}^WK-4DjfQ_%* z@98lBGNnCWa9ER@fQak{I@aHnR(a;Y!L&mB^BMwA0NS$c(1^yFCtkAhd24&!lbt|6Bp3jWy>szEUIj>Govo8fIa>D z6RU(|0;CC(UcyUdM-9@c%1_0ZcOWS*fVH=`@zu(mLw~2-Vit?eq(Zzk->Mxu{)+NO zfJjd6#j8o425*>S_XI zW}f3-#S)dEM>k{dS=F#8aZ{Q3tIXXL5|qc?Or?XDM0R#*`U-0>Ryu|^oCuD5>TP{0 zrIlwnr-vF9S%2Vfg#Mf)kmTJABJY=|qp)SB`-GK=MVIaV8~NK>Qg> zPR$Oq3l|eWCn!qXo)0~X2_K+R0rJRyAeAcg&uBg}^Ak-GhB+unpiCd21m~QuAQmwn2<40dVBx(Ll*u27+}XIp%2eO>0%Hl#o9yef{AFWldrM}uRyC*a~35U z8EWpmy1e*CG{+SO-*f^6Ac~eJayBiwlRwpwl;Z`yKgFWvA89GD(`s>V!E?%Oi;dmH z4Kd1usjcE#OxxrKZ?;4+9=e(p=JCn)==~7lGS+Px>~BNnby$t$`0XEGx*zI2CqK1g z%gt=D&6%6dQ_&nef7POoq^2w96FUu)_X%^rodYVx-O&ZU_wc|EWe8K?SZO}YFApz( z9kkwxhh{Yj^ro28fh(dmK1P{^AOfX1gTJZ$cYRbC4(1yV6YCDk@Kq#o%<5h|YKB!? 
z9H{Ag7X@y_3>=>u<=Ct+04;FRoxwdyf z3eww;Kv~1fyLj*WQHN{t_YY2@B(c!5@75_s4707;iLtLJFAy8gYmi4NfqKk4%SJb6 zRFh+%oB>T6MuY;5o13xgEtjrXcyY{Qwgg29_m>Gl1!7<|ZIvj=h$clUexq06XfmI` zAvCj(U#f9fP`IGRucilNg`Ntxwb%(ixeI053fB>?4AYLwNIA+2>#i>L`+dA}Bbe2; zS=AEh6RX%kN`r%9JmpgE6MJh--eH5LsMVc^`*YS`U4bbw7$!JfWMTf;JKd%1E^TOu zCV1FGaHQpY-j-iyUZ*(6I3h{~aI2_4U0%cGt~Dt^oxjv*S)r3A;%2Im_RtBac)-=1 z|9J_Z0!5I%!?)wNWQN+S-~^dVlxM8>O2tJP`_6fiCz&qRA_}@ZbJyn4(!VD@K>p_2 zvH=cR?f}{A?x=_){Svhjgt*6P25J;R6?sUr*$)S0nZ^JUOlJ=%6{@Wp*$zfmZnRir zr53Jp*YT1eNQ7Kzv?KFu;U)YI$@(*a4e}BKy|^%7QSwor;0~(XfOEh)>@sL1>i}lM z>nR||xalClPey?t3Ja~!7DoL8`eGMZk1bWp&DavBTNwH+$xKmgT4B_5@TPdYKW^9w z$Tv`7y^zFSOiieD&3O6dB`&1|StPvy`M1>v>wY1aq{?JUq!< z=RxOYm1a-wiP!Ik?OX@npoPS8wCpM+hIy6X`Gf#(MM!Ed50HA9JznfT8rVkmCGr=w zWbSh)RwFAtB8W0XxOLSZhIDHxZaOzzkqYn3FWI4SWr89kO$_n&wd}r-2XA`k)C{iz zA`$)no`>!F($!(7m58s_@re$vo4q$NPbwcKwIYNfR@5;|O)zHS5=F=OO{nzsD8w?b zbo7LRhCl+JgCb(6$02^V#AhIIN^3Q2I`{1z=lv44LV%gs{#O>b7gM828E^pLrL{8l9{Z5`gL zdKmu$o_r+2dT5Q+K~*GXAt?(XPo`@BWIG*`e zYtA|py?BR474XV9Rs6r%6*rPcTqqEe&bW2kv{AiH`LxK>Luv;1sNTGqT|*qb9@J)0 z*26W(-R&vjiu?SwH|yW^9H0@BL;$=ejTCSbz05uwueBHY->>}iXCd7}KqB2`>t~eK zawj3plS1Sf#cTEO^t_r0e@)T>yOY&F5jj3K_J`l+Vh=XzD;S!P0w`ddR57FGww}3) z4njh^kMrZ6^cz`$e++Q)8n@>DzJ;D;D*ZoKeGt)H z;{uU9GMT*O2VVS@CczH&0Q=8a3296=rvg#)WEQ3joezo>N_wDH`SnQ;aS6m3)Rt)@)X@?kqc}e%R-uoTbQ;vd?YFt>`xFWXQ=%7QeJ{ zd~Qy)aLLLyHKRItMYrk*FXek`KZcEA`^+DU<4c}G1y*BL;a^hc+pRh_#^MRhM;V_L zzXLL0bs{XB{g)AA1ZnGk;xN#>z>q^$a{B=N1mU?_)yFdGYGa*ko?A2T)suLgp*&a& zOt*t4v496s?&Y0DQ<|*X?(l=yhoA|a5L&;R8{O$<^Ods?w6nX* zbi#j(BRaSr5;MWlzb3@Sc8-v)xVvJ^}eCH4g z6k<>(;^fo3Kdc$)NOzRgjAr-26fr<7Ia5VGqbpJ}$WtZwL&g=@$as<6^hx5P9CB19 zn<~k=wfC|PM*CeD`Z?_=we%ozLk9bsmPf9#zJupctt9!yr>PsqIc?2HPigu>OWhC6fE-&ecN9HyZH8-6IT%hxY!y|1E_;6!t(8-8mH}DJPjWXxS+uK_} z*2_|`AMd07*wRl-P+SE=wUjJjieE8?{hRHB!YHT9h=M^ zyWu_^T2SU;`-u+ycbA5K@i-P2@ z%7&7(7`bKpd2fYd<09z0cDYev= z-WMkw10$cR=B|*=QT{@jIycJ8b4zRl$IwU#{xeKgI&IMmudZ3 zB`8mQ;cv}|vlwCkSqeId-b*i1Z~J}q>;Hb2|9L4DbZM{;dvOy5UO|~;jeaM(rqvwH 
zXy%07m$&DZ#r}nk`bAY*41xLlosld`DZ0#`hXmhCD9O%dX}@L@Nb{(Xy(7sSavU8| zoXdMgx&$>(QaK5oZtvgIPTP2p6sibe`3CD!NHH+wn$pi=h0^Ypg{I3q6ZZYFb&igp zY`@{WW>SLdo5EWZ0B=KiNE1*tm~l&m7^d=ok1SxaG3}z~PAn0)7ot4oV<0 zL?Q7P0B{oM^8ry}0F=9Cz1c8l`-q-@kz1j0VP9$ne$=g8_&8&&)mabhaeY1vi4i!c z=wapuws?>%>$avJt=6sl$utAEm)hfg_4-b(_%y85;hIxsuyk0ic_>AbPr_(7fcH_e za^c=whi_p415%MU`RaehEq);VZuRs?&V$Kjbk;#N!VaZ32*B)*_toSGEwgVe`Nqr| zyM$`gfb5{VBLg8}#%ARZP+m=3s-n_NicG9PEc4~72r^sF4T_N zRp)Un;{{5Y#VAeC|ku=I`!|YVB<3!bv;*2h{2E?#`V4BFOk z90O42Wfk=z2JSl_fNGU^9?78FSN>Atn90O;?N1gw?xZX7JI%r)CExNHh?98uI z3NcxWO8{{A1^_I0I%^1G`IYKWugjDb_ta@Ia>0K*1q1@2k>&XIqx&a8(Lt%_%r}C&gWx4Gf^r0lHnP68xahsSCzLE~I#!Yz6R+b4bQHl6naYNZYZjQi$ zv7}v9(n)=5@L|d+D>5x~%GI_iX_b7T!p__)c^$bI*C;bZY#!z@(?inf${0FJMAG8aXA{{dDrZ6rz;t|tMQq*-BP z@!9_K=RVd z@f#D`1XJwkX(sGP1)vJois`TLf%1n&mr;Ex4`v^Xn&qkGYF#zc^Ata-rG4Hsuhc?6 zL5AyhZ=2N1DL*_3ed3i06RZb{*;<6eyZ{L>*AUHmkM$dna-|NMb>~oi29m6fIF>6( z07&O|O0apJi@g@O=u7@GB{468{bF5V2l2-m_P`J_PLbe)2w;6_UlNJ6n-07s3BZ{X$48JlV-QA@JeEo z;yB~5D5GapUUu(;-5SQ#D?8fy(nhcI)d$r-09m;T(N}dEfB*3{G0^~m>9$;mj zB-U2PaVG@*Om)Q`Tn+3cY;Om)g11gm4#PJO;I z_skrqb_7GIZq+P+j|yL{0}XWDTMw%_?-Or*^2 zuzFWyKef)lOD7yo;N zi}L^j#1tXo54&s@5M5F8FJ9XRv-S)AIK1?4o05s|t_m}m?pypnC$5|hj9)`6Kq9-& z?2<#SaT$11O*OLEyBFU-GxFjqT~J%x$tRZbY`ixJq>_*G;55$T_n9{v3g6@L$bjfW3H_BfPJyM zKJBe=YE>7e*~y%{U+oF-LJ~?8`+O1^Fl9Bpg}#aD(u~)$X0Us6Mw3>gxpQ4+X#4Nr z$|V^riTQSahsDKB>5IyJYu$|C3&Zn@i|%X1)cI4+S+lx;R2_*R5L?40@YsNWD$fM% zODf63m#V^Kivxo8;y^$A<9Uw&WF1wgbD7D>x5qCT=nUr^zE#@k2}DWo_A3%;BEhn_ z#&=IF(Ro7cg9d`0>&MRbA^cLS?D-c%tB%(s4cK6sMSES1W44eOH8iWMdWa(L{zqW7 z<%N-g14VfWu7>7pi7soue}JXsP9ksmcmOAd5&J9C8krP4l>hcM#;us>3O+tNLl;;+ zK$KcN!FG!itRne-5}y{4p)#W}D0^QTCbo<0kX)gmGAJ7?!`4I-6do;uT_X=)P=}d* zL_SJ4J9I|sG;s+mf3wDU9C{#Nlo+hVe>D()8&|nTl;|xXDW@NypLY!{VmwDHgZzr! 
zzpP|Z^TpxfJufTT1Wx}=eA2?Yh|Jfv`Fq&p=f1Svtf5$P@k z={__E=@3wm29cKj-ZOD$?stEG%$lWZ#wDE1yPy5U-uWm-#cE_duIx$!j>NOk{anqg zI~8BfDCs((I7|YSs_&n1{@Itb1_}%Um^K=yk>WSCiucQ%FFi$fckNzwqj<0prKG1n zv;ecb!M*2M5Y!GGsyV1bkf4@q=)M1{Fdi+Gr;?aU_V&CG+74KRV zlc}2@Sgx0Md}LmW8knjm!!>YWz(M;!Z<0DXlwRFoBH9@48(ufyYV-6V!-hHfie`>V z5(QiSBlRN!LoWGa;Zk1ie8iZ0u{!IoW!Y!0OzodbxAq86ZC~*0{HEN>r*Lx`(7tCB zaORR}qI*;GO{ZgyC5872Y{jHT$7h*t^N(q6tqNv|oj_pbx5~>J}f4!Jx*Qj08!K6R5ndiaD z9@{&;DX_z7<|6v#cOwxJn=SOK8Jz97Vf0d_37inGhZU8Pk*_*{wI2a1UvlMWU-PZ}#J^Pgm%f!yc2E{l_a~$AaR+)Z%uU#;Y38 zs)WIJ?jsPKQ9#^7)Iix$LV5DzGPTiaArf}gj^}rTk+`B2RG_segp;E24H+M~qyB70+VT);+ zm(em`!RH>lCAMO8>iE->vE3v8w=F_4fY`SbF(noTu)e`ry0o7KOt3$)3A9CMPyo~` zP8dUo4P9T-VMzWMt#BbK%i-g5`!7`@Fo*E-k2!;^CsX~&0utfav<)Nl*!V6Ha38q1 zO|du&<;l=FjIXU++Q3q)W|yNFr|^g@gv#5~N4Eb-Hos++l9zQWiyq?HRJjhu5S8|8 zS@_o!s|x&$kCk45KCc_hjbv6u%}}7N0gH+=$@n}zt1?Xf_azYd2PW#5vnPTRr4!xX zY(uIOgN2QZI7Df;VWiM810_;E6vp&;`pTfUENvaE;)b~1C0P!}1Se9dD>y(Uw8<6Rm$|rl zjNkmlovPQP78@J8P+fu6{pRYFR@7r>=IZTLK|$)?#U5Qq^H9~d9E*z#3K886T?{(r z-XQAue?EHxl%B++G36w!vG~{TeaI=|Qz!5)YprloPf@R*w`oZIR=p@4$QYg}fJO~s z`J|%Z8sfR}T;@w>>B%_@?1-9=C3KF*(ra}O&leopLMHu+*EQ;_vj^25Q@c7? 
z=aIwe{AwPHS@vJ1vS5Jy9i2iUCboo5g8jU3qtWeSNgf<5-7Y|IDE-$+TcrAJxm0qW zuyaFs7Q-Vmh;qdSI;QxVGyS$QqP~!FUf_=YxryHq?E;K>8$0c`GGAzT;sGZnCw7$m zl!8y1LmuU!Tds*eVgFA~bRjYcc`24_9NH;$1o53GaWJTv)>zeCtJme!eDWwv&+00j zcP#4;`H|^jOL8Tl0;zYaVoy=BWA9|&noSk|QnHV1hgjmR)eF-~NKu`pe?{!Cc*&pK z-(ir=W5|d(cmcvVWWgXiLxK7$Qbn|v;V;SLE{uwT?gYFIX3~9!lk}|lZC`giDvQNF z?p@RqO8e_>p61hGaFvNA&G=7O7{++bbkUGTUp!aXbj6@AjI1o2ypa?JiKVnXdER3(>hfC`*l$^2 ztP;&cd4J`$;5kVpG^dF2-sdDWdGrO(Rr)o!f@5aH=lBla`VDS}QxVnsOS8b+E~62y z7VzebLVWcp*MXHT;($7w980qoBbw-g$&A9!EJwP&n&Uhm(-=Hi^=;~Pm>ASfyX{Pj ze(1m;)s1_M(!PzBlY$0rV-ya+k~{P!};OUe0*@YwX;%@k1lcS-V}M z>EeKkN~)&X!>9DsHlI+F1{#lETih0!_)U2|?2=|;9Bkw(x1!*~S)P|XV%tJ%xu7rn zyO?ZIJ$sxhmO`{sLKv${7ypU_n7A9>yS`s308R zizU7UHN{PHx-)xNbbNe#BQWH&S7F$+<_KVA4v;?lLROHIVTpLX8 zuYoZsH8azXg#JefmLH51^>G4j?`5S)*Q3M3!?l!cKCT{Rgue zlM2YcxSCMuM*^EYaPgnP%ZIb<2omUvS`57dX07j)S+P*ds4?8IO?Gw+J!d?7@8Wku z4Cc5G5Az-z7JtASAi;Wl=gmd>i2NbuK&P^^tWJ;B3w$3DE~qiJ`1QeN2^?mVm$0lU zIz1sWE|sj1&_Qsw9>&YCz~I9G+^E~xq}qwZZU+WKv3$u zMo1y3w?D$eMIUgQC!p$=Pi3ns1e6(%sG>f0k`Jbc5F(l486fI?{$J=vO^L3uC^>*L zo5M(6PA*e=s{aRZ#y*d}>4=+QkP4h-;&rDD$y~@;z8@xM$oe2V0xC>3fJw^~3shF3 z3BtHIXI05JKUknVaX1?C0h>8n#c4V0wwWW96sN=zs$;YFzrJ&j$3-8`lQZf3@)c5& zk^TN~){T$tI!gNRNehvH>1d?)39MD&nU*}lQi}*NnN>8YdmDl8DL!75>&%_KSSNWb zov_0gBQxzWa6Ks@sxwA(qBslayIj@`es!Iab5mga7{QT1rjhV&zp)z71KSjvzn%Ks zuwc;B@(uk;qr>p89~mmH{z(Z?anE(a*dNn46mC~PpA7Bz-U_GhR~kgJyqP`X63DhCd`o zwnLhvuFh^t_tlR}$uU{S{*>!z{Hl{>`4^E@V<_G=*f~aau4El#c!?_P9Md~>>iA*4^B?S#*wx9IX zN&0n&q-g*hY|3~4N6|!@KMtawH3jN-8bh=%ch)WXLlOOT+QTsly{>F{NTG;B<&s9cqa_Q`<+)cPa*-q>a;NY5=d)ka)r&5 zf;)dl`j$#ss!z8iHdsLT#GV`ZbQVb!@&o=#+Mlv6I zE_ttKz9Ufb$?!Xw-Hncp)`o<#&|LotyNN7Tp6K9n$Wd3$ESvNj~EMD+#+m!MNk(Z_3{) zF>^)9cOMzvuKwM+<%n+zR}JRnx%+}23;yntw837KWOCOX)KKJNx*X%4_oSe(=mqKs z!TVS@W^Kx?{Kph%56VeFx(^Runq~xYQL^s6M}0rXVp7l)7aE@P-#%lUut#E{4*IC*U281N6cjS8=3fYqXszZ$(G#Yv$8XYjv`KrGoE)OZi~3JLk%REm z%4)Q~=R=`Q0)XxLSz3nf$XCAU*Krshw}VT$ex2Z-34c3>e*CDmmVcq9O!yGB6Yb*D 
zem*M6aYjYEIkI2ZAnukT`!#aa7xDE|*_PophH}8?a^pbrP7EiLi+nYTzomJz{)TUF;hSOyh?|0&G@q8^oj$6-cd8@lN3gfa3kmB>DAJk zk*R!;Mc%^*`j#Czmk+(wW?@qc{zN%yKy-XX2!!7SA|9J$WUIV*@RBdyy+B12p zp~P7Y3yw*m&(&|GvhCmN_lUYONnpyF^oZuf8CIyKNRHM+GirAgNmLRdocB?PRv#2& zu#iUZW7^gMAN*mBL(ZuV^RxY5XM#o}BNu1kBQX3T zt5jvf@^8BO7-bE1Z%Ad}wfaTG#)>o`@Zo_kpIR-zc^GS>d;)kCdZv zLY7xC+a{OeK7!6hb1?p(VxOeu5nqclCrdLzSba~s<@gPHnFz~fYHqOzOKOk6iXl3u zd5BjC{;@%tl0U;o$l=fylKjNQZ%ZNN_P z8`vq@+_H^oB3@$BbK?Yk(^P*L*|FX77{6%p;-ks@?n!=Bb&S}~IVoH>MuQ@C=|hQQ zUGKwSIxp`pUN297N*n_eDa&sJ2`vPki-u?5&|BBVRHHpA z!9(b2c=2g|w-#b{7klJdQejis1aeIc9zf9%O$A1oLi07PwU>(csBO!Qq0V(c>iYtU z)6om}Y{7*ZOg+g2(u?2cjI2GO_^6IWv%-zNL?rRIrmhRh*95gIbg zp`82gu!LMb^a~{Bv3|`DeE{SSWrDcZx6=tY$)akQPjxcoc9w zZDbalSh@0AzYp3Cx$;smAEqPZg{TT1$-?u#rvAF8jrXMZ3S*08rGfz9J(uyuh z_@#b&HFIL2@?=0*H|O9h1$}iOr%;UFej1se{;)lJkPkK&U&RM9@*{|zk~#D)V_Z7*p8E$(tDauH^dm!c&czc6a-Mv}6)cs!mk-~X2H`S1!%HQk=U4Xp@+xMBE7-=InQz5y# z9g?^1ZRIJJ`PLS;bSfo};T>8mVPG-#GVMRV~!}Cw1j7hMC?Hqyzd8LU`ME|LJY8g~W+r7&K7&kY;^y3E}ax z)&LIrA7LCrS(L{gB*7SA`Kg(ZHvb5CuY^ndt^9ts-mrwk-k8|5LoSx#aJcJR8i{kq zf@$#nC6k+x>7HMk1N(s4d^g%!V<)>;!T!mArt%h2)DBL1^RBvkOQ#6Du2CQ$gfHH0 z4jKsJHjNtg$s+rdY7sT;l_gAcC4A*-&XpHcy%BIqAT2q)eSF=2%=v(XVCCmK`kLu& zL-i^=1o%s{gj{kD`qF~OIH0k@B4c+%>gJOAbis3pgDqdZ2QG&%tuiN|>60)PuR+Sc zb(}yd?yRHk;5N`h zZzp3iOEBM^y>tE_`tVhI5$65LqYrJ_AQV}rlop4lT$IORPe(E(AfYCty&9Cws+K3c z_&z9eLdw7ZxP=@GFqtnCf9y4GSz}&AyW<7l@cJ$K!F--*5i#^cgefqI+k1ZzrNBwp zB(#Gos2eB7z_MO#*HC=kS#mtEMC;I9%hw)LJcVjuYK-Mh{*Ff^ZX?+6RxNnDY~Ptw z?l*~pg|?Gw20E-9LK|zH*~~a4&gwD{B?h*0luknMVh(+bzDtiDg|zRBK#+GUWS>$T zMg;-3G539x=>G;>3NqVk7mxP0G4d;a91sduSNWXnwxe=jjXUzZmm@hDL{yVB! 
z)~zosaand{uJX5vHzlRl5YHcsz7{cQTecqDUbhJ@yX)K_=O?)dtBj*DG_x~9gEw2Z z`Sonalx`5zWvkT`VRxfTJZP)3$%r6Rni%iqwOiT8NYYM+iA0Vh2@Cc39hoXi@W522 z+9F1*W-1FP#dbZxo5`Uyk$$iZHs2=D(k0;mbF3E94dVqgJeB58`sI_u5h0OdFhlZ} zI-m^CfDMW*k~WmPtqlXLZGmrid|@m*SZ;GmCZn9L7% zc6P3G@m!J(ZhP*|RIkW~-J?F|HV9ZcWgIs#mzb-3c8ALomz;v!+f_y@0K1@OPK z+s74-Hd)SabIq6SBLC{ zjhn$3!FJJgi=QTleWLE5Fo24~$IC@7H7I&!mZgsWqJF+KidTNAe{EsHZ>n*D%il3( znU3fGxS8>U$ZN{r#A@d=f&sjx=xB#ZqUTUWk8=>&_76Ry0+q<&8m9GFAO}^dTb5meP(Iy(PR= zeV9|F@&n68DLhjB%V#TrBHZ_g28z=*X5mbKxJm){TpjMVqEp+8d$v4C94h}yP0M{s+*pjE<+`1QFRLrUU zf(Kx=%#hgRG!Hlk%n^2 z&uLuP=1x$1Vju8MaagZAIBBBES$AL*SWWyezu)!#b_(di1XRCme)!3wi^hNF z5l*wl7Q#sD?9}Q|BQR3HbpZJy7J8_pRI9?_*=p$HVZ0(2XA%tyu1VZ51IBBLwk~DL zlUiFM{%*0Yzfi?LwJCvj|Rm=3)yK&tcUoK_P6LD zzP10-!2a;{rFfv&JZ<^W8*IC3D8Cyjr$vv0AQ_qiK%gO`wETZXRCe zxnv%x5$VlIo~lN_k-bv=L4Pnt_?CHlx*@OiY5#iBws}Ha;(UcaJ-e6kg@vnbfWv5Q zj~Pwx)5atxyeQ|{SVi?>djJd{yNX=iiZ}u2}heQ9Rma-GTc2;cpU!<~B>pxy4(qSwvcN*)@>$VZ-fM zwt!0FlkR(nVHnH+tBot)_FQ#w*CppUT@ObL@YkE zvX6yxRO9=o8@{jp+2!Y-A0X%?Ts1s9S!WZ^>8tcF`@=bUsRMAAmJ6bNfV4%LqZBm` zkAA7dmdaM6x#U=LoQUBQ^1j5Vi~h}bLX@X#Y+51G2?9v%gxT7(%o!=UePwxd_Kzd( z&)crx3}7JG4m>ColYG;@3h99)v!rNA39Fu`ACt|>fn?crJi?*8IWfADQ#U>tD_SVL z*`i2^d0V*?(jQ#v;9bHsBd?{E*lo1qeADYgCgEWap z4J&xnznOoZAiGwzN7%+h4joI+%)Gh-w71uF+1Nqf!aigJEsl=&foSk-p6$DaHiYTFs{jTYlKdmWJCQn`O$*yfoF5`W62{0h6<|ijqmhlW zlc@9H{gQ(gjE`M^bwx;=b%zrz36*)K$hN@fLSB$CvgKa@kk*@KDqN9bXLI#$FIuQF zT}eT&?tMOt7y|7KMN4ptH*Rmx+^!bW;~vTS$E%3>dM%s`5cVNdC9!?X^aKQ}0)IJ; zqARIZKG7WxbrIB7@xH{p7>Rov?ZYH3M)s8^63gJe?}*?MQ}dQNU|QM z;0ac96|3a_UKfzl*XXGHy!^bnTG@4D)hABR<5{$G+?P>_+vZ8_aiwURmo~2@t*%s! 
z0tBMuHs;U`U}wvM+(A=SB;1CLg){Hv^zzcVw-VPWMoaH|2rZNcNwQe)puroMk3^M| zD^05+g|a?XnizBPi-BY?ttZ%dtm!Wd?qwg`={^8OABr4s(72;~hI{A1Gji&kb^6rZ zTw#+2KJzpr_LQ~qYKacxLy6&8E|~2gW;=)pUcQw)@R^41S#?j-`u^&WLw-_A8W&(v zco-6^L+PR4n;97(tPS@S?f3%R|JHGf?-XUTn#+wG_?oE+)0CfdmAYGc*+fvhWYOxLjYGe4g1m zm0B6|(4XH~Xgbx>d^*sV2b?sFVb$DAr%R78ylcNpN&)w*e`B#|FG0L=IrYjvUg36D z41ne1RHqF2teLZe(@{}g_K3F7Wn6Y#c~ppyW%H^Muv0e}k@I*rgaiKV(X3nJx0I3l zVa65dDN)|5BzO(}W`qvx-PydZE2!GyR8wd)6Gc|#?^)o%-w{w8f0z3l_n)JQ$gO0t z;>?AJ4J5t5mTfV4{lkMAoX6hvImlfCi-&Tp-javOfkOFS-P89Es{4C0k3rZYBXw25 zsaZnZIrdFVlJy@kyB3)-gHI#;$tX{h__S@xG*_k7+j5GtSXDSTHJDQsrtzW+*`2DjLs*eI^HUwvYKpdd`J>-v@S|&Msq4 zMaI6Fk;%R!cW{$*=)=S;Yf5iJj|j`idcynKAoW3|T5YB6KBi1YFh^+S*5So{N_eL;myZt$~i+r19@)@s7o zb|S9}e;hDv+z)Bi^7RAW)EL0njs_GT*3IueBsYuE%q~o5t(f1Xd6D%>{6nZu1F9`f^IqjPa{%g&BNp)z$k-(tR;NS)8M}UmXUe8g5>I- z;mCLb^@Lg%YabOVF^~C0iTL%oi_n|z0 ze7`sST#`}e>yTd$I@&^AjBb@d?sd~UdDo4Oh8AYf&R%Aoigs!OeG|EL`}S8wHxyzYRlV`()mBQaCX$wFN@#C6_U)p=5U`2it?4g7(*t3;f1mzQ>rf~C|+8d}r-vOai3D|(pgSkQA_ikHu1k&cd zcQim`(Hg>S>*u*mq87|wT>Jzw?UY4SiTz(mtHC7yQ$lH#N-<{oA9%nT?SIwxj`51Z zudqTo5CbTACvjfZTKchTJ&m0CKAe$;!Kyq>9x}_@Q@Y z*@6hgcSW+b8mJSHKF95T^VRTSz5VK>tLnrlyt9v*iN2zy)xg8QvpO$;ky@aXE4?bS zo1r8rX0X~o$Ve(>v`{5Ce$28?Pg%7iZCNw-nUGGIeSLO$=tw*Uy~)IpVZ~9G%u6jd z3*yOzGR+h9QUlu7*!kWD2&k7rGlUY9(Bn{&qi;BlKS55rPEyQ z`lpy*KguQKw9fA?V+(dYj#2+V@Y0+19>?=;MF_p{YB**O;u!a0lt_X(+SE|?U|tW$ z4lQv#F7N>DS*c$-@%K#xEKjD7NJ z0ju#{qb(n>+5YN7q1B4x1+x-cB#!r&*U!l3acta=OOE1ne}-qo)_mwtZFR;f6oa=A z=Dt@9QpvsMG@LlXE$V-gmH$h(7j_HTzf?~J zQ=K=K1{P%inP>Y9uoX6E@G?UYj3Li~&PdtK=DQRVkCGEQrK9kRcT&1XYr7xRK66G>92GV=M41Ho}7+=Rx+G#jz@*dTxU*1bmy zFZvi6CGWDu+q%LB>uH@iFVlJ}K|x^^SPvTd{~9 z6*OiR#LG8-L@TH)xESMsW3o(dpzEDo3Ua@=eA!Y=?#L_rmZ@|W9bICUu7rI0$;!0W z7W5LGi46W}In(yxIdFaiRspDQ5B;sAU1G4 zZ&70a=-}D);7PEPEDE_<7Yi)IOv8J&bP7|!Xi%s21_*hG2PYtw_k?BHHeu``cw{^+ zZoVQ#a7K@xU%j*EU;=eO@j*7c!MPbFN1^!V6OOuYX7`1^wW!!u4vZkzlB`#I7qYoP zMOvB$W84&SjH?8G$Kg?Je&xof^|1l~saf-ny14AI=bBk?x|X(2F9P_#iD=HYEI4HM 
zKpxuY`{JXr5J+zoR8$`TD#kHd>0RiBmwNqtOF3KY%A5XzskhGm>U$|jF21q3IHhNj z(~VkzU=fSYx>1LZ$RmE6ckAw%eEL{~YaD@6;g2|afpoS^HH!N}a1w~W9a5a!WRotq zonhxh;XHWP!SKiHi_>r3fylC*y+MhE5Z9{u(pMP~zjj`o8BV)TyNh0>@) zy^b<780O|Ad3ho-LADxeAjK>JNg6QO5B(pfvC08GKR(QD60&%oa^@=$W12f3mw*ns)Qjyxvu`CX>ohU7Z(nHTUZ#4wdzBq)xh%Qi+`T9oGl!b=uxnhXi;h ziVrYIcu+zP1{^YsXIzTE3gP-BI@8MV)yRj+-ff9$3uL{^SW)@dZ)EaT%AO4LjbTIT zoVwNoN0YziFEN0>ou2730Pr+}1W)flI^#_D$nPdUB-WEcRx*tD1foz}j5Rk1l*IMd zWJX?Nz-P9oig89;khnYd$+j2#lIAhSndj=UK@go+!Cc|rz+9KT-bS0r z#L5{$D{G#XmX?c?n{KV&`KGn)`Gti&=XQ2>c*n0nyf7cc3mtv-TK8`5(LX7zdB*1F zQ$KHCB5kV0Dd1=V4 zh7+8G_uIXLmx);mgA-9??Zyeevnk21_j!Up#FE95r-q4f?FObzua<0h78G(viFv{| z#v>4VBEtr6{aRRro1$VysOz0@px~E=#=WY?`Elk1N)eFes5H}-pa=Cg;>XbsDXsCt zUCE;8EqktxV0J6sH?d|}YzmBWvyOeMR208w;yW^A@WHCBX=ME_<3l4Q0SQn@=~=aD zYl)z;h6}?!r6cL|&;QFo-3oL;nF3;CQS4oLG!e5-xoU>2Z_kK`wAU~N1oc=ZYws^S z@q`0&nX9SFB5ceLM|Zh9ZA=EAib-#%BCKcO!FDoknB8h!DNLkF9&b&2zkSGu%JChw z5^jrv6Hel1_xvUT`0-o;mCT^wbEm35G?!_TlEC=xoTwS=P{Cos@}M`J_PQ(v({TG!5Ef=U-Ny=k)wXT$#bu` zH-{VDn)t}b7r#oCZEfzR=qxS0U8V7G7dIj{7F0)x$O8Hyh&uA zz)*cMKH++<_aves58F*1kDj;CtxVyiXQtBpbbI2FX)q1@I$Qn7dv2i$op5p;>L?4B zTbNQ%o}bav7m?UJmzm)4mz>LNId|iErR5irfkZYu<|*h2xL;rXDztA?LS8 z)JQR90(cdPWPAJo!scho&*u5)BMWS<2Yxqr81z!(KZc^J_W&r-&zD>}4G6?P@aTH} z2UQ(!TNN!RB^6My?tNq9;|JVKZ42(xhBvT8_n9c4sA}4+W1nnS^3nwd7`W7C{NkeY zF@6e;FIN|7o#^RTS5-;p=jV~tWBTMyZ{P|vp_V3IqrWMq zt~jZuXNykfpwfLGjBI#U)Xls36XUeb^;Tw}Uhei}OIUAvOurISKvsmQ&7!%Qt%?(P zDPFz01Jn;Hx}5%YWG8~qqo?qIZ?W=Bpt0!x4*Ep$-{ORq4)UU%t`uFQpBP#fYdfou znY)>#?~KNCj9@1Ia_Pz{z5WoDu3is0w7VE13fw|};%Yi*e(m*qm{rJ&Ny6haaj;k( zpnGfcNS8^+vUQkTn8p#Kb!~-Mf^C!U_u_ld9F2^;Y?W$Boo<~=|5w}oEBkihRA3DiGO*bJB$t&CSLT2I`2%E+k}Co9K+NCu3`m$otp4lry< zxw^WR&(6-aJO2Q)kbh&|uHR#P@lbq1=v+Ka-Il+4t_hS*a~d>gB<1~S=X(Yn#!nY> z0^!;o4znF=;0i2})H!G6;W45{?buelTQu8#di9bMN>mB%f&?sqfX_|&+UR^f<*}cX zfV~G2N_B^OwL6H!I?*_?= zFe4|_Ot@F6HYwt0K%^{z7rN=lzoW-NLs8}}%_JfEZQ@V`%>XR0*o`N3ht+)J$ch(L zFJ!xZj=D#0I9g+JS8!{RU;v9_c9o2t>-VH@k_8}drj#V7csxc zE%l9^7WnZtp3(7c+55^1R&Aiyx$QHNtbev) 
z@Xg}hd8lfq`@Ze|@a}nbi25kP+_1p=N>x9ZnU!kZMjBw5dt~u>T{E>g5>5^t@bp3#lf5SEZE6#HzGASw2 z`7@o_%Fb!+cLcI6n3u}he)7O6YZXcZ{*v=yw_1Ogifl9FF0M@nYX{N2ifrqou7<`U z*A7f?U-=4viI^Kq#OnQ3wDSP}#sF$)Cmooc%Wuq$ch-GcjI{DqUxJVJiFBV-su6bA zuQ>mzy)k%+XZQDeR#upo8ItFE8S$h{TmQhy}`2qtU z1Hypqa>GpwRwnkg_zKi{7s!zV+l`xapD;i=Guy|li`_H9Q6?|ju)3UD0q`L=x6@n# zGIM%*_4W1^!1l==O@^8{llq;XO>3r1zkrzS-_N}4BjcsO$)~b9AMz?-yh{k#tlB@mLR6=|3$ZS{hXdfx(kDX7>zTgQ8{W)R6mhLV8&Ny!i`3 zM@0FI4#|2Gru-*-`S%y}$T$x32gxy()U+x`cQ_?j_`aT9Gk6X&9jb+C4 z!Bljdz3&Grb&$h2)4)e1vn&U|7jY=QXP9nTUs%G(19xB!x%91^ezb2}>zgMT1@8Fcy`Z4f%5#=8-*}xPwaY0;`1PWKe zjRpCqR=FLrKdn`2iPhb8>O;QU21`C{W;?<-lQ$nCg4+}@c(be)L`#nj{iOK(Y~WDD z;mA)nu}pTehK~zn-^iSGc{659n-+e$bD4eqNUo#QScYr=pW=LEO!WuBX_>I2x~BcQ z;Uk^)<5kgFL>^^pp7S`ai(wb%{TD7749!lDM>%&k#G*NUp8@dx?frIO2`~3Uw)j{! zZIlGuu*Q2I4m3^~Z*dSX4C1_mqrBK`(5}R1-GeP#$0{xGHx`7jLkNz4uiV{g=H4`p z8wh+2A9)^Imofc@+r>RMi=ypqr}3h_+T%|VuTrqQ?_dXarn#HAgMKQkFc5q<_BP?k z-(CO=oF{-X5B!x=%ZTi;k8@@xkSl*|-eO_@@6tU8f1uU%$#6|Mr|@S?Y(=S?DM@tU zjR0rvNP>QwG7i3-9!XIs?A@wnad}c$4VB4ceKW2NF?TaicCJT z3&4ABbEs-tes$$5;vLpyf08G;)b5j5g*Sagacc72mN=xgPVh)H{1u;ye{_VK?9*)i zx5<)h2eBt6Uzm=COYGinv{-~oe%E|9Cc3s{5UhfiyZ9dU3oUTpa{%oT;nWCnk+cWE zFc61-faCrHnQ4ux47l1pjX-2O!^!cMz`RFci4Vf|fDrN>GNn=G6y ze}tBw%YLo3LgY?OPfMBdc@s)Y0{!L2jl6d>6sW`(Ai&GEaq~yQW$pJJPaq7?1PAi) z>E1I}<9xFqj4!=t=XG`b82`D%uTTKwNPt8``!9*+(Kw*kEfY68Q*G=N`K@hGbUxs8 zcj`U1lzu=|&;cRY8)Qf!wj9e#!pT{7nr+u#Q=^wDLKQl*inlw8!!`a2VX+gR7FLqw z&I$OrG$_o8a3D_ljM9Ij&O$j~^Y}=?=Rn-yf-osTjBFTP$Qy?B*$tmwMPQ`=%o;9& zRyzA;gCZa!ipDCOw^HOf;aVBidArwDXp8y^EmHa?yB(?X5LeY_@<7{rGwoy0H$zSO zA&=IFO6UBi!IKJZ&a{g>@dR66Wr+V{xROO`&FW+D|Bg8QA@+I+ zn21G0WRqPwnEl<>C2`D%e~JxGUiw{itp7;!iyPxp+E{nK;DEu|DzfNqZHK1w*Msh+ z!G0^r+L-i3A6T+fX`Ij*Q>zV+%mZ&LiKU=)Y1^A6o8BLsg1zukBuj+hPrLV@4la1H z4hhGqkG{Vv2Bk?L8s8G+igeSKq0()|GhR3OZa}39E=PH5rdLV+gIbB6f)>%`41@&V zDA1CZ@)E8a#rX1%)|(a_w4PQTtN;Pf?+3`IjWC8>_JYiv4ALUnfCD0MBq6c&vvVUu zL%!4ByTaO@3rrymSL9{^cKnwF(lkJv37CN+Nhyn{mZ>8jt=SmflUBnfj|Jz8-Fio0 
zv&x_eHPK@FS%^uG@SHWfRAq!`O@UDC_T1v>RiCa`YjI0yU^O7RK2=wLCzAf@RNe`Q z*`*TbuQ5U2j>Bd@W3ir?@P&K;>oAL8=n0R6R7oVc}lfmvJgN7BtY;xZXEH^l^?b63CcN z#7RCm!LYQs9+O1>twQQ_Z&*IbIy-h*O^)r>a%J(Lu44xBTF{y~pPA=PkpGwGXcA|p z{T##G9^q=10_Uh07Iif^RW7sI`*aYvP}`1l$z8m& z2C^9cKY0;o0_;}l4o0{#b!)2G%x+@|$Bw?&`Z;|#g=O273zO5r3~#z$ zfSxOJx-5P(RKn!yt(YEPppV$|o2K0@GOek*$aQUm10R-Ac{clRvjkU|n%ins4kVfI z-qJs2XVQI6q-+DPy&e4$8f8}zDy`G1OnLedTja?~R(mvf7!&u7tCqTW{oMid{RdR~ ze<2@f{M#9=0;`ylG|_4u2csZ}X;$Jq$i;F&aME@x0wyb&7w#`qG6c+~j)c-e5%7n9 z50~`m)f~GQNy8ETdkf$Z&5;SaCjq&tbu%&fu5JSH7dG3G1L%@HWn~=HLCXtV5CkQA zndNy%cfqx7&{gDlhyDlO{O71@;L;KPIrnMXwtmU~LPos_Shj$t3tJrVl9c|3Tn&5R zp9Jsgkl5oIcc5Y_lAFD#eL~iN)@Bw8TCV)E^Lq~m1-QB1r!W%==W1i z1=@H1IA%K(JxD7>S7bJ02n6v?g8~|GW`mFVWjo1Hc$@LORVpdP%_7v}pE-hu9l1Ea zaR_69n=;oeP_=S2sXP{2ub)4nJ?Wyd1kwt)6E}7)6lxgR4J9Ki-wuCpDx30)TbN*nBU40+{k8t(im#dRCm@wYQp@9&`^wBwa(EqJg54|u?Ad7d2n@gvCWPYlBrid~MO-A1K2 z`SUbbexfC^_#6o%YMR)cU*Mznh}bSeJ{inh|Kf}i8U0oC2x|*w8qeo&uL-eQJp*IRyyK{ zOh-mrEXb{uakR87w3^|HHB!rYnk3o|{yUp)#mFRRgyKBI@*~GL16_hD5ApIZEWTK1 zu66BA#2^6h4Zj)!7}Or|(^Wu&)0skB=7S-f~Ip2m+D4?!CAhp|`^g zxE=Md>KCPxjnNwC#9?*Nv>jv%yE|D2{T_%(j!S>Bjg~`Z6zEEY}wC zm2Ov;7n3f9GqCsQ*q6W2g+ZTuWH_(H6KF6k75`XQXS!|R+oneFfZ;wnT?A_#=~EdO zD}+2ow~gR4x(2J0o1grM7OTBf30E4=5m8(JbH>w*mT9Byd0Rw-XX{~kVrf|#e$-Qs zJ|Vsf%=^3SOtkls2p;9otabC?KgwC;QG*MHM&H6Z4viM62*aR5d2@^5)qnkqCjukG z_nO22<<)S=6BGgZH*f0u1=|T>7}zeMC}R7(equ0roTnjbeO8zcb{M!d7FnxH9~`mr zsD5{X=b^+`>xIV*c;F<6ZHDi6e=}buq$-`1$9w97SvK1|;KGe{{p?-grlK>=|6}W` z$WL5A+`?vQpUr9(iZTLlRbq`QVj=?-b66hyjP8bOinkQzGg!+Y+z_niB? 
z{)6a8ocYdv_FjAKwb!G&@rT*&Ysl$SOqI=|?WC%w%A zahZbbkP^7QkBj0yFc~!rWykycnb_HyETs{Zl)RRNESP`W0m^BilH4WHp=*?d4#@ty zaTn$83|Y2o5@2`rR~}V}8-2iqoU^)(w8759;{xYy3v|*ceZFMPAo2K4v`?Ypjp+GR z3moZI8cUEX2SKsRV31Z^oDpvH@WBsmtSKLR{hxi{pB<@8=|vf|_*^eup6<;Xk7*v@ z#E-o{s#fE1rs_Gzg`61#M@i?$zooO?oUD-KF@i3LDQ)KAbYse1teQ-wmu;~`E26_t zhu0Ml~V*3;AejlSB~9yxL37oSZ6<|7mhBM!>j&W zBG`oKkWpQTK;x^ zYAyXL$SUEwt?FYi2rOpXmL&EUgj-xhhjqKOAoS*FN9AY!N0{9O-Eu(d({*2O20A-i z;Y`YT#sjnbD-$!8&vueO29Z}`Jy^&5OWOy|R}zxUNJ~E4!Zn9OLA_p!mDz;z5^VJX z*ik1Efst13-ScWj#W`^H=64jWnSP#6LQAN#;(~PX_ zc=*NP`)VSZi&yoV%U58b6U|RI<2)#~t0r7ZL-JJ&6E=e~0Cm;Z^>oqC0`YwP;?sc__sM@r! z)v9(AOp{Q|dP#&)i{L(ayzjMyuXmNzv}#2|QiDm#BWfb38b0PsXJnX&;FW_3qTa6O z2^wlSO{asGkKY^mFImFOYnRobFD=XNGTHMtHWq=8bBry%-J{w@@kgD*Zm}ChD;FjD z*Gc)`2j<`BrV+LPm-I8WZKg50Ff>}}J4~QM20yZhq&H?&$HtWKi@tpzer&ZF5OkpY zGAP!14C%XMFOO6{Yk5bMFv7kgk<2sLw^rV=f22fDUAu0`9s5Vrm`sO(+&^Y;*DB7D zN0~7tngGdfLOS`3*IG>N1P8?_3ZdG47^KFHI0kj~l;ST_-oL zDvtend1Qg4mGaR|s*+7(_p4SmSs6&Mhb)9F37mHSIC=0*_-9!VcZcq-u0Q|26GPS$ zLFp=hjcZeV`8z;hXISa)q52i}3)h(rAsd$=+1f&=Q!X@Q=xySdbii@2=J}k~U-5l{iGS@Kc^)BlhX3ZK9PCue9{-mwMX@!|#WcK#_Dr***OL zyDP%0m;z-12|OV|3yyr~=G~NQA6o(OlQxTV;T%}X9Xx%uNjj|;%HK`dbM6%{(vGhq z#$ImRbTMKzkl2LaJLxYHUv7@9x`x0WNa1q28W@^JI+@Xvz+FYDgJ3uFfkCl>SCUd8 z!w8=>n`NL#j`5mbryu;C+nwLv_xLZ>Z~6bom#ygj+C23h574SpVsH>b`2i)iz;?JV zfG5sT`Ph-4kHj+(t4+(CTb+y8HFLlPRRvQ~l8>Z33l>7&&KITU)0#Cz!n>>X{iO_vUpGK&w0AB$_Im~a3{T1{5v3Lz3DweGKgj+Za zZ+hHu2)Ck;yTUBOgj+&3JR=`(==|VKH)e>$|62(DBfPsQ{8^g++M*tOu*eC=a<~K)v*+$=dbO_8Y{^ zNQ#IjqtfmPTsSAtf(A>KJ0X_qk$piOi0a|04-VQie_om=>Irx1Eu3#1Iv*VJ;p#KZ z?M$!CJFh$kueReFE7)VXGJ+KCt`=kYE47R~>_-&*;=Hw)8Y^8}kJP_&IL5|91VC$f}Phez#Ni{4SkoAf|dp zx3h^>HMo?b``3V)L{3pS|Ej$utcxPl&nl6Rq`i_cU4_3x?MZlY?7;db>zx_`+%~sRpul9z`RW zAa3<`ElP?_kI#jlG$<#042;Bl!|)#{DI2~JA5lJ6T{MUvmS3MBSW4h}jaS=+)qaH6 zsL~o7x4!)0CkeVI+l!03_H)#@;w@2IX{alZLCKgAW$#KU8vuf2F7( zHD8gw>exou@r|)?s&2F%&Vg)#J;)dHPaVu6U$7>xaP#XDfA%^0 zJ;qf!pFPo;OoB#irRvJ`n;DPB*`=dgAp0Ei0 
z5%zzT7TSo`ER(M;9D$}5V^4XI)-_70?rnPXkr8i{;B}4MYHLJR0d!!{DN4c>OGmFw z_dIit7s2u3=@ViCdVZENg8j&IB5%V~jE=gGB9rji^8NB`RP%CKQ#F=CV$)5;pqq5P zM@QoVuOhb1$fcbJeu?kSCBbdeOU_kXz}nQX`r{qP*> zij&mYb+H6qs|9RC8{ss5uBOB=+uVT<`{6{a*f;|RanARHJs7@OF>LZ; z9V@(@8QH`~*C+~Q3;6GL0D9-o6%K#~2}-N->YgN6rv_Lg{Bc=?aA>ICo0(Efx_K18 zniQQ=lvXBfZSB3;@6LrH5)O_`a;BnpkuX(@Z%QFQ$4YNpeSB8!CDUC~4?@B@JiwbQ zfPMA$FrZ08JM0N7hY6?PU@lo|4)h!)ktPJmrVF0-&Nnt53&CiIdDcT6 z{FYB_gQ?U_oLN8b(;4h~Q9w_hU9#6JW-P4Zdk^`z@4|s065=z>CX5^G5jea7_6RP)fUJT5E_`BELj@KweQSY%g2_o6 zajFcu!aZ<{Ld{S$xI~k(meaNiY{N)05MYai|JOsxmb*&|#BLlltz7hVZ!^X$&5RCL zB(FlV_xoX%zwRHARgB$pU~g9(byw5V6gc&ma4#=%HVVlm2vYeqXtm5N003PZdnb%|AB7-(KYp1r_~8 z<9lU@&^D<`D7(g%8Ww!YPMIfnmjFmR?%+p<4A~T^Fe>W=h44}x^?%~Ra)YM8rQkzcFU@I)N9R zltzSg+-=3e_Yi=0Qgqa_E@F%l@aJ+Rt8dC*)NC~mB=OwZzh6e*hudI`B>m}JdBWI8#CibQ0`N$mdym8n$a1bp(e|;a>taTyL`qv z|M`or>aV@^SM*Q=Ro;ENv?NYqv{QbkwpF*KCq-psjO?gPrW0%-< z{DE*N;zj@JTCUPrv%_5QhhS271 z80%Sk7iIoL5id$^1c+L1Mbp>ETa%$G>gqJ$s*s%V@$pa|FjV=*X=?_0-EyGB7-_Bu zyN8D&*NToJk;crrnEX`Hw7>Rm3qw%j7xXZ|e8*(It5lHmnQ|GniZn9B@3FZvxM`TZ znJnswxM4u0^#3^2;>F-sBlK8SZ(55j5`kq=5|$H|tsLx`tHH7I>(h`6S*HBLEgTM? 
zuH<%*6AXTYJgf+kR@vz$f-_m=wB8~(LX0ils<(N(%JC}3GT#3KQs}<%$0Zf}^}jR3 zz5^o*F>eu*4@?qNp=kc8uO2JO&*k%U9q2Bs2O-Cm8@)S4el<%i2^YaLBM0M_E85l( z7H{X_IdjQ&j3NHuL7+WuJtLX%FPqcI1J7)?d%E^|3M{vwwJs9r&fSMNGyrD)_HVTK z?wL`ryC@+ZnD*GsNCBgOeCFgm4J8_JVt;eke+GyOFKVHo<8h+A)Ej4b8PU6RG%7Q_ z5hrfCZpJyL;(az~t{j3gE4Q?=cz5t;OSH3oyINQ2%CZt$d-?o5RAIiq?PQVCSvG9Wef>mq)y1sZuUcHCYQA`+QuOu{x^kH1cY$ot@5c4P&~ zfRx_PW*o*!83J9wqm5qnB7_X_Le{-UZBp7aD94_mQ}#R}f^duMJM{RY-B-Jt2pb1h ziK64Z6ht-3p=K~pMYo%kd3#JOaHY&o>)kq!o5?9y%5%hu%E^vCu78NUc1)}vdC5vk zxEVk>%=`X2xR65n(uQD4wSqV+_X)2yHTd-7JKfEBlo;-QPbc4}_irtLVR{Cce@1Mj zOR#guS+`BKQlrvs$wJz9&6mwQv79+v&=-CYhYIm+dPbVMf9AC$(7o1s4`w2VJ@OpR zHYl?-a6kSFRZD_1q-`1qZ(5`Q#o?doU;~2>q5Aa@x;au=?0_p(PPLSLCB4+0LA2_=o_}|is52bW5UFH#F2me$Favy!NC|$*)y6eWemj6T|e`fwR zI@J0yo|Ua@vHog%H5SLQ{FO+PefeLhG(?aa4xK>`Z+S!b*T3X&H6bg6isFSkkPvc0hGq~;*ZLrEfhM5cQ(`(?O%+$`yLEQ~T?clw`fJcyWTxgYN{aq63++mG3Vxce6}Z(ZPEGvq$}7WuVlz1ncHwFme#562fDacj%1s zn-~`v0-moi+~>4Zj<&*-)?;^zP^%dg&9Sn=_B%4c;O@I-eqbpbYl>C%GodxkO5C)s^8{2Djctsh%Ux-knqY+vS~_=$>XoA*4Ck_Ir7GYdMkBu z8c=C==qHIq(iyf5iKITBSt%weEUP&tjFIovw%^-!L-4j>){5AEl7trc*-Q#pBn(2ioDV5u)JC3pMxG zX4YcH%x4X*-{NYr0Mk&E5fO400dESi>r8I&hd1%fFJR&TphQrAK8Knc3RN~^J;?j3>1;TL>vj5FQvJVh@85fyilz+>~w z3f_m4*H4qwC-<3pd>^#UM+|NYiA>7hYrl*Vo zRU}Z+6o^rA$t-xRA~UqBf46YDrd>6(c?NO(H@A*W?yt?=_`>Z&Im6B&ypp}WQDwD% zC(uXU@jN`0j>%@sU5#guft;S7iP$;FV#T1YTu@q;H5rOzED-pbUhK%;N{jb#(FfY! 
zHvOeD5(=!G2+-rny0qKhKWVRjRz>`q=!A`@jlyzOyU+i*8B>>d*%ieRW|>2amPnIP zMQjQ;U5s3SIv$qnYIM2@)<@t_eOw4GBAoLO9_~qI5f3~3OA9Se$;`Z& zyTivW0nSFJ+L2Z36Di=bnMq_L$JSuiBTy+dA|aVStZjM$7mwRTKl0-@!;N<+Sf(sF z41YiJg>$>=j%Ux)2Zp3*a;E)Gl*TFx7$ria7B;|-st+=yI3Cv8iYK~xRbh{DkK!E8 zW}91pR^O=B=Iq+U%Y!Y;G6*KbAH+Pc%rZaR;9nD-mk?^@$Yz(A7E*HaR&rKv{(hfs zJz*9Bh+;08={0``a|>dPy_ouabLM4@Ugp21UN=fXcKRrPd4uh$zI{incR7<`!$oH- zS<$vGbCODcVqwlGL|+pVFWU`X=f773DO5beb*zohd6FB{2#Jpokr{>dK$FX$gg`xh+=R{GpM*^`Kc~uVR zeO5E_H!%t}38k=9ja+2GQ!M=NiS^w!y*56v<9Z`zBW%8 z;d=q4B5NA$n)eCyn;Bgk&=AY&M7M~sRs^c1ykvQ(ho#~i85NaH6;6Jq>)3_kjQFiM5Bs zBjZQW9pamk%6Lv-RhIyGP?YyQMq`KqK}oqz0NJ<0pua3AEc^@L>(#wcIJyFnKg+vd zk1~#lfo-1&cw$kGQ3yBeoQ0`kJm0(jfeiE4lo08Gq~Nj_(^|(#Y8y_`H?e0d+&h=M z@`V)MG4A|w9tFWTdE=ubW=V{__zyGLaCKT~tZ?Dx;tc5m>`*qOPD)3<)XTBI7BXHs zjY>JOpQ9H(qe|-qRaEUezC0IHjJ_3ieNHinD-t{p@MYeQP7P2VviG2**8wx9Iu!Pm zDn&N>KMT&!-LLpHI!^IsWziL&D8^hnf)F=SNL^5@e)qK9Gxr)LQrv?08}$TCQR4sN zAF@-R>2KR64T#gaTrecyt9>hvJqrjji%peaV?hDMjWEtG1}5~$fQm*(HSMR`EN5eQ z#AIz|8&N89x*MO8GH8N{b(w!W*flTEh!&nP;~~l}#_;TK4`&K!5|pSh<=IRobx=)m zq_AgcYUKSYaeL$xe3P`6RsO+NAjWP<=h)uELIpcjmz;|-iSo}G&7v;=mHQt@3{N#) zj{RvOV;Kgqq1J$?>Vz&_6F8&bLGhk{f4geNp<8w)3Xy1HW}f=vx!0uLupBM!qi5Cp zX2$om{!c_?l`JM5O3RrqCf$9M1?q0ny(b?Vzz{kQC>Tw2ezsSEJYpIR$y9YN3lSP~ zN*5rOpP>2p)0W4i`ReTN%(q+D=RON!^r*FOq$6R6-bCDqR+8pm`|Vz*9Y5a7nim|L z1F84@7>J;D1@X-;{BdnXJt+TbSCp~dmIS$LnRVEY6j!Q#s*+pOWgX;5ra>!Wt0Xz} z#Cm1iIkW0wLhnoDxI=?)1?WsK_Xjd0yxe3s~S36lg zN{|;eIPM^5D@eu7Yoz3|pzh$K8%}wXSnQ-z`~AKT8U7S+JR69U{@*Ob2ratW_k|KJ z>?hN&IKFEIs2zKxEO78dhJTiGRSwd8WM<4SnL*Rch_yYYGf`oKP&8}aFxH#S_sg?s z)05)XCnf#Nj;G2N#oD6-|71P5y5A2?j1OqD6MhN(Syrjhsr-9==f>)6npgeFH&ol} zD*|i5z0p@I- zUWsp4h&c4hz1CDkW;^9wy}wiTEH2$Q#@4zJi2V#Wh}ve(0EN5L#gFPH%pv;|iQePH zjp8RjblO1z#HVqYBr<33)=wJ$Rx1*x*H=dALeQ=?=7Lf1t=Y+pIC)iFC;T(W5=9xFm|Rj8mq% z67)u4-P>#E#7!{(-gGc$68R8h1TZfG81@QOBLl5PI}km*il%oBzM8taUP(H;ehau~ zK#=jp55Atb|Lj&HQBfQ7s2S{>bM-VO>V^-D`-hy#JmBunV0A0>|NeSpUxQC-gz?8mlh zHi7;mlVRw72T_p~@7RtWs=5h# 
za!3jrmCC9QChDvqCHYcKQnY8#8wT5!%HrZpKOZ3y6v9-&_YU2C0$|)QZg0-Xhw66Q zX_)*DIC;KsnZNwAE+uUlDOp7GoVWW1$fER4I*7k5*6+sXpu)SS$>&espW0G=4Frg7 z64v@fAE&}65)_Vkl!g3iE_m;+%=K#n>7OzblP~~YpLb7lmFQ2O*N`VJi+OZ};0%EE zlFKh$_Q|%Zq+}NBAf(EjqNtY1ebF-=Z&nTb+q%7|loEJ(n9Rp|ZVM#z+X-DBtDZ{> z4Ht4xkPt=z2M>A|}&Imr{5nuFuS#^Il(A;Ice2E9bWC98Zj_1Jl|MkL`7#RQiVae0R zJ-}{k$ZzObyh{InSl^fEnQs?GWDmJz8^b!&pG;$RKHo8H|JWB$jDqKyAT}gVHG>6B zqQql^$UbnlA*nH+WDGfim`cexq)9g{9s$!$Ew8&M@Ok#X;h0C@w z*nBE)nxsD*-&!jFyj{5ZVX%jH-#n&30Q@87%svwqNw9}9x&*hl2KxTZBMrCFGbpX; z%n7(Jn}^-WNG=>sXm=#f^VYL@~-#oA1qqkSzqJ|p!BvE<<(Ew~0vY;7x z020DW=DTvT^8o8OHKlBWh@LkZT6Qi-4ZY#G72Sz)mVi5SydYNzmj1lMl z-f~gHPKRrdzzs8)m(|-uRc@8yV~R^U0ec`Z7V-mB+buWRxx`y3$Z71W3eE(~Dx>oA zr)4vCC>l9ne zWE)LZ{=>Sre6F+fBl<^{N%d~Tnc&RLDRpk9!5pB{uF&-#3JU> z)=8L0$-{tWk$FXyY$D(AKgCL_8jK|r%$}ZRC^b<4A% zpC&C#?7!A;D|pqWGAT8=*=bXs>3Ro4RtlJcK-2H^M<)^!q>PJT$OjuNMu6)oj^N^V zz$*X@e1?Hb@IweSdDQ%UNDBm##kWQPuZ!Iy$xyT5(3KK;OxW{(BL_my`x`28{#Wk} zBk`|PrnPWHnW%A1D>Ktbr&B(vh_R-CdIcksu(Mquj*-9mA3tEyp3eJhu=^gBbEq}cQDhoShbvozxp8`|=E2xw`VFN1S9 z*~~lSrE&9*36sQk;~ccywmQ5~?*$I!4D@yCO&FzLLtEgv+zka6Hy@X@n@JBapzlnZ_5MB zBh>Tajg4UhPL%6E*hisR;}zu|(aUF$kq_6|@!V*@0jr(TTOPd|6R_wV^QtqocTY~) zI0bg7G>m*^G#j$Fg8o@lK)^YR7J5<&w~qXL;GN}*i^?OH9pSeuI)C3Uw9k>ZJ9Fjo zMg+zzXG(|%4fe3J&HRa(g($3G=~N=MKCCnHDW`7X0uz5P|5Hs!5)V4rk6xmH38b06 zw1CP-`tk`BBK7X)m75_E!>ryz!}h*^6=*r}B%{rxEMuRX{(5#6uTtJuXo)hQ$D6P) zaUdy%@ywAr1lNik=R)Sf*g2qX`0S%~x*p~hC&kz0|Kej4yfQofbhnBmT$0~sk9 ziY~f1v-$3!-F)-)90E!j2-vv)qjH1bY0&SlM3n!2Li&qckDRkig3RTmw#Y;{v}FhB zrnCM%_u(6O*FdjER**Y0s_>UOYz#ZZWwog~Bc=9>FIadf_Qg*st!rN>SBBrPNUu6Z zqX!=P$jhvq(G+*&$c&VTp{%*3#q;W7DY&>P1;y@LBwX1ZdIwZ~Pn*9yr0b15~aYLOooWe{Sy-n{2 zmsANdyBj>4Wk=d#Q`|8p6i$W}YXlulUZX{B*JTed!fH=Pg$>fSzm8X;(ZkX;= z7$xm-+J*Qrtny7CRO?wHk?uWNAUR3j^P2_Gr0A9-yUPg<8D86eKkhP1BG4^GTQNu~ zK5%-i9I`iRSuy>(hT29yaqHKQsQ9r^bzw2mrbrM%75DwVam0R))w(zhBc?Xf-SUIKGzVU?=n}Y1x?Wgn6%0^xucZLQPEGLCZlCdWqnJPvmT10x* z_XV81Pjknknp*}z{1DUPweXkp#Uc%soJ;oE5T6&ZKDS#5NyH~s^baA<_=FB9Sgh+K 
zUzrWS+bo!URfFRYzOt=)2E5uGN%`n+Z!{1h(a9Q#{OWBq?0e`1b(=eg)8ek1ROgj5 z2MXsB3+F)=S{UJ$YAdrv7|1yq6exKn?`#cdMiHQm9*vUy1}0PgA%FQNY)H}mck5&N zrM%tR#azt`#tp3@besGunpAGCG>yRBRYmsb)nggqG z!~<_be*gMui^oi+%6G{3)$0|nsWE{Xo^`fZ z20QifWFFXk|ClM79qG$QvV$1DCTQpmXp*>1ijw}Nd~bg?0w>tRb7*z;h@MhWkm9QZ z0={9%X=z{?&*Z}`-xJ$1iq-WAUF8-QexuV)lS2GSklXH&Wiz|f7v{4Qc@5GIa-^kq z=M-xe`@XZwk`=F_S{?DDCcGwZqVG|Fr2g{^`=5GsH_RZ^$knKDI1di_&wTQac=j%6 z*y&+d$GvNuW+YhETI`%__s)Dfl+xwOUKSH6V5%?43RKSq-PdqQS2oy2bQ(6V8aKm> zHd>~gxR>?XDcHh_HTn{9r194_m-+)}%Cku56?_tOp;2ob9B}KaF_`e9Hsk)}5FeLQ zzJWrvFqJp?J-pnDAD}H0TtuU}$vk+@-yiy?c=HdmOp8C_-i;}-vgisF7+j2eQ-AKw zQ!&vOb#5a+Y3xY|y!;t)_Mm+ze02RE-taM9sq|QR;i=oLWU(#kiP}@>zyDQWW-nPT zba6M$I68GCqz95n)*bwd=R_du=*VOD#Gs)@FE?UC<_g1Bv6MEoiF5?9I%?lsMgdW& z2DS%SJ)UYZT@zK?Ep3KW#%OuS@wnowsm;`0KN+ab93mCJ2l)w+;YbA*-=L?zll{!;Mezp|P2VdC8i;+59XbWsqcHGU4e}Q%(X&wpd)JP3)C7AO#*Cpa7jsY7A3yP$EFJ{TK@IIb%Da z4EFf=9>#0_KN6%2dTM}7bDnYL0v4tqk){RhnNns_DU`0slw)`=40j_Tpl$aB+afF5 zC|sSM9?m8gj~@E#;O)9Z#zLjIT0WXK7E%R0lzvge$dNSkGyWO^-pT7F8OOe6hiZB6 znDExH{z%!8b>*Y}4N=!*ymQfk1qH9Lp3$fGYl-q)Yiojw=qB$ER5fFOV+;OO}EG4CQaQ=~m<(F~WPDK&|}xy2y5_+yyv$O1Oh-N65eu z_4ksd9k(db4K=0B662k%HR|_&oCfylx67R&3qq;UfhEOLc<>vr@-lVWQD}AvWFYyF zeTl9c$-dKT2Ox9Xe`sbHSq)rnSR$)JExUIufDNSnPxV0BGfcV_?WS;nOaDb$yE3DO z6CZF+4h(7UL)2p(SzlE1%re>hOYe<*=9w=#U`TqVFpO*#W(d!+(;QwZa_tFKyT=y5+zy;ck+D^JAz^_S>efsaK zjmkaEsxKh>C-a^7mm$Ht1@IDVM&SCbFZPAwkH{cDQChdU&bbDv&wLwl!KGpxz*2rn z3|wKSjIy9kBEKgxS%q-!ra&=b83Rxw4_MF@6COl~(38%J(mIYM4v5@??b|!@Q6=Vl z<464-mI*}BTFZj+W<6iI3kNy++Z&<3uzhu(KjGJ-<(*5xzGbmXkrm@gP|adMRknDZgWCEt^J??zQ(MlsXdjW%5b)YH5iXB z0}Zd1HGsQWilHlY2DNgqA)a)%$mH(w7GA9BaC2+!CvwufZyeZM(*i>}4OKDpg2VIo=@LR3$tZSwZgQyIsTg~I#4<12S)`0Ap{ zz+^U?!1vEio`%C8*RLrD*%}oomjTAdcLmR*1J@>o8Wqe{Q7r7fwIlywH$~`YYFiYb z?%{&B$Do*77JK7pV+(>_gWh5%FTDJ@!VLlZrVrUTbu!jx$L0m7(H9W6c*>QDi3u7o z_+~+CO@k~rGQc2pjt*adF)wwz%-~2&GY+`gkp%`fR)RiiC?5?D#;^e%sy-R#rMjQ% z_vk_?-||7y^(}5(fDUk&JP2Z$Z1b^amXH!C2*n1fr7$75rByT{;OewG*M_DRyYlV& 
z9{z5(DnEvSZiT%)G>H9aYp&y&J*18W8qkpEi=5iCj-+A?x*~@7$g!1O@$qD5X}ET@Pr~Fs%Yj#mi7E*0mX>Rm!ut}R{J?|+@%IL6(Q9tE)d_D)GQDnZk=F=}74_t_iZAb2t;EdW+xP%O3-aM3@* zlV><@{hL|S>(>a!HGxIO$Q%ZHMRf$cP?Ft?E4-E>i~ z$SEh}j?R04Nk^kqha^>EBeBEO13JW3@G%Dl&?aZ5rd|ZA?Jw8wMvz(5<6p+pcMD=c?=>iUflDi5-MwA1!7eAiUeT!q zsH9w~N?D{I9M%Bnh3@M#TkoPsDdFb$z#{U!{O7ll30fDE>;$<0x$FMOiC;pdvzw}o zJO`i!8}c1FOzIiEcbSz!%gQ^!Cu=&sR$+%>N-Zi3=bj`35v@n@oXhWlR=Tgxl6 zbBlSoe-AJ|M7wz+`X%*1$C0ht=Cz{!OJQ5l2*o|G1kO$>ul2{9-KOEnS!a*v=Jul* zX1@za3sfW09l3Za{ijIjHIDuy!>WHE8~cMDdtl7}z;k9o6tA{a9 z_M+yweU&49jV@yrtk$JNm_$n>$VAE1@-E=hhdTT=6LYn0fe%ctm->4+2pV#;&Qa5j z#i&mhZl7pr)}Y)k9}G^VZ?b>=ZFsO+&Pu(?jF0EMrpFU!!TIld~1K5^J6o#b9; zX)WAV*iJk5*SmDLTX>GKKI-x%Kfos)>7@7WV9LGzy6(!mZMRCip5w$jj*3td+$0vr2vcZX8*VV7AZWSbM^+~#MUE|ZT{kyS{q`P=&+ zt5hj~zdkY)U|bZ4?d8f0@G{^jGw0ayad!@mel5SYhwj&-t#n%$j$2>VA(rNKU9bR- zR$ovo$E82GBDgDay%>EQ;ojp^N4x*@`nkHr@|XCr7U7zf_*2&1yk*g0G(Tmco5SG> z9WY-Fgtd277EnP6v^7O0rjpIudVUNI-Dnn$FRp9XU0k0(>tdmt@dA{ndyo10K#QK+ zg#tLdABEH2*wfwIEm}FthA(cl*7`o&zd@xI%jD#lj#wLLG*_D@OkMZV zJ&JiL>2yt*Mdq2m+Sh)L9`JqpN05hsU*(UyFXvcu@_*Ra;z^6JBoHzen?X+GQldmk z>?cCoFLZkonaXj_7_1t_Jhj>xodxO$7i~B-O8p$bddowTy29nli?E7f*t146i2J@v zkv3@t#q&GatR5ME`W^dC5CrjounSWF@zXn0-e>gMCyuR%>1@m;bKt%$ zd<0V^YB295RLo`l`@8G(1!Wk%l~@bYl|Q2;2O)KweZEHAH%v1ftcN-o>w=wXN}rTE zLn_q++FlxcU(OJf#!*zST)Vti(D|JF^OMSg(q6feuRi87FMM0irz)3|>c<0RVkpf1 zCIx5kJwOV6xv#LCd(F+*PM`M3zc9M!vy2KcUdu0}*2kiOSsQe5v8Acsa?$7bx{G!@ zk6&k6U<55v=6`*=KDZ_yMlFPLHz6=1842~dTL`uv@Qp632H^ z(Y*J*A>^}3%gz4v<*6?s!lU0DaF}4#EaJRwvQ&>o3~bbjzfv39EE-JQ@{UrKIwDru zJaDn=cg`(;@{Kr5+!fNgUKbm~R~rg@n{K6#YZG~(<{!H4l?;;hGVuGnneOb#KWTGH z7vBmqP%#HLjwAzUFbmeSyXD#|_oiSxZ~6XU=@?JI_Ef~9JLJve*Y~x+5DQ>$CdeNa z{GZnKzgA#Paa#Ss-yT;y$6dBlkP=1>1+Q%epaa8|B1vRi3m6Q}szCL%>8qEG9>=i# zegI^QvH-zMHL)k(S}AP`sO<0@ONL)=hz=>E$V?_2Ds)dLVbUdZj{d46vq%EU6%yv> zofcSPSI3MR6-#K*RFZd6;855hcknvUiZ^>5(Zgf#{9*o42HZ1GlDitVFk=Zx6tLZ^ z+uPHtgqUaw7W#6Jqulvb6{~mOmcV^5CG*!(-cq5q)#c5FL60q|x##w%(VzVeKzJcJ 
zur?#zii-R`vEf>YsSSE<{*1+2N$H-c$!^n&P=0I5zfD0nIJj7_TuT4N>MH$JA$;-5 zYKk)7YiUMqtrv5O{*=F)CJdyc}2MhL)Tqh#`KTODf;)WvsRMb7fA$dD@jwCu=a<6~Uaz8;L z8$-&_RCoS1I_r5#eLgYg>g+=kHo_fwBF#xJEkS7%1*6ew`_R^jvKProBpTLKeOP42 z=QhMiUwxk@^ZV;8XJfne3!B>QuAMeD)}BSw46g(=9Cb-knXxbNz)DL36CI*RzZu7C zXu@3Dp@<0q`j;IaQz;>zo652^qARQ$jc%U>fg$HTPs;9LfE&6U)YC}u0J~Plv7uc7 zxRDAybf+6uFr*uDg2ZkY$wMD?AsZ`z3kEPDs=$MXcS^x~0az8dTh%^`h+cM_@rqjc z4fy>&B1HB{P_3#+vjhMPMae6R+E;$*OiQKO5_5zL&^9h z@XPXsp_`ZiQeqX|!VY*Uq^VFEzNS$L_MnWRg-R(w&m9J;Px!i{!F5jrkQ+l17P^)3 z%=HZG09oyB(P-xye~n{e$k{!J;hk)(2`swp-SdlnGgAmIKP(QnTJ295aH!ub2xN8= zi$>mqoGJd87KTWK0zUZ)h-Y|cgS_fYr89wV1B;3mE4e`A7qVIHR6jJ52Z!sdG3>1? zQFqFlDFcTl7ZV9!<(1&R(2oQCDZV)~xZ(mXXcJM#X#6RE3gtqpP7(KFefmX<6ZC>dHS1NNoKo#r? zsE>BG>P)AN)P^a#8!HmBu0k6){Xga_z6~-`i1|bdl##M=vAra`@xhkWvFuJ|moa~? z(iha5)ZJ0?Ex*aN>XIhOuxtjqn?cO+p8P#vj5BhM(9XM6v;ZG~v8)^*4ITr`U}JHrs#!{ADBM#Nq0|-Z$^dllkAZ*99d; z2q28g$jZuE7Y4=zq>%Kunqg9di%0}K_q%D*`hnjAs1xm-02A0$x<3(7_oKS=x zc2}2O4i69CnyjKf55NRgiHnI=-W8^sG@vs~1v)RT$>h9fvDd5M6}sC-!h*E(~pc@V8X&;a=D&oD$5Q)35_T-W>3r zYH|_WQyaeSF8!=&Pjo04vM)C285FB5F)UsReV?g5kjQmOFi>4^CJJao@>31>hjc4H zBKhjw9u|^WXkXoW4K2CdeCgtWRFYa;ZT6!;&iZnB|E4i}&INfAo`P8nl6YBczH|NM zAka9rw5(@(O3_4gfhXu2VZ_6uF}*FEHq$Rwab z5;k)*t+NPeE%;&byFYC^I7`pTF?@YFj#nLOLN};QR2#{m!hYUT*37DJ>_ts;sL+*h z_BG?^13Zr`mwxH|t%M$>G?`0|F_D>`d|O&FWK_MxtM4HvtdZ?+mvw|&O5~5n&{iyr z*CA)F9Yxcal~9u(=Ob8GuaW+7^9Y`hgVbGF;pj|cF>3QzQ<}t?*aU0juk{ql=w(rU zwoGh&FXFF1kP>;Qsum9kLhjr`2*M#g(AUDv8yy|hWvaj%>x?NIQ&Nyr_`mc5{~})k z;j}<}DV?MT<0MlktLCjJ{ey)m@_0UPLzLu@kv&(TB8(EiSuV1$;~}x8jM&gMGyV1*uVH-vl_3X2)462XcflnlqI(!d%0!vYs*_{S-mL^ zkq!&~M35%~Dcs)1h#&JZlji@I;MH3}x@g(bT*Rw(+~R&yIp*zPl#Ji|v4JuJ|2lSp zYO4bw{FOd8TOT!i7M~VV!xOJ{dyNoO@i&=Iw%4zen8VLdQy6-@tRniXmr3RP07#M? 
z4E&$dJqI?RcTb9VG{=5EQV{?9U!#68O>z=yC$lj;h@%ALD?LRTTRuW^%4Hs0y`Wg_ zE13V);A`PljrEVz`78j0^3CV&#&&u&EHNf4gCac51y>mjWivW<11_^*ZJteD4B|+a zf3z2IWfz9~9^zLrKzJ*DBh~OA3~GQC zSpsNkNgx3qB(L!y5=PWtDk~l5L1~#cA`X2s;{dpoz}J)(JLtg0Tg$owbQFL05G784 z#q@7*{d+ObPa%JIqz-G+{D2IKBJuBRL(+W9l4mAl;EpNU1Ja2eu`;Dy+$jyEEDb9f zkSKH~e>uE#kXM%}&q)5ET{JFS7Ckmjb6z?TJ3z z3-9HNuh?DBM*MY)1VhVve!g0R=SlL9cjtAgf-P9?gS8WN<{(2hMZ^3Ddn?(&anaSW@77Jk@E{(u4AMd_=qp;2exFj0^TNYBQE=d37ehRWY6a znQ=}N$h|njL1G;Kj_9p&znp{78N5=g4fpzAWW7~b6@Kh6_ zhQ1(}3-wpTp@5`eF@1Vls+2IkGc0hAW;Y;kO_FQVngG|3+kw`hnLUpgAyWw*U)O?LO!<`8nA z)vaz7tx;_)q)-+kH^Pr0fc-{c5XzAF3abQ}+?dNZ_isN6BSNh23!cvl!|wA@sY{oq zE!c*;p@02O)05NBXZ`FL*R_ zR$)O**rY&mP661=E*`QjEdZ_zPG)_N$P;;{1-W9N+qA*O{WlX>(zf;~C@84b^tt2{ zfHJ;5-PljiwpmF}{yJb!^l*uY%V$!C@HB~{4L2MbHx(y={^dd_JG&XM{V(n)P$@RX zDmUcIroEbTAljOP9ODHEnXE=pU$s4Fq0d@u0em$U;H!6%mvzYIVMTij1V{LXI){~` zl$?XYy&6K5Ju1KW60nZG1p{Y8g029K+f?%}oM%h&bl%Wp7|4G;;}e<#mFj~dBTG;N zDSqQNv)HmZL?WPqrn6qy<6s0vSAiM5mW_zGXXUbW4SS8=brfJf_3r5aV{PE%89rQS zyl2z5Xj%#ZZ^4{>+Q_y8A0aU51_8}M9A|IUjBE*EV(Yha7%@Dl?sL+J z2tARwz!S7*pJ4JmW9A}H<=8DN=yx~;Z0h1^-r+ABVue>{9A8V{V7|`l5WIHx43TVE zh6XBIXE0W#Zq#?g!(E*VXz>Yg`?m@kjJ?3kRO%lq??yUj6Mzcwb>tBe+9t&<(myC& zIgQ*YibN-3Q*bWZh6fsaeoid6HWU%|tAsMQxL3n`?agv=Xkfr9jN*}Pyc^Dz;->qr z{;JPSd6TX5*Kw^3Q0``>yWFBfcI5LS=x28D88&bL%_|wEWgzFk8w{ZYO&k6dB5k0l zh8ChJhFG7fd^1)Vnr;#!rA9_7iim_pBCvh5*sQG5(VCm|5EjStgxa|f-C+_ zB3tzWj<|30x8fPTh#tpNZc=dL#IrXloQBa`(332Ob?WHZeUum9#5Q@aBW(~L2(2D= zq_8DNg6HHZh&Y#q^Q;(S%Xl9kj6rK9F5E{i$nJWlBJfS(yU`%J|l zvD{V!vXFU8H7rx}TgY+12--VD$7(G-YC4=sr!;A6G#aA$gKQo}A^)f0*X7c9 zZea8NGH;Mgmg)OmfhFYkZlb*_V1H#Dm=d^2*La2>pZTFwg0@q1;9Ev;_DlM=xZPp5ExxFRy&I4C*Vl4y;YFT>oo1->EL6CO=x&Q z3QurETfb{kIbtG2tDZkTFbtrc5Y~%!+DOm~i0hV7Fc9-vr^lWP;JH7dbLUu}4bt+;cUci<;}z2Gzd-jYvF2F=k06l4Na^^(T@DpfH(((pO#*WHp`Xyd&KXa;M)0 zy#T1Tn1Wrn$d>&-6Z~12b-1}%J*qV|6I%F-h|jT4l8@g=KQJ{0@cF#|)fjjBrdQ+3 zmRPMVyr@LGqTFKfQb(`!H8#?r>K2z}kE&Y{5zRsb$cUvzM{zx*$=s87(V2?YxMu_l z9Ok}$*8QZR@;=SU?~wtmZvGFgZomQT51!Q~TEK+fEMBxb3z%k)pXO95ydS2VxmmkQ 
z`&x#-`~f?0|MAlLf={|@1ia`r)&F9l@RzVUg;K8!f_L40RrrUIh}W>uuMj5e!)^C~Os(kC(qB(}93*erVlND5P(8ic0*b_#BnLjmjPrtAU=&mue+X-Kd5@td(Nwk!2;Bs?OOr(Jy_%E?rF+yjPi!<%v+k+A}(3jDZ`r!T#5qbI1E8G}+BQ z2Y*E3rmxOo`8j&zBe65Re{qusU8J*1-Ri!ONt-KFWhm2XN+|Led8xIo;h=wW zlj=KcEw_Dsr*q-Ah1O)a)jt1#9)r%&Uc27{BAsl3-J9Ov=^j!yB6~t8kf)D33{H^8 zvtBNar_XqIv@LzyJVgAo{1Z%Q_k1<~ zZ)F%CT*m1I1L=sL&d?I6Do{75hT2`z#!xOqImSghvA-YQm8RZPQLN$Yx}*(s#j_>O z;g#1T96o4mlUAIsLxn@LRZo& z|n=0H%ad?6EuovwuY_HmyvIQ+?2nmHKaP(zbT4UWcs80UaX!jO zF7FbBu>Nws6e61*Lb4A<(f)j=(}eU@f5AEL8b3oMDPe;Y(TZ(l7$j$Y$p^}KWJSLL zhI1Bmm0r{Nl{crf?`2h5*98uP%K9^==}&-r_4~E^Ld9$JmeM~<{W2a+_9qztn&>GU zuNVWO9?vl_+zau)UI4h`Uy4=j5~L(y)dLkBDgkJp*LW;pmS-Tx??ij!!IH%UwJpo& z+iKmFz)9c|ngz|IT+qQ9#=T2?C^x9a&=Cr;u&XBS`V!Ix!O#SA5r}}kT6_Uvre$q4 ziFqy8LZsV;^RL`4{W~qBd@XnB<>ia-FYzP)w~EZ4nzAj(14dY*aB746WrwtaUDru2 z@v9ZwOq=I&sYl;Fu7r|zj9HKvyu5fI6LOUVr*lo z(HgY5VYYDBj^>O0Oi;@Dir&jat0sFrsYjecu_F6qM>A!^D(PU6lh0Jv(hd(bF$jiI zf&x{WV1{`z7-^9L@H5eaX2u0lIJ=y#wxd-K6r-T`P&2`~)+(O$LRK^c9U_F`!(h6< z*K(Bk)%(YUMY@4jLebd#N!%ape-ZV`K9OiI9ayj1EYD#%Sz-<%_V!DiOW$tCl%iZE zJR7V$EA}S-!7~#}pO(7xfQH)I5A-pa!)KQ=V@hR}iV8IA>y}{oSe#4aP?ttTOSa>w zBjn?RU0j%V3lc=*4Tu1qf(YP}_C&W4Le(6^&sRveMZS`8&lhO@NRSKyZ{KgB*R_(m zHO1b~)T%njMoi;kWg-LQbGxxGyXgs)TuYw1$5`XPS?Nz4Cg#88*dW2#2xM}#&5g_G zwPnw@KZmBR@f<DvQ?yD0}?9k_g9?qktvk)~~_& zr0MkPM2QvSxhM~gI<IwJpG*A^;RMy7XH%vao&i5 zJMYFb1QXohZOcy(9bOw)kC}FE%xCP=)?k)tv{J(>tib}3-7P-3@{NYWim-|Y6AZWl z3{kiFel#P<6YThU6Y4;!%2iW{Y&ip9Bomz9RD1+9 zTzB*K?a{uO4sXpM_5a27v0=jYQq6JY1KXDl=z|je+VBG*cD-k4l`PCDIY1Q{pbN=L z)BTdvi~0HGO41go1cflDe4<8mSQ3v338bJ znE_&1dYLd1j?zza>KE2X#ARHjZhH+@Y8=>NjWVBQ;2+TI@>B@dYoxy=VI~2)U3O+b zwdyk<934gy^B3+lFq?*Zlko%CKs$0)T2{sIKgKeqy(q<_IY=A-*S;y7xIZezH+v0- zKj>}1U*#J)V^W`SBr@ysUiLYcDMWybMXqi01qp)bsJ^I5&tkkP4#*O`g*PS(^X{$LX0D+Bn`gLx>>LX_lAB|x}q%i$q;kaz-ttjCz78CPC-LD(mkuzkitZ_@@ z7!2@vqZy|0e|%dW>36IAn3ZJr&UX+QX05LJPXDDLPvwbdvs-L#TmKcd3XG;vjRuUt zi--KJjxc;H*|i~>x=t++R^pQvK-j4WKS>D6=W78wT-eC9U!kuUOssDa1edkH$@qt& 
zZ+vcZ0wQ(C^(R+Htdb!Bk(@snUEuk_<)ct!I6RJI4*EEJGWl?>KJPT{e282VAb>1B zhoG~k|6+iPM2=A*rzQ$Tw1eaEzgB~|DI*b0ogmbK*H=Xsh8PzPUZkz~PVfaJk^yvP zO7HC1hy7nxW*f)fMoJJ~qagjSEZ=aSXxb=BtaQq!*3TU^UQ|QVp_)i6nB5zT34AMb zh>g*;z3-3cyQyv|)vn^MaYKOh*DBi}$_k3gzjdB+l*1x~y?uh_VH7l%wmcL^D>EGM z8L{MpJG+?zpWV!5ylf*Kra7Oyy$}l(h2sMqoDW-!45G_8VNk$087VNeeH+4qY5?y` zcPf8-)(S2Y@IKNo{(b7AAwzWOH)QY509zM@1}a-x|DmtERhh)|ZCt%V2!ybb3;^&7S!I}x@Q1=fD zOy8&nxHm|u!M~>j|NfXTekAS>)wNy{paJ3hd;x_)oEPVd0M9H=7VNjzPzLIza{V^J zSI5F&6F1PoaGOtMo|E>o9vo9zVc3D%y`O#sW1u&!%seRb>ZNeRBrcH*j_yX#8&sc`vv~p;F|J6SD_}N`T9S9X#!f#j2`y(}S7*SMA;J}r zy_)`Xj(nX*&{6;T$4#JZD~k5yO7Z&r;O3Xpw&YWrQ2S8F)Ru9}Cor-W8@%v0Jc}UF z|7zY%$K~Z3(2x8@$jV=%8^dG1YqbWZ^raBA?hNa>C91$QT5S+SMDHhANAJ#g(Aq#( zK_?NoIs%i2YIQ%JXAI~ND&<4(;U@7npK`AMxc?Dy#Yw(S1Nind?86i0-0q%q9w1f= zJ!>oGAkhb_s8c~70;pOT9VrCL-XYNR-b3s}{68Mq(&mxwZ8dyE`CoC18r{EC5q){r zX8kJ})kxTvR`^#h9bZqmf$k4xv1xO@%`g04U%IT=ePNMx^8>M6JX_8VR%UT*q@M8) zpQC%>qL0dHFh9dCuZVX@h!7_=UaXFlBQGbBj(8z7(=O9@T8@2b_d&S~OdG{)kidDr zMyP*LHa{KNO#*J^!6>7pU*J|=B9{n2e!7*-|DRhqa*YVUh?phHK!Gp}2cV|b4jjGF zuCbc12D6pfU4wkMB%;;TqS6Kql$VpV614xZ$l7o%bOb5@!ooE#T1x z!~{UAimA$Dh?0B|Cu3p_tP}V@GztXvy;SqNE9zaZQ*m|f+)u-fM!nevZ>rTLdK1Rb z^E(yfw+g5$;q)ZiA^8Hi*vwF2)la{YdIxuWJ8|JUDn4Pvq|&^ancQIE!liiw$`t1tDsM-u0VAt>mcgyx6-bNv@K#`jYv z*VGvnPyJbDs5of*Pv)6(2#LHkBP>-KKBHPeLO1N^$OuMY!zCHOYm~rj13ieOu0&E) z^jF*c@eGd_`KdO1ALlN312OVO`zuSNo@i%~9cJdDJ@y3!q|@ZJ^9}?4d~H8;8z;3i zHIK0!ZL{JWRtyhhKr#=PGdHfKCEM@^fD?L>boc1-&43HP|ATbSIPig0NJu~Lmv&6u z&JYlLeo&M~5WeN-Z)_RcIgX@hI{O_9vwVy!zgQ2XAAzH{-&4k9o|r=_Ao6?#qx=%i z-FHDJa7`oe{I~l}@!C5)kG^QX;gAv^Iz`82Z9qJq;t&OR-jAZaE)x6vw0TX4$D?In z|CgNv&1@;D2nNxZO|9Ak+Jb^kxfbHKI4HHX(Yi~8mTE0#bB74qA_^Q1=?{)Nv5~%_ zn~8%Z`igt_UX+pe5%<*ix7P~=>Qg)TCZo-I@BX8%>As(n1@|$X_NKdtwmu zgVS^cw&Nh zd2d&m4EX%i#M~9(e6`fxrIW^3!irnKf&Fjw86CwYQ(oNL)~v(Jt~WkRprFDPwk-xZ zAd%E%+R#@kv9olSj3xt&hb=k0IiO%&q0|Da=l|9RV^9JY%9J-HRb>aM$tqQv^-~14 z1PO=LL|48vJO2~Q;8pUzX=QlI@%yVUZlAC<5)-$P&&I`79!#{767eD-FMWr(kFoA$ 
z4-kOK-tFeq;GYu-GMp`#M+!ry3GBl2#!u#lUf39AL9lHEoyDNSg0IT+s?jJ$(3;4x==H|L$yjW!LBLa`BOWzY202cke1Rb~~bn895efI=t zlw7WKP%C-z+`6yI{av!vrT_GEsOyH9F)NcNvON-QIsy5 z*zfy#`_D)9;%LeDYO&e#h^@{i{&3Gdn86g5MA@Z<( zW{KBBj=*S!zSF&PAR-XY&LR4=ylF@wM zIo7ET2)BY;tKYM7bqWrTR&zebMwc4#h|Ya{yEtEX@u4Hf-j4&+x1msK1pjS*^42kN zFL)nx3kL^7Wr~HX1ZFE3Nw@Zk=!-Gz8dQDtGaTOgm`;UUQ^$CwMi$E;XCxuqqtAnx z66sgDwN+lGFC*zoX)`tv&)0=5j_Ka7(g#|7aX){%jx3C(U1vsbJkU*Us`xjO&L)4& zs7h-dpa}gyD~hGkH|tuBYZ-RZIB!^Q;}r64^;b&6{~6}qJ`HnkJ-@^}5ps|J0AQ6V zXZN1mLEN8#{*H0VcVG1$&nhY<7J>!xoMyi9TdKKC+I`w9%{FExm(^|9JCUfS^;H> zJi+8%vpWCZ+-~iQF;boXgCM#Q{9~7VHrD(riW@O;FdSDv^tSXe(s8wcwKPVNr2u^p z%dtcL9cpI6vCNfJGel`w=4CBK>*_VHzba3(2BO?*KM$?bvT$kwNm0C9o|k;qB+_%vj_Q*13^&vd~)UX=yoNk{!iIKA+mo9Sou2hZ&&TS z!e6|nE|0SP4ztQgd7WwWtPB7#!E{@Aoa69Y6pU#yh}Q|A`4B)YaNByjfHe)|+Lez9 zCJVm6;qoYZ5gJuteQOvPePr1!6r+42r=pz$jAm)K&)W21+map3`eX(4^W)x%;bvOT zX(N$09N8b6bA!!@@DTlDP-~UfVg8TuLe_NgZwKk1%NFh~Ib3)`8TeYbtAKZxV|N+Z@QVIAbq)8-L4S{%}XLD9?l@tuPh%!YG>q`bTx~F4-u~`zdUwio; z%vN6l1#1~@dj7WBQDgT1fHA?)3h+QnrB07VfKTU6U^Fw`;u1hx!tvSP!!9uw--w^P zx}+*waL*tdrpgxrvthXdNb&Texfdno4uSSZrE-CourN4!3>Eg+Uy_YaJc>W+_-pBFS&amRuzK9#E4eH>L%nAb~Y311w-jQpzq#WVT(Q%02Q>(_IL z{`HnitXE7P`xjHD#l9Dgv|9ko`u0E&kaP3m1L6p z^pWa)8Q~^$T}<5r5$lvj?zMG%fH=&#SrhP{Ze%9emmrWl4+D{sW~DKze1~{yqUKsX zn8sgFP(b<7WQvn6$o>$hD9B9AG!-kvqwem7#+i zvx8tGcHL=xW=`B&ZL?e;9xgeFHWz`dtoCjQmBB@;d2y$$pEra)$Q={UCOPj-5<4un z`D;9Dqol~%9PSm5bs_)KirPc}lWUx>s5JTKaESrfG?(o32+Vm7Kb9MBsKw*dPBI~4>IhX} zqO8UeVYcBV9XcuUyN>*jw>p`OHX$gXw0No@kK0+`*RRr!bN)Rf`%@(zE{DeRb>=X1 zRqPk_wZ$~#L(Sd~D(r&n&8p|BBYy|elRaTfvC%zZis$I%$a<@84N=bPy zWsa5zs%(s)4%p6&*Br>e802NIh>3>y%K5NNvl;8X01-KymZKxJ&rFQfLlU6? 
zbF_+u^)z3;!a9Pbt;k3D*s1m`9`SF4aA-nDv(>%j`IuBhZq=9*H~KvOrrxFm_X>$g zRaql+HwZUc>>{Qg&)WqkNwpI$fIEH<>*na z#MzuCjAnqxw)L(YtSax}$fuVm93o==qa>rse1Afp{){9D>XW}|Q0PXouv9oy_xuz2l zLPOi_R)(?qYk?@$2n&Wn7%l_?NcUgwziBBcY%oY8Ye^%M`!k;U=h#8k+}7olBTrx)>!R@yI-Y)@O{d&d;y#; zt_YI_mP&k+)snI-F+&!f`=PQA7ga2H>`Otl6@)mKrx>FC5x>TKqHn!ZFCG0>&(UGn zcF#xjP|sw03%Lgy`CX?b`wOLzn6?gQ@Vet2hfSW>udP`!yb61)^tr$B?VUIjLEI?M><+jwXa-qT94p}-nmMr`6LB!>n1sAkYV16TIE1!4v22%=Gr1%T3kieRk+hmL9(>w8=40WfE0$rk}{>ECo)* z2gkfA8M*k=n=qY;c1}O!v-iS+_mp$uTK0M~OM`0oeZqOo=oW=*kj>KU50<1~&!UTV zHX!zPV$n*rc9)T+@^B%-mq^h{3cr%VJr;z#(j!^;_->d%U)39u0=0x@`JMyyN9FH- zNdcxUy^cN=1NNaZZK%q83pFI|jFhcAQa*X(5t&?XYO467ql5V;=`6)*OCuvu- z+!J;;ghWq?r(*0IN$cc@>Nm@XCduUPhVN^OhMk+Wi*Hfd& z9S5@PpYFPNT?^`cmsFc$fUows-vIRFu=usPSNvW>>tS5~+uMEiBhb#Wm>*K5hFGCp zaAG~@16tSKab_&7R<_x;yDE>M9=L$gv zHOwhOJ0_4SeFh;awm^X}+v&PSh@@#*R(||V(WuFYJOmf{O0?J8#{Kzc#_vD67&Int zTO<)!_5$KH4)xS4gX#Em2sj+^12|-i;*~&R;<2|OCe{i#cPbFlsk+wHk10D(2 z{Jp-ah1IUGOdVS6hZ6Tg?|En4jq5%~v=vQrPpUccRcF9X{nTC?sN+E3)E&N7if?7O z(>xQ%%H3QLh{JAVMkIP@u`TN=$9#iP=@kF>E1TH(Hgs^KUBvjBV);10s@sHo!VSZQw`3uwkEu7(wla1YNQy9j5@*ebQGB(U9gf@dk+INp#nfQll8!8u0C{bX<&|h( zp;W6x1h`a2@NhWyqapa`d~oj;Zoh&*+&E}|RMw>cky2I&85V7KSs42sB_3FjevmbK^moRhn%rLbii>x5GX1-%}Cy@G1}_a-`+f z+#jXytMG5JS!PyLR1}O14&G(vFA<9(nn(Ug`SuCfk&X3$mkuffCYu_nRBl_J*VaC1 zj^_FC`LkEmIFN^5&I?Ng=dMI%gO~TruzpuNV4$|5Ator@OTPWmjlU(FYa==2ngPUH;sQtCuhi8P`HmWIu z9gYXVW~z21#$CgmRVl14Nf@F85O!|;kWhUK-r=?8r0HylFc|hk6s0*PV+$vzd2khrx~vf?%2IrzP%JA-?J4nhFtk59#&g zYIAHWBA$Q~*(+)^yA&8ZvcE-sl{UorH)-aL^5-LI89l0U%rwyfgrCDb0vcU890C4m zQ1A=)Am}z8-+NfubAp?z0b8QrC5Vz=9E{4ZT{g6W_!>-G0nVDTNvWQ#Bxxi~#CJ+c zN?QcK01SJCfl41C;4|28sJ|P9$*6`aMmAR>K&H67Ai{y zfLpU?Y2Nqo{uIA6SIgYk*qFI{+CGGKB&f83V&G;T%v^AgI=&4bLS}5_Bk%PXDW67g zPJ#nmy1;Q2^j;Dozg5uk~mfMdwtO>8LjOmDxRil>fTD-rLM;cVAu z2#hu>+|xMb*EF|^dhwty8e;1V6$Nk0g!1>F)0HKapLxCYiwJ{bj4r7Bw=yn&sG+Xh zEgz{V>FauTw{IN|DNH;gr<8jPE8!}AV`M}gK?xmi^bn`2I(C&71R$Eh`2 zZ_z^yLKl|t5`Iq8ODB_aB2jiOh)adDr2;L!o#hpNkFc3?+Q1E+r3Z=Jl;x;AzaD(L 
zLa_$g*N5cWHCv*KMrVmbCA#G{jCW%T>lE~T_T9QudF*T~{L)xdu;OI4u*TzK7*siW zi&AeM-DDCNR{(QcN^u$$gweAuWaLVj?^XC{NFGQ}Y2ZpTp0kaD%QRoFrrt>+7xZ~Y z3>2N0shLy_rU+J~%E!aH%rCQT*z8R4Z&#)Dx^ zm{=byx*NW=Qp6f|Xkfh}BEV-BBNqFV&c|w09{!H1?cKnxlQX{P-rhJ;q}(+N)6WUo z!U%Wrb_^}4_`kj13L5)nIgFjf@WS!>;brkq3CCsGqSO;{^`M!E0q4-K7V`s|-&rcl zjK>L=^<4!s`M@+5={1b@PO_P|@m#Fg4!o`B7k^f<;dr*DSi1EveNiC80GX&92%|U+ zEwOdGym)xy)wtz8GTBLQH=OsSY_yOOUcq5C%CY4CnR@bPG25SC9N3eWwvEa-J6#1?1*r(4y6m{sH#Y!Zz$ zOVHoMDl8yF&?#46c2Bdjr4hPQ53`xH^R|LorP80Al&c4jW18-yw6KzG69+dU;93F)%d~1asa(R4}!uGgW5azQ0GqRQM~PpdK)E zpUqnsldp;pPwZj3Iq3b1jtui-K-H#qtUR<%_dOg{0Hl z3mBUTd_w8J*;>#~`W@2oF8O{J8=WC9kxFj8ezp`jtU`!Rql;UVpGY~_jw&J9+&<5p z!U+c7W=dg6!55Zb_B_Psmio}(w1Wo1)VJ3ytuhsw^Nd4o8xC5guHicI`&OsV&JnQL zn?(@G+0(1N`@;Q#L|2}g<~D90NZ2X}A^YJWt%l(XUqWgD81|ea5&j+H_~73!M5x}g zw7ZGm?>;PK$v9j;SmH_z-A@j$Af;Hn1=X2i_9QNzcPBkkZw5~Pu!T+D#!KhXM1+S= zN4>q-ZV-6k=ihtGC@uhjWOU$Zx8co!rbKS7nToGC;|$oymee#fkJ0fGLev|53qqe|EEWUwbGtXA48Y~Hf@i(GzeWdNnLHo2s z)#0w{(3&MI(WdFw@YUq7;CcFY3KC9g`;GgfeRgZl!;rz;)lz*hlwi(iF}-{_VRWH9 zpN3N}itFPpwMRQsWh_z_iyOCz?(bL(9D@&~GL#{lsNONo&g$>cxqs*kbQtZVz3s>s z)oO%QA-0hnm9eq9_X~2#Ar_Hi*IP3gbN-0|3zeMSj7T)1Eg8-d^XF)a+M9eW$ZrVjY0s;l?h!2nNA48rk&xk zb=xS0mse{t&hAZ>CB|=uGiwPJX);E#ccUS@z7(acz5y?SAiY{E2?xoyU!Ozjy3d+$ zbA#yOrAL6-^3j<`wfS}uM6?9Ct!IBy$u!NRiAkU|NLd7GZCZ}jfbk$QB`K+H>6^7a z$4Xjj8f$`Q-uG_=ndAKt%z!PeR*pD+WW$4zgOXAQ)=5m%ksb}rEMu4fd$a^j>~4L& z-;23>j(5WX(CL=njD^GJw~YD13wR+shwd4_x6rDt!@r(wr45Asn2?rTL4mwoZa56m z;@t$6A0I*JzC=+{h#SZQ>km5$sFkRziR^iU=g(tt>3LC4KJRPS{P-Eyhr8>>iDG+~ z+i*oMicw}bG|#2lNjB8vbJ%8D6! 
zg6whr#S%4W7TKf1r+dpLVOXV@uw>z~FL#QmK5JJ>vN>y3{sSsFZ@Oep=S`Ju*LmgV zj#&SoPtFktLQcQM!yO zfD=^Zu_`F!RH&TM(iBL9zO&_qzw~EQWYPISPy-gCLrkdbGwqY>C5M~<4mA;khbqnF zB}DJA;m~zeWh0IBLtiOvS89M5X`QV^XRYgLmd&xq4&gJgmBY^=htX!eXHBH|H42vM zI+ezK!%ye73D_aBcmxG?lelg5L|i~uNXiWOp733@9CIX-H3!A?GjA1;pGTu!A`>11 zz>=WWxx!7EJ@{HX^ab_NSf<)w;uH#b%jAXlr^w3LRW$zn<8t~)3{i(;eqzd~2QX)Q zUbirJW%N?97c`VMGDfk|!h_%LzbBWa7E$0x%yQD`U0^Z(#d!So*AAVkRlO>WV;YIi zL%FQX%JxhUQo`1(vG0NPijOxnU7Jhed#ax)3kk4?rf?nz?cX?1o|$zf;dcI5F>m}z z^49Hw9zN-+V_=>y>tw>k>L+q~mEC?F*?g8_K4&*4nuU|$2Xf{C4+LSn@cDw%VCwn1&#_p`5$QrHl2Oejcv#iuKh7W_d?&XCjm?D- zgxvk!B)P@QJdq`2NvY-)%HBs-V5ROX8rZyXAt`Q7###C&fh}@eLVr%p+A6Kp;|^+%L&S$m<_poh@8Gmc#<`e!Kwr?PY3!_^Jd`E-lx*}r(RPpDAP=;S2k z5p>Uv7t1o#28}?WrOqVj$1l>)WcijDX-+wczL^2XXtrxG`ls(%75q_B(?{y3Bu)qi z!V*+Di^A5patVkS{#s@ zql$TJMR*ns-^h@08ds3!KBenw%}vb0LZ;h;$3Ocs%A6m<)8VYEwUA}YJbWDyUa_`{ zl?En`dT?pK&%p&>RaRCfNYpUu%NHaq6FHyhJMlhJOodTKmtqqwuP2&U%Yevj*`E@a z?dttlUE~vdDrz!kYOLpG@QZG=0P{lw=|v~VG^K5M3kNv@T1gSwrq+SQ)0}_&%Z&)) zmb%xFh=5PAfpFW8{vKydpE1zQmd2EC_!`8imgtR^$OGNf3oM6=eeLw}RZnjE6qBYr z=+OS>(2aV-O8I`2d!wDt_RDz4?p3pe5Y2gF;W2p-Z?q(r?nZqeDGNO<6>0#hA=zda zO5!HUW+L9Vn(%00TrG|AVbii@g*PvP&uQhD+J%)*$YS;Mq^4Dm_x8eLR|ut`ik3tL z9OSIjb_7Uvx){`!+;OMZqjd-r(jIl+Ke*Y}e3ZI+F;sAXnt#IC)#~O~Y`gu|TF zk`gO6gO!zarRI0-ZxgOfMb@QWWXoVDwj;$Qe!x-dm)oyNn!^*<6|2$<_Ej=|Kce}$ z7#>#th1Er?tK7q}+b3qT-My|d_C5y51K^ki=Y@oL#@oBaeSCz8jOWzVmi%ru@2l2R zfo)Ne#w>fWR+XL@0(pdf&DG+};6PIQ+Mm%3Jh;|co9u)mw7f!KLqb4EZTD7a$Wv*k zW&)>E&YP_}8oH&PdJ)7Eru{{wbo>-yu7^4?xv5T6~$;7T!!ait-6|(oV$<{zticarb6)heX=tq5=$@0@ZhJapj&cvkwrsxnUrb}(`o zxB%dz@GJd^5|0LWM;-OOa&I5yFI~5eOf1n}D?>@#EuBz$Hct(jv%06HxPjolduzc~ zIE-!b2e`9c12LGnKXO~j@ofG1m{_Q|b?ft8k@g){K1UH9v$$9BmV2fiz``~F3oF<0 z(prQSm#Gf#O(E5W2MJSv%a=`vHDDe*C` z9$NTp>rMv5t>}!ra4?0?Axo292I*Y;8bKIk*4%3#h0;Tw&Eec#*22GLZq}f~G|(Sh zua~E6-eRIPVi0@f5L?f(`b^nmJPzkf*pLSm|BPZ8E17GKYrop|&)iRCYO(jK zt;zAv{%{A-hwDCLMl~HQ#}_PYSJ0~KWg^d#$1fw{GTxV&*I-p*!&cWP9~q@h@*?Y; zOqTdK+&fjWN&=|D&E(NXEb<(^$D`lngVEt0zba#YMzVmH>$iJS4qPaZisO}e_s@Xv 
zr2oD6>UFFfJm^hT?iI~qne8vHF=+p!yE&5S%R=$N0iAJxzW0%wqIXJEXkf~i5aO48 zCq(he_}TU2x2U^{#ZodTu$zy9-kP#X3t5q_b`9wlXb?0T`?n-U;ohy;COQB)a!9Q^ zoO_^wY1(>fA4~hV3(&BQA3-hSBgb>h*M9d9iUi0qQg{Ug#SQ4|Lt?+b89A>4moz)& zWy_6%c&E|vY>3q}$b@4%UMg;rKH8$|)==uDPr!8_T<#GB;@*OlV%tAkqYio%OESSu zI~Wys_yfk_>ta0UpJfY+LPmrowK_e()pO@X%#>Z`&yVRjD6%5IsCIF z@1Um=!BwnmxWC&z&5UAIx4}o`{3j5$*|nf%=B*!pXBS|HbyeX*l`WU*zhd8@gDRdO zXIqZx**`=@@9XukUmvTpiFKX{j5PKl(sEM3ib;PnoGiYt3@@oP9ekz+KeFs< zNhCd1jxb^Kf~st?{Dif2{86+>6gvIQ_gCwyJyO&}Sq_50#0G=)0ciEZwc7YV_Q0SB zrS?5f5&?lWl^#c_Rbo}lo2JNVVod(ti8;y`42{I?$_WoStVGilOJiD9H9ORW@;8@R zGU!rlJI+Y^2A{z@YH&z6lmgz-MNG1sjL*)YNlX&aII*#@Yt3`{D}-MhnW|)4iU}D| z-nPZ6^?lF-)RC}w4rWX0Eyqjx`@o#@Z9H_27+Cg^$R83Wqrvz3-d|Fb z%nad~bJkUJe!r%fc6L-$v;ouBOh@4~aFs5=3^;1t#gP+{%K5c7G)XD}{Rn-^s){C2 zf*i$lW-L^6mG_dA)ZWU7CoTbaPC1wPmQ z4A02l8|u|yxawNXHO;l&A`;I;dA~-k4db~aoB8|vsnd-);zG*iAz9)>de6wPD*HXx&_k|h1}dpBl^(VLt21wuay4mer$(s7TC7MbAhs9= zDMx_3b6gdIS!NDr{XX?DQCSiJ#Ei>Y{%*3rsn%$JO2R`%5FqX(lK4=zlR}O}8wezM zT~&S0qg<15@x1LB`a3b(sj??wpaJL)-rh?~q!39^kMLQ;UESo*LhDtFuR^^}0=zQs z$b6}uKu$+2F){JNGtA7D+ZGT}@OCdNz+wTa$@hz&ad@ZvWJjROVbO;0f=>(K+&Z)E zE0xUE#lhSXB4jI1Tnn)8HUA%5UjY^M`b9gGk~)+OAW{NKcXxSd5r!`54hab<1tbOx zy1Pq|ZixW}32Bg)kaQ#^-#1?Gz5ln~TMO3`bg^dUcfRkOv(Mi9UE)e#7ExG<~?$COq)Sji) zYjKpsDr`oepgj4h%8JqU*ZV7WNWP`nsNOrt2fi{sSx5ky|4#{QDm^T%!M8a=AtWkR zY+S+2g7fNCV?}fII6IN5z9GgULED~6Zu~jqFYEhpZi#u_m+zpwtIw0^=TlQs5} zYiAU=^s=U#PJGG~&EFBelws%91*e7#*pTxP&pE4WRZ(ERGc%C;*5n}bIDF`*xJCx! 
zPLD_;@%ddTW z&BJw6A2ahRqB}&BK9{~s;jlrmH)J=Pdrjf0TI;IVZopwcPD@GSqgeS{lcE>uu81-f z9vRAcbH_stip+bBRHng78aiKcOJ(TpN+#o5MO)+$XI10tauB(E2r7`Z?cK>amXUbl z#HVF*>+w695BxRrKVHza**tb;er_^LDL-pe`Q1l(SBmm*1J6noj3tmoMUP*ik0-tK z=6oVBXw{DjxBJMZLqN}a;or~@0na&yW|0=BSzY+SU~H3!BKy(JQ0UO!k~cpYsO{ZDq>HsvOqCAK_>faF z90}}1ak>=vp*K}kRUI=Jy~BKyd6Pvfwuyq1o5u}~NoCI~;EcYRxC7*vU^!iC>9df1 z2imRcw6EDn3|aRXuAQp9_fe&D+=#UAGBNf>0NnGwuxI?irl=C1ug9gN!(@@}f`8~p z$!UUOp{+;%N853}R@SMOM6$#UdmF6$sIo*N7#c5hdZEp?PVRT$4%lTzt}Wu|KXPXKUg8Ct#kXa0U}X7{v9 zAYUMs_K_qK7E4LFSHr$V19DdClknME=(XvmTNeGbOsso$%*=c{_kHl`6=4>mb_C05 zS3(wL@zXn0@-0-vd-oFic}GpNA4TjguugxSuPjm21*7L1pflU8K~Zp-oPRT~@=jg4 z9*cTd%6e^iJlzqtetru%MNetrsLLTRyxV3PfMN0lX_*~mU}y;u5uauyL;F!FY!5tYN{cI|kP{st8~caB ztdmQB6Ovv#Jj7Q-pKx5?OdC5M1Lgnaa^$#NIohM(`RVN>R zRZkGUxi&rZb@w~`GrhqNqP-Vyb|;>@xMaMv{IW%~9!wpI@WraP;(rVVw!W%CZ38(N z>WZzqY4Sj~juoU)D4$6F*@QFX#IQvTV1}%aXV?&vFXmnxeD$I+?=gv2^&$v}rB;3B zlganT5!Sb(+%FZ7Qw{=qri>xpR&3AF11X>R7M}-vNUu02j77?n&BqHELC1{Cg7VM4 z9$z>C0R5E<_z#9``}(CK0~2S=iW2hTANE^ZfRia8oY;IavRdt0Z>~5~0r+ngZxaCU z*`F;2U(b7^;S(Jb6LZ=o^@bNImtaMc$(;6N&~y-{h3^J=*XKERoyhxz)9jVA{)CpL zq5Rj~v&$p%s(Z&N0N;DB%Ur^Q+0kq~+1gFlRu{1%d`!2D5yqpVOIWNM-*08twr`br z3qC(cB5Py~YaUTeq$eYGuY8o~>5Mu`+mB+47qYt5(8n}X0T%h~@dq?Wt)CNuozxyY z7>hU+TNNMORsX&5gK!TLq`wyF(n{D(5yZW#Bl06+Di@WPO>!VKI+N@AF`4$zcnNp@ z=KNpU%lM~kY#unzAImc%sJc>oGg%qU|KfHO#o8iGz#KR!rxoGR@{;2H&UC3meP!qz z>*BG#bv>(XDHu{f(rM4a`b1&yI_0zhL7x+Df#%}K1^&XM!DH?OO>Igp)M7YZHkAxd zz?Go+u!N|t`yv)Lf56K`0T(haPEm*@gpge?_ef6AE4crr?O*bF` zJ2fDHwj2}An`|?yYkX!e2JuW207<~K^h*&j>Mjt#@2+)&T`NP8G3Tw$F~m`7c=Ud_ zsbLJ5?r*>~Z(y&V@T_8I)cgWH3i0OTr793`A6)2zof(x1yiuwor}qu4AQe9N>QYgH z>sh;p=vj@wg@=2HY@veJUQXUM*{QrYO);|9NlgYTej8A-v3|Y-A9LukwX)wXsgx>C~=_aS2n}|2neEzk<95628Q)cZ zSTm=KOJB;zTKQ@EoEAAXE8x=4hq|rc+2if5@8s6Z8V1ctGC${^XC3^-#!9PWO1c*m zUg($cG}MhwL2mGu^@}pkr=V!GZ#24$wl9k-d6G#cyK!P)Sl+LW58-k*KAusZNPTJw zoRhGF_Lto-7O^U}Op?~UL3SsHaBBEmlbqadG5sdO+{1wR&*;imCChni%ebepXz78= zRB3J0iJdH_ynLW0o)_;6n@3V*BG`<{5?J#n!t{@i{0_rveR>Y#?EMC`)o#*?`-u?1 
z`L=fkkEd;kvN(7J6tM+DJG4y`s-V-Xe3oN<)8cP-@DtqVzi4+veI@!Fo30-|&?zKt zoif5BSwB2f@Z6(QI59NJ{9I>U>0^114<_toYoKgckffi-!M+Cw!l4?G58skEok-Pa)p{(*{@QYXE37gxNRj)y~Sd%CkYdqS7D4`QW;`tqvU_7=1_ z69dP04#6}?8auq>OTNm})RdIBc-5u}vMUa<;ku;dADCbNG%h;r1cbXQ^SR@P*ZdCo z{TNlTg?IXmUD;c|vO;q5N7M>6rsdaO$;hUBry$|oO1*5Oq;sXhy5q?V*BXmlHA&aZ$+xtTEut9>Rh%Ny}o<9^Q+B zWFbDQF@7kNdTT=(=7^g}Ibqlwj78>z|M?Jx$7r@9Ln6+iN*D*w=a(!!bf1GI_h|Cw zsJ0v5YP!5Teq~+B3`T*Xl=5`45WwlbdF+41LA9b&%L6$lCJuafkE|jIj94>aHv};T zf9b+3@ZRGxw9SK0kLYOA_6U8@gQppx&otH?d-lf4!JJ1Az>fTW?{nYmGjxa~E_C^t z2xq81j|rzGlQrsvr4Bls_>Dy})>xPGpRoa!atTeutY+FEu;KvcA81>6kx zA#_(HP0a;MexXp>L1kAD4-Wvl2GZ_LR(!m_bt2(tx|rr51-3Qd`-S4D4E~}=RN42m zoZvu)Kyf{JI7!pnh)X;K=YD3=gL}#dC+u}of6%RAHxU>NU4BL?2>2lVfX9BBWqU?I z-|enuUSr=S5=d&X{XWv`cTrh-*>s!;Tn)A?UP&Ac#a%vioUL=}S^Jzyt3;g(;C%oo zNU_e8Ra%nH@ok+8uRrBMW)yEc_2?n=bD!{@a~u*0e9H9V`vWqY1N=!(jUji(zrlKY zbIH3Qdf;X!i~1KiN}|DhV0v;<3%iy6jb9d1SF2AsA*YnU2qpj4Q|6ulPvv(bd**Ni z!9#&-yv}3mWS`Tf$#zLdjLO@qa}3-sKbxQkBR->EL|~_qP)k{#H|{LQ^7^j@-^^%@ z#Gtyk2=r1{sDCg{IlQ^DiSVl(9cfV|GO|N0^Q2xI@(r7`#_J(lVt-<$x;}2-svpwy zLv&`*fz>s|I$VI4H3jx9CD|)CpIwZMIakPgL-HnV(G1!4#zRO7^q&0H!3!&hv7kYI%JV&;rL1ukdAppM1CEMJU3Hk z9|ipjq@y^p2i?K^62S<*rG&2BHyDByqHgVIv)_7vVWxV6kC*q!K499`dAj|DKWBjn zfOskVP}z^i{uIQMiT$hXN8b4Sw~1k_poDinWR)+nHz2xDkEI)uzl!Qmy1Rzhx~3VY zv|>8unm>9qz_yuAu2L=1?;BtDJhj*azOFkNO-N)P?M#K0BG+U&u=3f%>Zn2;V=NL*dd7#91L7MQ!~_t$)p<5PzBaUa%me6! 
z)g~6&J89 ztfvKL{T%7A@0oAw@SGDU;Su@G(#rN7I=)~@4G>N;&p1tiGQ4^Go@PF6|0y?;gn3Mz zZGc0+Iz&T5qd7&yiH4F={}e!8MI|_jP3*=^9xnkWq!;cM0tM}ZYJ#WxU|=t7_Q42C zX_Zj_JEvMOAu%Q=Hqr4w7=@oGfECG|PKGPk>5*ysDCK-5JM(Cx_%e4cPh(x{G%yYS zrw@g>g3+PtXM?Z+0Wr(Yql6Fd3RS)JhP0=L{d2>ddg4loHu@{cJ;%wMiCXFT%xBQ}&UgH&7dL!ELjXQbUn^A<{Vk5y!nBluMISMa+rfC(K7uhvMa7 ztzh7^*uLv4>VMi&U)ZpIU}ia%Iyx&=6UCA{KkUrGfu6{q`Fnwn?1xcn;U~b2Nzw@x zmhxW;`h8Cu;f)dcvA+%BO?k)1ZzOl5BxEme5isJV<|AJ|$xjq)e?j-fbPdH{QA7bd zBso=eOmjSy<&Y_{W9aUyAbMfKL@^fphGS1Y7Ko*I6upXu;Oj!Or#2O@ZWzCF)8YI6 zagXVX0=5C@l1)_P(S-A05p@6(rC>>$e(k#&Tui%)rIeQ5)|0-cJ^4qaGXz)4TT)f=_eM1*aMVn8Rr~2b31aI#5 z3Va<<*0#*WJ!Sd-nRq!hVLtV_2nlVIf}+s$@X{>7{!s=(}@5_t_4oXgqrSi~Y>07_Y$r`rV; zs2d*{ux4sa3B33CK4C#631`-fq-HkPf$XRxj296)VY)+m|K8MX78QY35tlDfxNq)< z36+~oiw8&V+o#~YugW}hr)e8`55pmfw)D(REe5uSUq|pNsUBC12t5n%8xX1<^up;j zqmRCDshS_yF<)o-z`+#DzKB2Nm)6}Wa&%S|yZU9lHGMd@O&={@A>gnVy9ZD{p?hKq z`*$(rvkU>!_v;=5^Ff;qo@KKDZKTQ{yz&oYtrI(Af45W@9@muenCmX%#5byVo|`F^ z4uq}-T_*6%((ljSunk{-`{dEr9~6YRjy>s#>0f^G1w8vPV5=Zu(WeWcAG}(fqB-Ho}3U=BbuXbU- zx-mJM`hbS=z_-h1Ly;bw5U2Yb;UE~((DEg$xVDBj`%0zBMR5x=*g=d&ljZ&x0$r=Z zgZ)s{EQ-?(3HBYLspsR|Ae+uw6{dT!qJ4O}fM%drgo|uTK{lM4kDY#AI|Zpt-aA1+ zf3!{KZ0b2qc?fm;y5B;M`Us#1RrwaSOW(g0PRo}+w!mDUsr2Xl!R{y!MkeQ;f3q1% zAXep%{%F%8tE4u3=Fj%0LOw~{m3pY8jZ?+GH4SRXdN2{#*x2|0lvcd+w2;e=pku^Nn7L_>m?O<3u%#zyVSZ_zgwD@?lu(O$6IY&u;r#)kaWivF32! 
zkEGzrV!CwE_LGa`TsN*KL$zxJXJz?A0_y|rteK%8#D2Nv-77~^1_c=HdA!jf=Iy7* z#|#<^Qy}p*3JG6l&B#l`!!pu6U{$KD(8_}~8*EC1DayolM;VWnb~;2~i7>EV z=NqORs@{NK3qy{%PX-Z--jMiGN_AbV%a_r5x=iuIQ3g&}YhRWPZe&U^c&V?{e>1%3 z(jorY3SAFa3a~mFwwmzxh-qpPusxXhb=YsEA8t)XB@mkvfz?hJkP4jxzjEj@TZuQQ z3LyQ}yCaB0sD(woeRj$Z#=c$JFO`w@9 z>*vzQsdUsN99gOC*#ZWxw6E&RwVAZ3n!)cUhO||f^8rkZrOdEiUFRiV6W9nyPTZbl z`>X=Wpoued&$f(s{ag-Fb<8?(tO}f1=G0ZEU%CeXQ(0?;D?ozg8X!poy2t2`hZ|+% z^TO*=HVB#my}eSPZ)ELnX_`Y@_R=rwqujuZ3@jO%+dx@i?0$og==(y1s~;W zp{=&u)TTmK<15j>&@~LJy1}O|IG9-ckU~P&pXpq1x8sLNuWAgHoPz}oUhrs;$~W6u z!7t+}sTriBULWp402kHm^x5DKuR`m;8~9;1P}pyg+ffxJqLHa@Mj_BVk)tPCw(Mxg zQW*%6!3EuUk_B(lhWsQw^ zPLoHyGl_z8i9!=+IpWIx1Jx<5bX}zHjIuA~Vd%RT!`*a+{D>a@IFg% zN-clRY7INzlx#Qdsp8WZg|3%`_61wf-K7r*w!$8-m~8v+1oX_}NZo9?v~zWTIw7C3 zAAp2Wzr1j<(;umRpIvnPjucF|-tBRhzXBJSI^JvB0(!uDSAV5b-CmIcmJ%z;hPPYM zjMVK+Vss>uU6^o$oTRV`Sz2PnQ77hkhwoPE7Bgzn<%#-*s1WNFJyA;R~ zF@o60{0GTo&ovVCn(ET_UvVSnX#!~>qP&~abr+eC*G0m8ux2z#l-<4F>8wPVsV%np zYvTQ_3r>q|@1C<#*5mF?_WeySLtt}z6Vl5%IPU|L(_{Z{Q2i(j#spgW-k3B0g08_#|hCoPF87gQKdu8q$?TO zP&hk$PZmyOgUme ze^5)qpPYtHdn~WX!oa5<%mhr+ay-s zGPKFsd~h>I01pBJ=k<-(s*}2&*TT5cFKtXT_XcXowKOf-qNnr`epQqTLXFY#vBYNS z8*m@(1g#B!yDfq7(zi^+Thz2&iP_a(9hrXe3y`HyF~;v~>TRyy$4cBFN?PYDvg?-9 zQ$x0hk33>m0tJ@Gjo=Pon=&x7}XO}R&afqpJGK}^%@gT)XaIe(Mb zbk8T_Q%5gwGva|5C3=?WomB#0%?#+9#4Pk$LKD>0`*51&4POe0Fy@A~I z1FTtfWc{;j_>fgZzC&s0%r(eMuv*=)*&M?ZV-Zs8 z9oL6UdLRz^^6Q=XrLi0=p(63u%rkGL~!M!tg7swp#AatIkCx>_FyLD*1A$bxb&85cIeb|^mfs$Yp-E21PK!}z;$y+$B_t_!Nr3+v>a^s3?b2BjlWUP-oHWXQ}s(c zBs>$ay>-z%y-%XLqX7EL1VsO+`pkbMYbnU@$6lg@v&*X0C%>+v05Srq%QmeKTaxtMZTiAb2CeeN^iiCr8jri5`ev|vcAzm$+t=Vm z4u)UH3}Q5%bZUNdcl{i&RYQ`Y#O|M0at9}GW3NX@y|u`f z`V+5fcR)pT2Y9jhk8`!Y4REc`MT&@|Ao>|b?iE4PxuW%m`ou1F0_i3tt)JF(vnjt6|;5Eye z4Wb8})?1$K(i>6KI}2<8E{*dw5yxc6SoWDue6P5c*8ZHwE1F>**eXaCMD~i9cEbER zJlN%1zEqcif+7A|6Xj^YPTfix7}tn#&M$xlAB}$0QP(e=79)HiS!L#hrAIpV$d1su z4XGAEpu@TRiP7U5Z~YLoRxKx@3TMrs(Ca`cJbMJd2t9?&n*{Wh`Ry@)jeG@i&7GTh 
zr7&Ajx^<(Jt2{jJC$$!1D{1W!JV*LHQUNnxO1|}uzp5D-gSG%jCp0kE3u2`n`K`nM!7OTfMD zxkS`Y{MbqEMCoLEEWYpUWoi*|aj`{lv|+TafMUkl8PCcch8G1lE0s84+{Kv!w?oG3 zh#7M_U&pbJQSj;U@C$B!F1PpfRObk3V#l-rMGg>we%8|wqw&lvoDaHnyAdf$I6-|u z@$lSKzBco{t5TE=6^o|kmpDU~tsN56Tz3-x8%g+c@Aif^_&)2O&hhtk0qpkmQ`RH&Bg7Tl)-@kvadaPV0+9LKB8+gwD zg$qm@Eq{fToKk#gMRV~N#Eu$SPMMRSOYKBCv1TD6$@CB1u>ws(vOwXGh^fb`C zPj#;JActTqu2yUX6wXO_-Cp+qME;yWv9+c|h8%m&dL34qJ1JH%0BY3$bcPxpLcfFc%TL2)ncp2-&1kl)DIm_ppHV5sgWVSzEvE)J#}RU-Q{zI0No+$!HY2pm{?K)oW?lHviKu9_ql4dN3r2BV)8=y>ZWAVI3CenJ-8U>?;@v|?bV z_CEmw-fosD?ilOl(obs{vGfs674r^P%#U8te)RsHbqB1Zx#qPwXj1N~_^5bz2REQo z&U5cDmxrR07?k^x#8Bwk-g=VdSbyNtFD2@rvz|hCKQ@%{?w7{MDHKW)@&g!?EtEv{ zh%5}iW{pvo$>0#{ezuP11Q!JR$;`=Dpt1r(N|VG;4XJuKnmnvfr;5} zs-V*hi8F`@T#N0w$lBk^FXFQmL=_*rjJ@A?8EE1c)P!|d1p@LP6{E&e>V$x^fGtTO zoO<8nxArx)n+N+g=RQV#l;BhP{4u zgOWR(YO7?lOvI%mtdUHnz`BS)AROjp?nZtsUH%I4<{1qgObWr_`b8l+Oy*;2AOCaD zTK;o|b=nF-+>W9YlNYVo+_vW7kG5_0^P60Y1$~)kacpYyHOt)I@vrOKbeFzLs+Trw z?=?qJcWn^b>o?@V$p%PRti7Yek*baPxL=~z)xMdH1#nAfa2zk#ww69#$YdksKkSW7 zwH~QNpY%#GzGwRkx${4E?w5!2fNXUCceFz>{=FXm+$7d7Ad=olgRnB|F%>m#+dMA* z#NlB9-=y$&hQ7)v0gSP<;Jy-!;fz+o4$A;R7Zwm@7Yw4x<1bJTQ3Zj9?T6+4CN8~q zW{JP{*ImZtFTr+*Id5 zQzHuz=9<3+nFmM=g{pbcOrC3>H~_ut{aNGZ0}8=%mO7_q%koFxg$E_Vy`{zPFa*r) zx5UmGyJuVouipe5(|hIt>bR&i@!q#pfH4;wE~OT6hQ>jE-bq~k52bl8x7bJEV$=_H zxC&4@aWtUZ-*k_`*LwWI&=T%uk*@foeucq~w>*td^ z61o)XLqVYUglS!*x_SwU9BZl(>7({XoC`)ymiWnK)n;+_$6zAlLW+@(1zx;S)3W2` zF=+{4kd=3I8={`%zBew;a(BmNI?ZZ8^ngfPM5bl!HCDg*W5 z#XAE2mo1=(;Bv_!16}I>6C9u_;iezv-V~rp-Bc;q!IbFv>#358(Nr*~RUhZYj`gHJ z92)j5t-K}`t2y*7Ns}5x*9^Ko7R=J50DoUY6Y1>gQh~;z@ur6&zX;fpb3Lwqb)kl; zQJa+mJauQ}R7FbNF&_QVcn>ftB!IZupI_+2pJ<~J{uz$}TPK;NcAtR1A(bTqRQ9DK{b*lM_GC3{b=@ETasLbv9nFyjQiIkf{kYF@ zTlO|C!t1m}->w;y3pd+KS~ob5U&O<2j3R-2RRhRZxk>~Jj=dWy zon{j;JCxxqAPc7pp}8K-TZ6f#fHT}tUFN4loWwZsS zyE0Q2Bm0mKeT{f>W%WZ0Az?V`FHkke`Xu_HZo0|YNLRIo`n+|DgSH44Z(7yKQUry6 zMG@av*s&txIrCs_Hu2b^_=mQ3?zRPBK2h*%T?HKMUqAC~ebGI|C=UV+?ID_#9sZ++ 
z{QCyMF!l=Db%nAPJB^5bOj=5LR7;h}XGtDm=NnnoW?2-gz5X)0)f7UNMG%4oa-S)rrOX$&;4xLQH(RaH0 z-~W;QBiw%OQNd~9x4KNG?&)!P+!8t(yQWG-3@{Df{yt|qi+bZHZ%5XtPZp<3VT-@s zzfYaJBha>B_uP~sT89H?(1iN9+O;^ndnR~*AxSZ7z9iTj}SfB#mx?TEbP!R+)yBn`e_K zHD-i|A7A=^@w)%M@lXZkd0b^&tG1RvSXq77WI$}hGQ?bQxA$T1Y9O#AP^t0 zyg*-5uhly?q8#0c87L2%>H$J;iUVQ!t|;Aote+sz2KGQ-#g11ea2hvtfvO?{!DIJK zi^_bt$^c%ol6J*vg2NT=1|*MMX#ijGqY7T+f|*(&;003&HBGw2mSJ2Pr%s-vcl*QHm#qIVT5hT;aYq1Rhlh#g} zUL^AN!f|D#cD7i@Yt)Ercq-AXeJViY1Q|?nY+c)+%g?=5X6ssMA@@r4ok7Y;{G2M} z@EJi>&~Aa)$@Si2r|-7XUu*P6Ewccov=mU1Sy8qt%rdM$lik0q#})E=sQfDs$;T*k z?`>h;oZV-58btQ`&-L9AL@aDSPSo$;k)p}9)a+pFR#Er0fv4oMq_cx;S?dtvK9QWv zRfFa|Xc5U}fgV$bwO%c8iHNg5uV${6`$Vl2M)i2Tv2V2|b6qy*OQ=2q?mHzv`7<>( z6#5(4yiOP_Bk|EY$4N5^sd{~EmFVCRkJb`u0k@S+!l3WR7A0<+yzOy^%xe^2^0fZ3 zcl_NIhvM;Bki@QClLTFs3OhCg4DH@x)Li%D)(rzSrqT5G##(h=&}dy0jFbHTa?l0^ zW&_H}fwFq_qq5rv!rBNTxxJ@la<$oocReQ9edE*oR44n*?>Rb}z++&X?O#$n-I2eRKA?9X8(0{A2KBj?2bmkKr{(s zRNtjjBysyyl!hfBJBfj1Z|W6QVDsT`ZvDg&J$-klci>>7C2Q%RLIwZImp{-)66Pue z*sx{5nK%hrrVISgPM+{_{M*eb&*i(ps%L-pRdLC#;|m+C4o)C*i&rB?x#AE^>*x?@ zXV^fBR4K0LfLnF%RgWo6Z{3apn#FGVqHi}8+PYzS(-YYeUd1`U|AMD-M$H1-et_ZVrM-KRXQ{DfS9aO9?!`l_DL@GqcsED zuElP3^d&sa`Dn5UkH9YHIA9C|1#aewDjz+joC5f*a<%C-Fc3ve(&*~9NOa^4FZ;SX zK9Tagm+0`zQ0Sf-9^e{^0e^W_yZ7-tUepYiJ=J9wyxJPgejlFWT>ov^m;Z0c~r`vwu0Q z9hf^TadMuOf}4kf$L&wcTo`3BDVGV%kQ5ci;frY+rSp&=w96$&KOhffS|M8R&tHdB z0AHO0_A1NSTVV|v`1JD`0@inSv{>DF<&8Fd=COPbxsln;FJTry{@T)LxbnydywSf? zJTe`5^=7i4C*_c- z(5AelcSi-!BsTe!sqh?{^v^v5dv+4rn)2Ojyi30KG;U%G%dKdw-1e}|dIXP&`U*6? 
zvZG#ASdIq76+Rt|h1I`a+!q@4il$~MdPv!h0)>lhB2t%a%$kRFVs`+E)!W@Lxw?;i zHYb~DH8@zAlMRap&!;LHs|pv8Xu1VQ?((7842ouBY5)Jqwl5UoEG}W7sZqQia)3xF z8oAxfLVi3mtmT`pixI-F^-CJs8;GOBR$HsCGa`bRC zU9i3D@MBul+2u1h=gZ{hpyC5&IIZ-v4-L?=_D0eyu6ogei~+R#StPb!~v$k59pRPL}8}maGez~%O_8sXc-s{{^CGX$vwG~iqSz_vGs|a z*Wdp;;y+tJ$_ya5nFOX^<^+$Z4*$Tgrq6o1PaaKrR*uT(1E)uIV6xUeq0Pr{jt8;P zE-Brq~g>@cD!XA zB}v|93UsR9&`|^IUTDC;`Y+v{cj+a8`s+{^uwfoOc`p%0+pf$~ zE_hsN{Teu=&Pz$(oFb81 znIEQ}mVMcF`au6JPgzd^Y>eT0l^Ubr1YyCOGi95%s)i2~S27YFBpf0`c3OTF=wO${ zv{D_lhIS9@pGfet%YmLq&#U@sQbw`$a)B}Bu@@+P`v(!epw#&12>b`U(P5^O+EN{w zi@vVn9Np@Ua+7&a;0t$<$S)u*tRq+xdF>0|V6Q2^o|4Hjb-b=<;IUqq&HxSmzyKlx z4ft_@kD`@DIxu*c8;FWrZsa=6UI$HVtsvne7ZTZL8Sb2)eXD(S&wZPr%dzD;A4CFi zEVfg0c2wuD2T2(ucn)-Ths>7v*CTwxPXH%9V9fq}I$+_qg|pQ&fQ;{Hb0*Rfv;z>G z1!K{vs;(*0mj$z9as|-7L(wLBb-O)L6jF_kL{PKwCCUA>i|!H7WeFfBoZjJu2hw-r zTCi)@4%Y1uU@W!>A_rzxCnA@29~!gF)=id7%*XtSb-#RqoT^{8pkD^brj>rH+sY5% zFEEQH)?_Ji+1uo5FUJP&S^NLfz}3a@)HN6MvV}mENi(N$$0GmdIBH@(C~tb)IOOUy z=R+azP9fTu9W{k7d{4m8#}gls;Zb|1pq@Hn6ez2cHTfm~EsRr}`;>LaK z1hr8EQ=hK_DemYYwSz+V{C{JI|7nLnW{}CU9_v~aSkb=$DB4w}jIFF?`n9FHb{>^K zTH-{tQ%{XLrzYk6E1C*!A2#y0mJnT#9-ZO%54BMX3H99eMvn|mIP!2n=nY$89#G&C znE|JtU*AGI_udal?%xSGS*qtoTH=VOqw;6uQkI~o7G=-s61+ZiCjaqLFTebZyD)~?06X%5&iluLlYPz1>-&0{frwIO+kAsf~j z{+PSsdR10n=S@978C@j}r>PexQm@g`}!-EEk zSV{%(>5B|39(WYHCjroGL!6KiAR$C66<<>RdDZ{d-Etrm9Hv<*IUe2?m*bbubd8Wp zEY^hl7VrBF-bf!-Pm1V(Sp7=;-tXNFiSK#7c+e1Elvbf?r3Rjx;GDhHth~x3K34e=Kc$NgMc$-E{CxH{SyQMKt){N#8&}5aui!Z%lAmTPgp^qT~P;=5|~g2 zh?2rjk#-4IJ0vx4$XBha%Yd;0PV;unsJadP@)pcOh45_t1MIAB?`$%m-%8(@ZPe;_ zE(6=T7?4wgh$*m8GHSo?{y`zDHDrAj6 zE?AZL0Agv3Qa%j>^6P2`@=l_`^xIi=`MG|IAnk*0Q$H}2EQ*K>IBa|s6SE5x)@7VN zdg*)&ZL|9=LpAQIfRhu$b5!-Pc3=;V@G8WmznoAP2P_;;N`KkO_9wJyU}h;a6YVlQ zw47$D{n%Ii)9s8cpLs4qb4}9!2~tZ2*KZ}%6Q>(o*Z&f#9)DhfFz9%TY;EAtuznCB zy!zbri!e5?jFQJ7&r*rqc@$Ojw}uX~ix)mRT}2b)dWv#x5$s%g>nB`X{_(|{T#tzc zYVagn(ySW@cHI3<3OX%sF&bNbE_iOC-2CI0V|PU*pNDf3^H^x375^8{7nN>3tzMy~ zJ-#+tCj!7VYbmkh`{6krKLQJ6KG@S*sY!W|`My%5z-`drF{RX|+P|mk|BBO^2atXj 
z{eGC?{Pvtxt8(%Cu=R`ru3C$}E%{32k=7c;9#>kp_0;W?g3V_y3IiW8d_3AU-V``T zV@T(Is%&Tm;&1x?55`;*o9hm*{StrAd<>Y0JmO1SH1OJ1PoE;EOjiRTKXm1d_5j2p zV`3vUAYs-z3r0`7diBn8bu}W<(a|Enjoq{X*r1x$mrx0#Y5ic6>nO2X7g)}b_wjvt zDcIMu53?AO?=Iv;UWW1r;z5qFI`)9spVewhH-*2`?B!EP11I;^xZ2ag64me>)sk6% zOStU9@l&wlTF4G5+2zwQH8q6+9D1q^mVLVwC`WbQ|ZanZWe~eXj=3(qcbaxs^0!v6#QqY`b}h@PqCXqD+b@DfLGIv znxiXr#ZtwWeIFy{^?fLhRihn!Y`E3FLJKg>Z#<)3*Q0Lnntu3VYJn za^HdBv`#T!UEZt%ZxLBvW(5{A^pHj&+uJB6N*P3L6XavF>k7z^JDVQf@l{rtW6E0~ zj58UBz)htAIbtFeDb#Ya;MC~b9i>fukP&~p#8qo|z$TRB12ic*|9S^8>0mTR{wa71 zquWpbG+k_bIS-c(Bz@;(sH1ENOn0uP(6h=5iP;ay#(ygl`M1W|)Ha#9qJvF|WsQ>Z zQ_O{@z%0SbA6Q39A;P&dN-JOojeAwScg|h`@L613nBT5I{-46HgN|6QZL?`LC}99b zS)QbOaerx;z@Ql6^!ZX#&P2Yr=uiXK7^7knWqFPXR*)ih8SPgcS<5acMY(IzfDp+iBT8bvS^l^A)*Fa)Q_|ouk5>JL zWSwok67R%du6wLK0(ayKDCN~=Lw9vY4;Al3ihdC7@>Ls({cvv9Zup&VK;|@mywDvu z$YweZK-Hr_FVs+6bJrR?P%!-w`5#M4fDqk_SKcQG?cAcYaFOlC>KwfuL;WV`liNXI7hQ3 zZAx?%4r;teWCgKnW@uH3Lh+o_@A?v+P;H2R{^i;C^wxpKmm=$5^wM3&92hT#5WwoJ zzWD(uI&dhkH&i&OK7l~&LFMxN&W+P}uD^w1Q+YNln9u+A(I-N_q0P87yaUG~Lgq04 z_e^dxzgs@vn#Al{)qG*Z*JakJH)#ult(g0&eSK%;JV1a+NwuH%Sfh9dTQ)L+hpT!g z+Y_S;;9IAg3HnoL4-~$Rj*Y3kmw(y3-g?;ZiJg!NVPkIDZ~4{X0e3X*HYe4z>O`sn zt{Pu;Xk$Kp+T&a+ctkXfE6vRp7w~y+%3qg(M}yd^F{Mq%>)NlFumydfE!>+;0tQnd zL>N`DmS__0(rG!m?GKLh^daL=YLyTsXx(+N88ZQfIc4@fyr7}U4Ln7O8K31#^_ zOPLIB1qTr#bH6val++iOw81?~o0@!+1N}3lR7Q_8{08?jO70wfzmiybUVN$Oe8nG4 z-Sj>BOwPgl`~`Yfl>IVI=tGy~dMJi1F6`NNhNMs~6FNU>2=<>ZGg;iRtwUhtb*RKJ zQ{b+D-?MSuMmoBQLIS;UtABu2$?ahJ(Wj_hNvJR`CL>)I71y*{-kNSAr}waB z#%c6NnOL8Y=T-#M%#9}FbAOFCSxtP%FYvksRg^6fdQMHNqK=Jz#wZ_@{Px{6Yz!#u z76$GiNgw#vvbYz7` zubS>=Fbzcj=M!_wfMCNK*V>Y&TMj;xPfATsD0cGl$q6equg&fYgyvi}!L?6DDRdoW zc~gp;sL%8c#`gFVu~g6?wt=>FFxlR;cKA zqD~?9jxEt12!sxzBrl`=j1T(laCda-PAKg|Fuj*$Nh(;?vUS5f{MiCGbHog8Y;5#d zIF7}hy+_92uFM3G zii3!=kB?8fm3}|SNcGzb?WQqfV(gN&*qeu^`UB_e*3PW?ZB<~g3)kLk{a&ILwW~sK zd9aP==eiv!i?yfLbghq+eS8(Bst9paCBTX#lj1nv2Z~vVCv8`66?0MVi?2V2$k1I6 z9*^-Axyj5=^2KxWK1U$oCa*zoT0sC=M5Wmr8;WkA_F*Od-h&bEL(k3eawSpR_wCL% 
zvQ1*BZ^=!P{dw*wpK!4WFzKKykn9X*mM?7$9|ru4Cg*~+F)@BWeU4zYRKdC2X-AU; zk58tHG<=z0Z@PklxG}M#X#+1;xmbT<_=f(YPni@|uSDn|n18-rCw80<_OI*fe<*%4 zQrs%~yaHNd!1IY?z@fa01Nx`$GlSF>BlwU_E? zgo=i1G>2X!rHjYSJZ;Fo`m8-jY$&XryAB;#esO+ciFbU8UvdqAjm{M8A&Sd=&}vGz zeoYwrS@*<7^i(z+r}3oV-U~_M+z@4_!o$h&AQ$ zexJM`aPr)w=|VfUx3_m*iL51EJyM>N;NQk278%vCnX0iHjreT6k0*a1&(K6moc=ll z>_+qB;N-k?C3$PppTNO#Tf+BXRsP_-CW*i5yfyA8ll)j(i15;iB@`^cK@FTYCe<&z zeUt$>eTc$Dhedrr)O>{_;QWH-> zSf++~Et7I(f4Mf9Dx3;iI!g{>>M@ds8RFX@g+8ff=D1@3g-Uz0=_!BEku*7}m*Ba! zyu9oZDbFV+=9dW8mM_ZBdSm?BA2|1#^lUfo=3EPc;QsmQpu_euI%#~ED|&$!P23t= zF#P83lmp(1@J1fR@w>dw$ z8;uM)p`L3!qb0?5Yqgsb=Y~-zm1B>jy0uVlh2rEC};eq+1#!g;5$Lh7OUGE&*YrVF)SdF6sK+JkN99bKduSe_ULb zBY)@&pZnf>?X}k4+oMR!gFNHyi^|r0eTi-jsA7(`>hCd0+(Hk7A7@Q>@XZ>w zJeZ?vlWx^$^@P68j=NwjNFUNPr1~PNY1PPjYqX?ysX&TyWaY_1xAryO79_~io!I|f z`ny((&XWd;A9v8`zT{$y=jToMVjYhC5%He`|MVmHJHp?}$cFH+<*?O9;F)Hhl3A9n zR@=||A~U5zeSy?!3Wwn;C@A2D;?ta|b23uk{p&8@LSrT#cbs2wB^mTCiX1F5T*blN zQg$8`RI5}SPM#%RUc(XCQuZ9IgXtxreAHQ-*O4gFI>F$$CS^|%-ifq7o^^B@@oXbt zxM!k9ES03OL!#uf;dt(<;NFJduc{LLm4uU2ngOZ>X!ZclGBc|XY0#`Bzw;Gtw=&j-_1oWJ8Znqfn}5!aWFMPhL0 zqLAJa!f58c?>Fi)(tdU8q*uI|D(ffqW9o1(hwo3s6E+k6w*Hy zfsS?;>=e5qH}`$zoxgK_Y{f@~qD*WdyytJ_t05G9@ES29pS=iYsFa`uHFL^nMQ z2ELP%V-N@g+>UR8?=&*PkmJA+;8Ice|}Tw zq8$ivWw2xf1dObc64kCz`Bq^TkA3(HtWUmJT(y>*;2|??TbAp+u7(xeLPr)kL-FMO&WtgJEIjegqc3A#tu`{QiAx67DXMU5NcwQ9h#*L>KOYuGM>U;kl5gMA zV>nkEP|h*XMw|UGB6b-5UT5?cKXar9 zphg5F|4yI(&Z8+U(fr<}>eH(`iR2s|e^6Fg!K6;c23>ZW=-o`l2K{BCB1W>@6g%h? zMYM;TN24DF54^{NGk##QEPdVDlCNJ&&}(q8=!FWI0Al)BS+1cD-gVJ? zzOf{8G+>_yUrpMh03yRH@O$gY=>#113yc?@vSF(^YQ8EkSJSNK(54t8Y z;b$^Ek(a%_y?LK2(x0nErc$BtN`|Qr4yrvRq5+wz{CH>fI-(#PfN}BLXj{>q)}qb~ zR!C^EFJB}GpWkx=ah*n17^e5))zRcxf8`>7plgj3E6!>bVrT{(;v z;>px6OC3HwIJyw;CLf)a##JmTS`E_KI_0@T#OXtG#DGC(ur*M#ZaM2+{YX4tGkidv z(WI9>Klq3P_GSHc%yRiNl}XC+W}bQ4Jgm(u64PU)I6rxe0rchroIc|HlARniwWg#a z$>ThY)n~V6CELE;Zz}MJ=H`ua*c|n8Z7GY(_Ih?D0q-^*qKeXU8oAN9el;$%Cu`aHD`#TNz!Gw_B*21wmC9eI! 
z!NJszL|ce3xDoy9m`@Hy1#^J^`?aIvULmUkh@`X7g7cgb1l+R@*d+?vXp3*7El7Vh zD6pz4k51hDU!uO*V7P-7{`xjzA-odC3x2(!GFq!615n&Eva5ge#$`^Kt3p&&5 zH3&#^$g6#_?v!*`n8+1=*n}Tsyvy;b>^(1EiBwWFZ~yC&BBZwQ;v~bZ%$dnaduaf6 zV>50Ac~Y4Zt}m%DA^$T{{{1>X{Bk%L$(Iyy%SCMGRnAPv#%;=)vrF5?Xu)rcqVU!HggkBvDjW9Q-a$&PRPH@=)_U&o-hQUmT;JwwN^1Ep z=<^m2=VxM1KS5rzAqc4=`1iw}aPY0nQ+v{xEvt1N5KsRp^pMh)DiY7H33=v)Ckd)U zyl1yL;mY7p4t;2wqq=#3&D;mse3@e6P>-z~`lPh9#!JxIs3h&I7q^4FU_^|Z!g_kc z@}FDrknL$=5rr8)q{t|OPXftU<^6caO^}r9c{+Q%GiJ$m>jmkB)+!A(MQsfuye@Dgz z#m}Sa{hB4ij9zBpXSE7e8sFv}ooqHBH+KVdp)nY~o@(Rtqy!A)O=@Fs}IGz;#zZP5dJhyQ1J} zh3RZ1x7C9#LY(+UJe(b8XzQ~$!rZnt;o#W;8od+r;chj$;$K^KitHuzSD-F!$bB@! zXP3)Q8RVIp!`&AvE>rH10Dp4O*x>-dyW>9>@kK&(&39+~JJJF&dPT$OooN)+LFsmL zNzd_Ho8{@iGqy9|_98G^ep9+}WkhU(b^bWW9fwFi`SfjoA6}DP|7R<{#MMAl#%;Qzjc@6HUf@QmwcqJ?s|4#@>h76@24s@ zXaT=Rx|_t9Km$J7u$&P(QGKzFYlAl>dHbxWPj}--s zzJIVP5abDEa~KxD>wFsU~le@PGyQOYPirnJ> z<#!HYXTbl2%zwRjr34;bnSA}!`)c*ux@D#Dbjabe35e|>Nyg1 zIuZ47oP`EYuz0I(ou?6vV^9|(UtN&h)PZx&-hc>l=x{wkyy%(?yyuk-q) zQ8Cfb>wgFHDlhV>(H(R&kP>%D2CJ<+awB_7m8U8u!KxWn#7x=zIWuI*v}lntNI+Ve zC8q?G@_&WBA+BN!5cb5e_9Pk3bfZ)Z6JY5g%1v$Lg!yqT*0PPa@slG3gcm*4*Zq^E zgu5%KO|u3#cUmJd18J)+1jYD7YAfRch3{l260Gv|n~W6y|3tayh0U=vYid0G=W7l( zKx03<6Yqb)!@q;x!Z7gYcNKk30kF4Wb~A{%ow3$6KX#=N;9?VK&4@+@>g1Fp@$_gfkf5y7vd<^72;D{1baCR*fZ zJalWi-p#7bg!Q{VeTRy3nuA^A;n)H`t*EyGB%%cii|}RKV`td+=7p~XY9R8Vry0^^ zIsNudIz0uQE49#VF#nBcP&aPu^)f9DDCvDM2_9a!eDpQuAf#YHJM{F)Hq*YBrUJl- zTgxSnay^D&A=BpuC%5>l(++&+PBiY3qO5bWvOCld9ti$8?7tmHCfRs<&UpP53Ah}E zHRP@Sx3J*iHi>k!(_p7`I%hl17r6qj@MhQy7J3ln~>CocA>|{4j!T3s~Z;1&)Z{iUa>;m8Wcn-Iswk36EYYumfu(8J@aRJ`8dckaOpdr zNYTpjvfax0Rf!-2v>+;dh2}w|M$?3Djh%jTW#u0i20N>-5JOs!W=j~I z0k)*{Q5I$=qZ(AC?VzDd$*>~~3;ip0Wss-z!K)sBc&le-UF4cXJO@9?I!Ew^x?@VN z3%$P2Q)Kz)G=~R1d&*Lmyh148@3B))r7>6Lzd`3-end1_?^@<=&^b5Yh;};yF#hYZ z_|Fq3ie2w}e7ANh;wG*0^ozJ`&~GuZ#yvi>_=Bp+O6#0#6W#`k>ksp*8+}ELkYy+D z5^q_K+WS~MO`CHN7F=h>E~$&G|G-Ro^?O0kPi36tjYX5Rzh1FJZa@4;MF}^xAh_;R zCZ5-$Jy2$m&J``0>$F6E=ZPfIoubZeG>pgnA+E~j${;DJZ1}WhtiBNl^hAT$j?Z9F 
zZ?5dcApAP~QR}K%`m8{Zcr<@3cg;Kq@@${TvhvhGZ5)D8LzCI`E>ZA(iFW!#JC}du zjqlI`8sr0Ewque(as)hM&>QO+4tY7Cs8a#V7iRD*>ikfn05KGLi#dM_EArjD{q&5% zmt~r@VIBz;1|K~fjMD|-)(MV^LIY7R8d+NsR5NTo(8x5h+VZRTaLQg*1f>S6mZ67n z^LzvkC8g)h3snFHn2B33+o%#H!#Wn4lRW;apMKp*{F}1;Uw6u15`B4gcOnhC7rb>Q z*KAa#e?3U5JQ>hvIn4qfTwM-HmJ2$FS0h<8hk}PG1`@1HCmS5YL>Ivu8p&uHy*tr! z4#h#9jWm4sx07({O}r=lf@3-ATaxmOsG9xwyxY%>bhx8_jbdXUn8D9uH9ni+xIbQLjp*-A~AA1w~etD+)0 z9rbu}0$q4~MIl`HmK}@US6;g7dx=E8CjM+RDQK!wf)=34uGIAPeZk5`kvBUtjS7fT zzRu3LbjWHx|9Gy_@J-S=_z3eWpx=K6F{49o!$70s4gq;hG(PECe__%M0iWs`FsnsJ zlVTYlUat~dKlDA{xS)^u!j=dNaj=K2J9z}z${$wZY^W=Gg^iSM;~z90RpL{tiX~>) zA2Qcirfa3Wfc`9dq=#xQ_MDr8yq_GM!!tG(1aX=_?fiW{;1_!s0hb)Y9OSvgOu-H! z-oL5Czh2`dv}mvRRjd0CxTJ1eHL%%6xDUM?I&_xQUsB7MD;t*QLXa3vZ5JYr>e2!k zOvMb}bmtP`gyaMkogwn4?eI$6lU(&LOIKd|Qff_~6sC0)a#*r&tnbMQN_?$Ql@|X< ze=EXHb!0d_E%K09Xoi$S>u%4Fz8@KlKKS(-pWh{Hpbx%3A%hIaxF_Q5c~bsrZZ_N1 zZmNu|@dHU}?}>Uo9wTVN zPXc*};S-5-&AKA&{vJkHkI<(kt&zFNH&Xg@XJ)&L4?>1z^xe$!mQ(KA`!n?>qGcMKDjGWqf+Cim<~-~%I8`Cw0*Tzy z#X1a0aQN~c+~hx{fhwo}=>b=o=4*^R`Rd+BKEZ{%xPu0^jo1&!#$I$CZ^C4?P8+A-`$W+)P)KDC-BK8p+T1mm~%eZ&Z@n*75a+WnT=jWk*&2U>}N*%Q$> zile8oiwf%f>o=m3n}^4#6w;fO_qzM|lHWc3cunH@T_01hK#VCXA=EaF5piLm1{`aC zEV~N4SGQq3HcDG??moe*lLdP}Bv02bP@uZ-0x(u(8vWcn*TxOlN%%_kRoV9l~K_0>r!tesz{IrKl^H8?K5gCRgf~?d}v#w(q>b^~E?^ z+PL5W+fKgb_we|66W9>;r)dt($Es}g3kLmZIq_^t-;$y?awRmm6Y~g@TT!oK`3vy_ zO9m7o`S&a3e&mg~D|k)<@GMVf{3slRsqQG&8|2LVrdXZXpFQ&-?~eA`WK}ja;|m$q zD|xzAJqztIL!!anx25%lki$pjS6|Ex>~5pNS<>A@fu+4UFw7uzIL7GYW0lU<50BRCz*Q&#f;mSsF4Mq48;m|5FD&^)dCIv%#n6fO{EDMs{$v-Vdn0=~o1 zV{-qLFs7o%!bO(bM@e`VppNp|+S8-iq!S~C4f`HkC0>ghF8?YR?B)aEd@-pBnWA-(J zUJz2)qd`Tv1d1!CUzWL0j>r#n9?}F+)ClFA{ZbX&4pxCF&6!ZyFV}KjMS2<$ShTff zK(~H-?fe*h!3B}hH4)Ju#}^8uAEAC`Jf86iAY-NVzE*p}TmFS0ZspP60}yO2)a6ET z``_O^bcD{M2R|mK$@__C@(a6eEo$-$QJPV5eY=rRMLi|eH}doC>`EWr z$QmaW=28oW`XyXM*>h>(mfu=_Ii?ajw^*+yDnM%BYrF5YI98WW4vB3I*bx`m3|`!37fhm>ZK}A!|#M}w?3@`HeS4|SY4wnX_>#NIe+vhKrqy)wbGe&_;HEgwnYR>Q@ z>~7XzmgYd~A6dax1Z+J9P?$!a=9MjetOf}8{>uxHvC%0H2|W@iSC;)g_4Q)7u@vr^ 
z4j6onC*)4zMw6gy5S5hfkLm*KqmI13&_eDvgEF!1uf_F1*f+mg>)DBzJrd?3yn8;} zNxrA}@b;33JwHT8q?qsJ?pPGZno+m#A-qMS*cUF_{p`nKp~Lh5Uo@HLou<~zsN*Xc z2t(tHohIFw)IFR1bn4w&s+-?zJ7p2TK#H9HnLf983*?r;m(MJL^8A2-0Fg7yjm$Uj z|A3?xXA*%8F3}2I3>K(ssc5S~u|t&(CpKv@L)6Y67MdeeW)?=->sA9VJ?Ry4)?MjM zYaFpl$#xCXCVkE3jps=iG%gE8?=$kp7e3Lu^K#-nN~N1n=wJ0^!uj!*vO3LVO@ce+F1bPIV`(q+w1JgEg+5E6qLb7kEy-`O zZd*igSuom36ESklm_?%4X%YD?Da=Oq=%dZ^ z0H#5)<=y#q1;>p2%?f#8z+|XGecE3a|DZ6xnQ+RV6uA{mi6D@~iV0iyxOhrg1Q(n= zXy8wYqzUDby1$Mw@1p?q$hAt%MuwKgWV?fHDF(9A_JfWv=$ogc(p@WniKak|h_Q@6 zC0++PcCG2BIj?(vn4Xzf%-VLtnaA%S3*2pI@icEpbI?N<|Q2B=Y^X_?) zjyuQBUI?0n2fHyO=_aj2|2wVxQ)+eww}9pWWdYAWtD+2ob`+vK2gjh9smy?6_W8_<#&hhF3}4@Pa($Yc}!IxzJ!Yvc+=8}CIL?)fygB9ED}&WQ%^RG__HmIdo8 zmWmy-`$AiqUy2j~VZQS{-Od7bBqto!#1oOR?E*eP*eX#S3JxRK7v4!`ua^uH-z9T4 ziZc8rov6LhVMs1%%b2A|)P{_Pq{x@ez(=~JNn^o)W_>nD zH$(3{6tVK5?yAx?9dBNe#s*8(_$})VE+(A{qXP#fL=F^|6nX#`C%)mn_ixSqA6(4+ z3e8`uY;AZutSMKMTxZJ?<=d!Yg@O!s2-O|pRT}QthojC<3#U~RMV$N{xPHAX%R}EN zo_gaz#vMlZ$+II1nJ{~M&>rGOYis|q#qp%f%kyrAYaTsXg?0h=;kKn~HQ91ZC{ zd8ukVu+mQf+jXT?;$a4qHKw4IJL!~B{wV1*3D9|MMwilqa$|ZwH-IEbF8v1#!#A9@ zPzQ_Eb4uqP>t`thK=b!8teypq9}(}_>j2pLwxB!P zbl$Ohy2T$n+A4xv$;Nk~JLj~f%Nw9`BMUy61@~ki5_zkXPA@{e9UuJYIX@BeNI#Bz@{$tE_F4yen*0P6#6j4tU{sZzZj~m@ z%lXwZE=?HI=I-5kSH;V9!4HduFq#qhYB`(F;Cs*oI|p5wwm+n>RXs9uRm_bL;o*|LsD!<-f+@ zuzZ@Z&GenzmDI)OYmNI#-Tl_|qtD&S^H%EDB&5Gep52ciPtL-~SJzjy^|jDy<}N#y z+#o19Rct3&zc(abmM8ZB7(|w_KQrTCcul3N8sRmkuW6p-3|oqll5H;$ef{VYU{&gV z@8rcpGc#Pq(B0G^IeTznp8Pi?mbkTUp}u{0tS~?vl*ut>+Ez53RL}xhAdUF^%J|G1 z4qGQhdi#cyC1nLwa4s!{BK+<4 z5$3iQIn{!s9Yccr=30!k`w0$f2Jp9r2JA?vxpDjOdmV4<7n;n z{OGN?QhqKU-ND31Z&TVLbr!XGmXg5Yq63Cy8QhSbilzeiqOY+J(flcSn=OHm%BsZv z30PW8m-(5$TWOD_4Q&^^T-I{mf*&t2A}`*eT<&}*z2Ym5rk|=dY(vt@y_bg^@_}h$ zX>n?d{Xbrc6$KMse;V5QGtZ_kb^TPOuUJ_2Y6ogjl)#L9t^z%MM&bo_%M`)Sync=k z)jO_T6reVAZK@`x15n!#6S9VTkeg42eA)tCDAA3TW=TJO}{V#0UN(JKfjQAKvPl|Epx z&&8>BJ|~W5d~_G7<{mK@MoQT3npHw*N?3nIiS>hc%Mv}RDEvc1`Uk_hc$qu#kUi^f z&dZDQMTcCz3rWlJ(E$!tr+mLyXmC>6mvA>;0-#9G_&-jIkLW|`2!GoA4E1l+T17ZO 
zTPh@p|MrPsI24+-xU$Qs5>d0nlcQqsjZF~*SaU)eU>%Hgo;Z{xRJA8$>_h4l> z4WlD5IucAYv0AT;P|RvM2Bf0nZ)4p<4X=x{QVe1iW9!y4)1R*DN1hHt|2|@s+rqyA zqs)C{7mj>2MvaLRm0JLo`23GI=U*?fnH)L|_3BdY|E$541&66V`Xl>z?|Kd@y7$!= z6wNfz#bF_H@+k-Z0tTY*;-o*YS=Smhy2C1;pSSUue4<4wll?*00B)ev{jbcMUD7xc zkw?ZsZH+IN;;mT3fBJ}2T2O$bd5;5c1K`9^QFkO8FXTfg{~srU-l21YbL~$R z?vHho;2hVkhR8mbBEe>Le*={jlnu`qN1d}Zm!q*kne>oHqtmWgbLbW-FxoTu0Mbr_ zmIPhRM=kJlhsSfo2<@e$uTqjnBGNi0o>TJ1$vjv2vpK(WW4nl!aHqE6pU?` zMvRzYFOa-=u`nk!=MEjYwTIro&}n~fzk^uvA~EdQM(5wqKa-@!N(q<#>V4Rm z;A>XI)etBQDNcAR>E*w*BWb~K1%#9?MTmq^02cD+&$RXPD>pvQ9mBP=Is{S`Cdgd# z>Bsp6WSD;(&7bpj)$V2*n;G(Rh0_YOFfVDW2#q)S&Xa{BuciK+GuVart6@RIK~MFr z9mQso)H8g!sFIAlz69NO-J`gNAp`0Jk|M$};^HbGtdxHeW0dT>DeJ_6w$P~c@O7gj zxYN{_|Cb^XH-cfM`d%Rpe!Z%1e_vy8`hsoTW}`F<=i5*_hTiR@IIzPT*%^yCwAr2oYjZ?9JLlZS@*OFjN0#K#R3--O3 zWS404sQ3HMML4!O&OT?w@O_B<9!Lh(h>@uU&>F!BXlwa8k3-MPKI8?TIerJee=Xad z$?`C;5V;bivqk!5k-)+7vvZkjM;oG)cu}y}p`H~HQ0g}5H~o=uM4{)$9PXdvH+)pH zyh{l~KUoF%*4X$yhUJgZm;H}!z zU=ZXI{ggBd8ll+~ZH@4-@Z#US&7gSz|H?X|K$>1qeYZcSk6iQ}?-{i>Uuq`KL;wPs zkD1&d^gi%WlG|?Mk);rlfXxD!ncF3C>*)q$|4KU#g+U+JMP+TMTD zLqr0X4GJr|EK00?q7SqOvKf!^44EmcC-< z`b3q#$(p5n{r(|d;^HB7(hh~&LSY4bB3RIcb(*Gfm0ay(rvg}7U zE)7{2^fG?MqrZD&7IaqVgZ3Ewt`(kxJYcH#j1&30x=u#*$HSUR&JO@OE;ZEA5ey7V zZ)70DM%PbDfUSE2#F6@(+Zbrc)(oCT7iM7bQ~jXjMRESLd^*?|#?DI{)}<;VD>79* z+lBPYT3|$^(r4vD>Pb|D_wYARWz8@64Iqck0Qn-k-aPQex&Xmd)}#3r=s1=72-93Q zFw*aV9SY0N&12nwAg=UlTwt-iR=7wA%!Ov~ z{EA=qk3}5u1`JWm2w&yFK3=*b(m(8?sl*3n5(<5U2|E6q%7Ocsr0V^6GB*eJvR~=( zX|NGLNW@1|lK`uLc#}?NpSJSlz>f$6@C13pI*u@j$g0yXD{GM?~e8`*uIx9>EJ0+`)cn+-J~oaCsat~j zb63Ym>#L$iBLmMlx;YU@@9{<1VR5G=-n5FIvGY@ufY3zZk=`0bR%z!L*r3 z_)fJyh`3Xc;r2-XBS+Mn(@TJN`-uh@o*r=BP%&lGfR&E-o^5`~yJG@m?b2ZbdHs#B zRpU=)K4A25DAC|^=|3Z>qKz-1ln*?!uQ9**gEdO1u+{m!Oh76A29Er6|L_9puZ|0y zo+TO^5si461~BC(2Qh%Ck~K(_XW(M+1dh`km!rewTW$Wci~GZ5%#m(EJ`1B_$f!TTvIE zkXIs{QZ^u=VlqynTZU38XJ;#5TSg(^7Aup;BcFXu=|4I@8;<25*TQ9uPr3jVSr0An zi+7fQ|IqyHb@amC2$^CHj)cZjwAgy4i_0EI{Bbn_?Z;A*-5D8i3x2lF!I8B+2GVC+km}4^P4pH>> 
zigu~~+W8e5Rg}!YJ^RlJSdQ34m%hK`N;K+MKrg_(JCqp%KR`J`2F{AzuSC7>O1Ra89acTB~F!vyKSYK%X{Kjr4^U*|;SC}&GMzcx4ToN4yE<|5uM zxv|pMp#ruwqGY5{>TFM^g9z=lPQx@&Y1=ypI9Kr#%>|p$jYC0ZDw!t-4^b|%`?!l(2j~j_H#-5P1KaXvU778g6E3!Avai}#KHa@$+K*6Q#U!C2k}g=*;KKj7Kd+nVVEmbS_;eMdf9;fKidvUr*Tyz;Iosu2{qQhd5aXM{HR~T}mDrUm#R3T4kV+7sb z+_;Ya42nw+OVe#ckmqM84cjr20p|nBpeY|y-v`*;vZXrExT(?m)%QULa6fk+G&s3* zRs;F@rnig#8yWQBFS;yEIfxjq9C_mx4=`AeXb|_e!jpqbebi-H+(r6hUNA2CZSv3M ztj@I?rkH>j6k+Gml}e@yH+62Wjgb6n#NX=0#%5F?fxjU$&LNtv4iEd^{gxizG@gsD zd2!TCyH1t=)u3fy7(`gcm{nuB+^7ycJv-^|qNnYg*QQxEnL487KFD+4{0J-C72cWN zL&eR|3NDnwVd?2^Ttde`GKdGgo;u-9vHuRWi(U|jc?n&cQ80Xj@HW)mXu-!)d~(xo zihvKfBwmET_m3Yve|Cb8_5xcv$Ww$dTbQa`T-lm@MnNy>7t!%-0^T^rC(O);^ZBcysc1pk=zK;)!NOh2J zh-9PYrk*y0btEF={=}VF*MQeLEcL=@^}9b(IB+uIyrix`IZsyJ=Td3#KsTbgZVPTU z=*V?snuz$0I=7J%u=4zB-65vd+!?HO6l{|pytX8Hatk|!8 z_RzpviXQUDxbl>YPUL=bs_hH|;4-#G(qgU5e-t}&B6$Q2+NB^7S}W1&ANC4Jb; zfKMPjcMTTziW8O#hn1Y@0ldhLdhZ3)U9J~Z8T}T)vi;edLAxnl! zX0N_v&vg4>{Z=Ger-|q92hjyupa~|PmNw`WJ;?ae_z5s5LJx%9cTpZbM3UaIVFtzD z@_>uOE<@|J;@h{2{p;LiTT?Z(PA9wb;uJmsA=f{<9(olwy6wy$c5eocGc&vi6*Bj& z+^F+OTTwh3tKQ1vsi?kF4F4)4-4vc?90?jbu#w*1I*&YHJvH}CmB!Kik4-IT$)NQ!gfF!nK~s9Qz%*3`?4_$9Vchk*azX<&s`J!O11+ zkH1g|=hr@;3cQmZY3 z2xUo@8|0vr&I+6LH}Iu^DKp(nn;KBn2hN#_i!DfU04Q=(_1I|A|1^@#9@_&E#O~(? z2taJof5J{0a<&6*N~7K4=mX$tLqH3TS$6$C!S@%YQ9y~?N=l+k;(oT;ar!PxUv=Z~ zED)@mBuIPnPY=n`&ktWa0eE?hcr`Hp)OSnol%Z_f2T_bL#zcP5Vva`KbBp^quMD^d-QK*M(a6GE2=_f8plJuE&y&6nR$# zs(i~D()m!g{U@zsuU!|jTkiYEF79_Vi8yEqxcmK)s|0YF6?o3b34< z50`b=`CncDwRy7t>ppzfGJu61ATpaL7uvXOQgoQ+o>?Fw`LdYNp!n&m^)!pY?izB( zW(oBWWvGecBDXOzUo1&;XcAo9)b@&N|M5;S$ur(9JXpR>{5ElnfVtkghn~B;cKEn_Qf+YWDvTQC^yTPm~o$g_!n#pQD$$! 
zr$CakcrFKp&nKb3$2vsI41~Y6$aTcKcdi*7q+NOKsZS{E7rbz0Q#_P%~XpuZ)|<=+o{@n*TP0hBi29pqol)_I3Y* zcDNHKCMKr9f^T*v=TThg`aI|QiRaM=oQRPOmd$?g{-J+)38e(5pam?z1?CaoK+#D7 zR^;Zu&x$);iRSmX^#JKh0rBs1TGYf=mPT(j6Vxd~?Wh%vD!+tcBS!i^!e;G2vu@1S zI~GZMtz(NMs<{2BEI4&J_f%DWsFhKjhdOjMa1Q^o|UwfSLo*k z)M#avmN`6s6x{02+Fxl@sPL=7wR0VP*g=$q^sSR9`g;Od^2Xnr+!bg5wyWQwRQQ9H zgMH`?oFi9bE+zr01kd@7(Em1*%%&|qR+fKw&Jh&&95`HjW5S&E=c!;lO4607?GFHS zjQV8ul+USDZ%x$AGmc}b+O6!D?p7($*#?#Ql`O+q@kSm0@}AW|xnvkQ@!)Ld)?-Ba zvd2swW|dg@K|Fer8P?6%$|~6U78U1BpGsP20vHPNh zlkl;w0%r3RAZ6@AkEW`x6O&xf4(-AEi`2guA$a4XO@DDBH0ycF zG|AU(t)IdVQ?{*1A1pp6J1o>{$i-AzF2Iz*zfhdF8GJb7`aPQAT;Dkwi0|(775NZ` z;C4tvdpok~hP$i!(i=tuLJuK$IKiuCq?Cb~kNNQ9>_9Kp|0RUEvC$u%C96~_^S^q} zwTV6DiIMDzeks{xIi2kyo>Q$loJ1=o@m1~ddJ(x5y4RbE_ua7is{@h#e&O(!(*4b>&zQ zU#%_kGw9`W!&~;cX#F<9bgpQojz8zHp8myfzGsz=|)a+Ok)ZJ(v^FHKv-pZry>_$l)q zg~9jxJc6{@nyE1}z{o!Z?4W#CsA%-B(3Tpe%$a{lLayDz-@*9PQz)cFt6;)*66je2 z;WkRuMR7b3$wr--ZCS~%IfF45TqSRrC@JOW-U`8{c`N-x_+P38AwXH(Nb=PJT3KC$Icj0?@@u_58(+4d^jqq}lS~FJK=rNnc>UhsEVzQmFRU>FXM9C7Klm84M{olg#S>Pl0_ z_xPS!$Y43X1O36{b)d>0Ve-2PI^eVz80-ZQ{?=irYvwe7Af8(C^TzwoyaoU#0rbWG zl9}XvQ8NS9wSV8=|C%z_8=3>>8f<(fbIQGM;ewWtKr&cQvOU29L2xP=x+Ejf-h_@VE}_dhnN=uVCP{De?ktbzBMs`qPpV{Pq8 zJHycH(PRYib-qogE8o2u4l)J>tW{(Nu4l|xi-$g*pOMK zC2rooR5BII==LoL<7cw;=PHYsCXJS_+w=QhXi!wga;P*Gs&TzJweXC!@V4y>RKQ$T z@H$y!V>@kf`6p*J`}U+WV&&<9L-etiWDKZOxiPnVKVWTHX?!T>b-w99t-M-OAeBb{ zujuq;3_JC#Fk*!x>-1gVY;fjD@e!^1#!6AtpWhf+eUA~7QAfYIh1w9Koj#e-E}7+x z%DyEDqovp0WyZ212u=X_dADp6DjYs)64n+vM@mnfN34aXH~Pp6P>$4A==HfDVC_S9 zYBf!a-#+y>r4)n&&km)-ifH!lUked!6|8p*4>7!>C&aqxU|LiCw@KbeRZ8lzxaVH0 zjucBQlk^7r#RJHanECUUWhj2FPox*3)2P9GA^sBFM8e-m($0?FH0DEjC(rI{|mPTE=2%rdTh<`tb@TLg0sEY3Ym#=y@pF}Np=^n4xgv&G& zkzpxZmnb{Efb(VV;Pw5ohaF@9ENt_&;qE?B{n@;>v^e`mh6p2dI6gC?ARc` z%AYO>^qC5V-j+u1g%)tpfGwo3lY#(1_fGd^b@Dk_^}|oUrwAq`DdO_j-~%B@^OQ;gSCT}dL`8HN8hc?MZx&?2)s<)=!FI;$l*%z-i_SNS2;`#7T_fz zOQb}8qk`F{yU*%wEd$K6Py~P>EU`TT8j2Zs#i0D3l4yZUlyGk|(pfG}m21^| 
z5%7^B{X&V$M!S(X9o1|!j;4mrJ3o6i9|{7N5Wjz&sSgNT0ZAmYEPMyhja>5anOesM zd*G(ec(iz04+y~TkK@K2Q$tx1r6a|c>X0SIqWvF^{~udd0Tt!ewE>4x)S)B<8A4i8 zL^`Adky4rgM7pG;V?esQLj-A1x`q%5=>}<~LAs>>GkWj;-S6|RSvRz9!pAs z5#1N|Dl79)75cDjlpC%4^1SrMZiq!!f8AQ%y24O8Olvz%lo#ne$6+at4-z$}#n~hv zDpr)0yp-e}riYRv3PNjmNj$*@bYPerzp?(2q@m%E;wxgq8=}X=ic%phFo_4y3aHM~t&y~8(;7J^s% zWn`W(@z_;jA2Q}ZLJ)#Q=^`9UN5OMWaPjbG1l=&0;yKb*SI4Qn@g5HNSdFFOo{1bQ zb;_OCc%=#&>+uIJQHmu7mjTLa6|#8>yyT%7+$vlAt69-ux{V(0(0NyTn(Y35J5}{F zyQD^tno4-d}O9&!NM+<~>yw8@`q!v~wl8sd!SkIhkfjawTGM6#4XH{1VjzN34*Tg0EKp%aG zz|3MUwiN%@#L=DF!utm`qjFdd!b3M)A}t^9vP;(k=)i4fX!1hH;oP(?kZ~0)=t=tt z7UN87j`m0HONnkYM1>@5UXRyJwZ3u?eE$lp0C(=jgYEoM;`ic?f&lBInqfcGZLbU_}vRy1;oZR68)A5{@2e{Ey? z0{m@zcr|Sa2)Ege%W&x<>pN-0#Rf0l| zQ9Qu-A7DtGNzlm3YV#T3q{i$UZf%ZdoW=qEsKQ4o`*~NOA&BlX3pQa{G-j0SQnaTY z2}E>6>u-hJys*|#<+$K&*y2q*&gpi`26-Df2W2qzFVMwYCpU*rQyUu)mvB7BGJV5S zOybgI(y|J94BF#?7*|0_fMVeZMuKSlaRV3^Tt)RrvB%)98v_~U4NDEg{Vl3!@RI+Z)?CPWAH1 z*RRw#bFw6>n2jANZg(5hboa4Z>v<#rD(0z}#<>MVO3={H#PPkc-og=hpNRe?@Y2f$XZ=eP|e{;2DFFnh7hZD`ae*w(>YTc8}5w3opc(7H1jA^2gS1 zg9|=c`KV@Dvd_3xmZdj*pfu2V%iz%3h8=PB5CvY1f}uP;y(p zqgU~b@_X&QurBK1F7sL-+vGl<`m>FX;mR!NLlhac!R(@YxKd-x6}R;}X0;5s0ukq20v41iXqr{^B%VH}gzcP7Yd#QC2t- zWMBELd4fr`FQZb+h)pOxXJ9zoFGB)63)UySgZy@e9Y6lG=KU`XFaR) z(bN?x0_{pg6JkD&p^Pw+uOI0y@gUAcE_!q;7f;<|$3*OM446_6h8=SH93jbz4uj@z z!Gz0{KoG8Z9Ec(2uh;P(;&^#cqq}`!hFldFYAI~y_?JCSfMKk)#c$9N3+r&hKd<2& zbuz~IDuB4UerxpVEiA$fhgA8s2woZe4=Mq8xM1_8=fp!_u!hEp7%MYV2U?owc6`(6<=6YGlYPfk4V2A5%|^+qVeBX33MpV#9= zU(1d$bj)MXobV6RM7$|3%78U3!Ludqx;)Tq-GlkdBLstDLeQlK0;mzny(E zZ2qiDnHl;n&Y1`Bn_F((x@E;Uw8CMDOMT<|x5WMrtm`4It*xSXW^SQeRG*%PooJ*& zp5W|g(wr0!Wf)#vvcJ4@fq|d;P76#Cg8MCm&Fa76UJ=mIZsg9Tj6Mcej#kfK=beJu z2eH~43jSt#M0~?dy#E(0?Kbw1{&1V(nIhQRlSr$H@-Kia9oRP`wm0(?90fKCYJ zp(!QoBLW)0W$1SS7rK!%F`N^o$OAI(a%bg>&5l}pgr+QZy*^O9z&48m`T>}rjLc0+ zf=N8hHq`5$c+i=m!Qlmas^jQ#iC)APuRtDV<#HGCCXwfV<-UC9$ZnCpe$3=}qlrQV zr9t$^A7;A#wOr{7-(dk$uIwh_;WA}`;k&r=wdmJh5g5IBERN5qfX)e$^LH6y_4UUD 
zZhp`7dtWetcDpTVZ`l;om)z3~)icA_0DBEkmBrnuBcA#hl#CoQ*WlTz6W%ia6=eWd%G2mXobc**Bp)`jW z&NoB*7c4z2*~9)|%F-iFjQVqnH+#QvXqVHu(d-e`jLH40xSUdpKEL=hb6y{h-eXT6 zh20T4`2c(@)~O)p*wstXm##@|->jhnTT9m^m@3-A?NpQxUu8r(T5)(hJBA)P4ELE) z)&aiGj1B02l04QeLaqDv|Rkkk{RVJe^G@UlHeI1P@ zdj?3$r^jJY`HAWU;pCsdV0Mx#+Vge$qH>c$ z^1Q52n~Ug21^-IL%m*!3xD^Q!&X$o3*Or`u;dx*XGR-_iHL1_hhsdYXfKKM8>pB63T>()R&&KXS> zIPR_t+TIry--bTwZkM<{VDH3RBV(`TQ*MR;Riy_}pM2QA<_;55kTjPa}y zsDVu*#O)flKQ0s}VF`Kmpe zNC1~VoefAFVJ;oM5BcOzgwTvsEY^U-J@!!)PtBLRjLV>TH7Ht-zxRz#xziI|Hsa_$ z>&eZXZZ9ZK_AUByH>zKZ^YK6ZV1aUD2k2cigk+hBwezJpN9zHZc4mXxe|OH;&f%K}0~n95f8hjK$S@ z*_pE8D?l?-$&&~k97t{y$))g zh|NH6?mo%uJE&v~f5l!}A}Mg@``z;E?Ie%7^^H5qXIs=`N97Rb!rrwri-+IqH@*G5 zn{gWr=Y(-$0G$n~Z1af}q_jl8QrbL_cIFW`;+)?sjVd1yI9Z>O-V&e&w`8G_@@F_W-k`C?3_1iA(taE*|6# zWQ2yV)|#$%V^Wr4h>ekC#i%W5hkIKAQGv#wfJ)9i%D|P^ET1Y20!pTnG8*mSe2)U5e3Mw#M_b^mD2xS7;wDN(f% zJKL<;Ywj|YuUWTl-VAZQtJKQsL9!aiIH;Bb?ZA-!x>e0$oQNV&%S14)8_}GQWDn0e zrIFy=m#xV^1d6MfkNHJi|ev^{M?tf@Ov1u-HZ+c za}RWnwx+f9msEQpuCpg}QhABJePOt59Cj@1st}<2UBb&>Qnx-@R0nGn>8jkABilwZ zy5QlK0}7J_7jxi$G-xKwG)AXml*QEgTpc!NuSk^7onEMshqmz;daT#3xphp`;R_eI z7K6H^QTFhvV@80I#MiO$mW?x|FGwOjZ6thk{l}q*+0wiS^V1@S<^>| zxnnBrX4fVHZg!UJxVlSSeZsTCycqt@Zt0d1OwX73q<7F|6_WVb&;R*b%G^;|jO!kX z{;19GMhZbN(PVg>#WL)HWbr1XS3uKhuxVXQOMY6;GLoP~pI+SbWCrMai zRU@AOp3zm}97db_Z?^&ct8RY?*&1MlgX(BuJWBvN?+MHpKNo4rQ-O6qx1uHe!@AL- zCh_kzDG$yn71_fhLZ8cwsd>L*FH#(xW*PE^o9S~b6xh_SQXavETY5lAk>d+^>*R&$ zWWia0Zt8`6+Lyq8a2xwZ!W1zP8MoyD*NO_ihimU~HwBKT$CqwIz_(?IeeSN(8^8%o zDf)`28vZDAdlnkxR!#A3!ImqL%l+)spRog1TA8l7rzYwGdD$Z@{2$ka zBG7LEwT?650ecwzkGdG;@NGeMr-nHU#9Klf2~_pOy*Wtl;4G%~aQ5}2@J>+Tbc5oh}b z;4NkNP_Y5jKmk^<3;o(76o-=z_z>~Q8*gWl>an5s|0tO>#So&zR|djd5`9ZCQg>)5 z41Ak5&GURq6Pliyml)@s`wL=zo_X`(n^j5>V|IVL7F(;ajEYA(Oc0u2gsQXcZHDPu z0P4bQTGP8-FrTS*M^62>q!ObPV5@rSFG;t8ivXg*3~OP+^D+&Aj>w#WG`9_3%9~cJB?;Jxe-2x*!{xOA9?2GO>*c$?QL!zARF8Z9$3f=ARYl zrIJxg^vmx;rZs_Xh&qs@k;nAV3*>pyWRs#sI8c+ymZL%UG)~z=#2g9-Z4whgMG=qA z-dOnP0KZ+^N%^840>(Mj@Aea{YpjWPs-y>_ 
z4cmX0BmU_?&v2(7Oq9nfrw0%3c=JTBS!o>PzZv6RPghAXgs#f(EYXys#+B7D4S~w* zZ%1)$13p3Wptc3&GVJ}g{={46k)m3;%T)@4tOslf z3=Xo#l|%wesb{M5GR&~2IJ&wUb1KE4CUZ~N3%aL{s?xAl>i0YZqk>6a*LcWsp)?Qm z>+Qtdsl0WRWm7~O=((?mp_q%J4uC<~5R#Zqs;k6w;18OXwh5>U>K_^v7hchX$;2pM z>~R=)Z+ycCr{6k8*)!$Cr}?ZHU5c5tFL5A40i-~0!I`*^7t7f(3Z%Z_E$S*bJ1Bm5 zFDPi>gSWFJUH?RnV~<;CZ|{bxJKn+%*au@44p12e;L)=bjgh_MpMA*7ZX`V+Xdo_9 z!@QuQ`0>@2$57}UKRJG){f+Lz=;BCL9~tXVi~FA_=z$mTdrVRZJi>bfFp;-9K&Fmd znVj=aKAo3njHpEJm@88eTKoVy`wI^l@ybo|Kn9ybtn$l<5&YhLiN4`5*CJolF>WS2 zzxjYNYk`@MYPRSD_h$s%mFln6|(#Ea_1pkl|}^NOkm06JUujVXUN6&Iqg zuZiPA|A=Z)Hm#tPHRL4}Mx3@Sm=;@G4q1QFWGb7LDa%3!{CZuQbI^skS!R!tn=b{Z zoa`?ZrqhobX338(zq%4Ws(L{n)?>n{^&fFdd_)9NH(u8XZYglB*u~&-5Gf!ck1KZgBjPYX}9^d&{-M6G3NNQ zr%=e&re+TRxFWb%$~458K08oO4ElsIJC1)GGvCjPIab!1qwUNYE6Dg0hubNDt@Cr< zXb?@>56Yu*GL;3UnP1$69UECc3ZLo*next9(Ag-gnZ5F4vuwR&GrPYF2s8mym^ z5z1Y3OrXg@4=k`J*}PgYE6N3oUMs&eA95;tK%?SYAF?BL(E30odWm}P#@pCy`7{RM zKglIC;S84@dABhfaA0d6C2Aj|DU~Fbn-)w$pa-?`DTlv43ifoa674{%3Nx2@H-d9% zvTJeIABKhSfGM)x@{~6}r(|+pP0srK;^WV&MF%$-I!fX{aZHs}S34C_IaN+zgZnVS zRPy=b1Ct`pFGuH878Tcv{YCF!fKvmCgduou*9*^m{J{sB; zLW}?xcP@N;&_PU8TbT_#wOP2fs2q^~%&{Ht~6BsjrP zUS8gTfq?zlmK-qa$!wV37laOusazOc=QcmBdBw zFD2I`n1inr3%;K^F={zVabatGZqS3Dc8LEmDs&t?D#U~Yl8C9W7GHImEO=v*EIZ#Z zB}RpB(}`>(g1#Ypgv8v_7iTn6CSHxzT+TLQXw1b;>*-W*pH^As3%B0gQ5ETDbV}Ag zh1E$1MDaz3Z|AO(|Lmn%yM9K*<%NfF_S1V2{g)Xd>46!EWK>)ofv%T`s5P1#4koMFLnGnlXRl~aSxRd54j%Y?Gqq~8WLxSe03SDVC7!)vzAY7< z$GiLEUErF8T8NQnF`;%IA%<*T;Z~5%J#`wC8N_X2_PrBFQwq=ir(gN&v4t_-pH17z z7Ib2zl-Uqp=ebK(LT%Ig&_n(9o3ba(#atpI7ukXSJGPtZ^Ce{vsk%E!)k7w?X8kr{ z6H*N$-xd~Cb5aDt$RBfffUXdR|5mgw{&fOyQpA#qnzvYhkL#nRm0|G8ZD0HC&ESOK zA~X?$kmH><9%rK9+|-VpzdIkG2Tt0WDSIYWf*P-d7wF7+*p}lucprFJ)0b?kt~%55x^&mgs$IEzj}KT0v#~ad z)vK=sWGgOpdnbtY!`H~?6M7UuENi{@8fUO4ks^j72i<#vBB z59&Mjb7KT%rftZFMvOgC)~?80B)N|3Q4EV$+YYX%nlf=$de!P=7huN29ruwlA%T^2 zS$WmCqPP4cs%=?&lCWX3erYbQe5aFA!1eG*p#=&Brda4>MIbu7PH#D}_5iGv6=9Ch zFMAC5wK)1A$DZ@;6{8|QPA^WWu@G?lgv3N=XHq`}`U>liWBVFy4~LSI2b~);c6~xx 
zpb&gfro49VlbHGWJ}BljfH{aA&oC~Mwj1CX^kQ%mavEMgH7#pgyzMLnwfoUmBvWc` zDDolkJYQBs+IVm|b^v?dC%iYjH-e1mS1c;woiZf`hlT=t)7TpE;oB*aathRc*iI-k zKtjmyupnPCj(TB)E~_S4O+Lu5@?GeZei?uD8;|1g{(=XJ!C85qkq<(6*L1@l(9>c# zOV&QDdUISvJ@JF`2}>+&jc1oZP6MYr6)h7!8L6y~g)Si!*Umu*c4+CVp#{o(v3x}H zo}Ew|bT@_^shXen0YR76W>QQfg%n2Xiye{KxTXtu=lBdMq&fZid^NV=6US+Xu|vR; zg^PHc{cIImZca8q!_27nn zT$~UzuqF5jFa2=Pampot^Y^}~RFduRx%JNo({T4gN-l$AyZxO23u01d%+FYomHFEy z!@=HJ_Y-Y$`g5fZ-DW(#@`YL^sm?ATxlO~a4^3b`tD8|7i*7AEvOVf;I1k7$pcO?^ zvWC#tg!={Lj!8$0squ@$Hn=Mk8Gq=gtIt_2Qoenf{p-W^aEXFjw?;KD%`hUVaJ}b9 zV3-IvG&if#oilr*-Ac}-WJ9vigz#_Va01Ey1fRL>T)UWg`_0m{ZfBjY;c3c8T~+SF zdfueMz879{-RszQCIy<#bJn4xGGM>2@LAdev9lE#um)ExB#>;y{7C@h{1+lY$FYxW zvdx66v#hoHekcyoiTzw7c-yIXJI;LmUIT)}RhPJPEjIe4prg;nuh%6^a}Ya~tcsN! zGvt9BkBGW?tUqEt&4iO&emA{}!2&|mN9O;BB_!ZDx`?Gd_+@QP1Yqz*A25vD0!@l$ z*H14>9$m#i!W5Zf56KOLV+Rjg#Gq%+>?e}RfA0*y6=^PZcYS+mR*h*-pY!x&ZRAv} zw#9`8c^^aZ%<`NoQZm~^-(fjmtsaS@Y;0xq-5&4)Bmf*ehb`kDl#lK;Y}qaQvavo% z`sJ_w;|75Jmj{bJ9}b4FXqit?vaz(U$42i3So9L{O>s$9rnnUhz2s%vn~A({>Tw&2 zIY}?orplfPue(Ue!^V&YF30@pZ4w?7{(AgM`vC_EL>T3-Frp0eE!_u$_OR=q6n0w3 zx#VH&rK_%~IpMs!@TK4pp9$8;5hSSf`xVXgc0&G>crj=_kQx=m{`)}Y5~#-=N{n|U zKZ{o^_RDddaZWvLOX53h=Qt zBu4J{=LW}kF?p&Ps`OaWm#C}ONImGzl%~D?f+^WriN|@}bcpi7p4HD9$M4@LS zSHlG#JMhfsw(y20B*u4Lwc!@_bco}~giBwxU7a=~ z5gZA0!dB7Av2*}<{$-gN;!T3bJa)1IxUPBM(STGFmL9EGQi z!l1Mu{za|Oqg?&q`yx2BLv#MxwQGCG6+%n#|Ht`oLV*A7jA?p?kDjE;=Gp0!UVH`+ zaM@*=eF6mmP!2eQ2P5)>T7-MWhtj@;Qjy_~PGh3E1PdSc@V`!oSj*;)>~+4mI@?_L^zq`ORfw z%=K-DXbd&Fqq}r_BZQo{by5OcLJ8gpLMCK^)fYX?E4+d7?j0XcC-0EV$nEcA`~SHP zIu0E^hk{4!6zQ`<HW&SSXU|qrivGh%=vZ7Z_7lgri=>Zc8(tL7 z7RxUUik2d&H}A$3lurjRCNORq-d*9FFF1VLyhDG;F8Q*n4w0ZyJUHOuHtpQOa{qnr zBh^g3PQjcYS3Wi16P_v(`G^{WQ{mvpQb7+^!@v+v02qgUx_%q>e|el=DKt`CwNRbwN%lSQLGLQX218g?9E7ytIp2V3u*o?jvSV@NSZYs z+^{ZK{~hp!HonQ+I|k3WI4_acd^e)9t>)|ETrZQ*-{$mpf8x4n8W6Wh+6WFT)-`S$UJo+lxnf3d zqrm)%V4GmQfTEkIFNJgQ)^Lb&&6_G08ZGQrBx}RSYPUcbe`W@YAU!?CgK8K219FIz z4NiHysNb_pZ`ho`?&0@4#}m*4DGyvcAv6i(+w>P$07Cr3xPR~FHAr`~b%M%U<$OKc 
z09T!v5;J}6dvN9#Gt}9`f-m5~bTt$h?&4m^D&a(&T988R-$VC!qnJo8fo#xuXJ3eo z3I><&DlS%CW)>od;W1r2X(vTV(5>uZ*&9}`*V*E`gdo8F~etH7lj z5zXMku(^&EDMX5lHXTNCiGGhmKBhLWEp+T~`vdYYk_ow!NC&)GPP`;R3?SP^h69`!OAND7(q$(z{` z?pxe6_~JCu;;{K@HOQDJ(1P;TSAx?q%*cBs)nkWrP1V&Tch~rIPi`k}ekI8fw47P9 zd{bHd0{=i0(y<6N2?eOs{H(f`6J>Hcnfr67%%Ps|UHiXyuKkf_TvmoP-`LP%S?LLyxW_uLTUZG`Q($vg ze^0|CB*d`0iz z*(3!dcRJthNzI~J&&AZ#M1>Xp!GvU#3#^|AxTN#>L1F(N zv;MR6956`tdU!HZ0|d_LUgvC9VZtF~Ct5RDdupMEqjSAEY!nW)?$YgCP? zpNvx(f_%cqZW(${&NZrZJX;`|X3b%}FZDGnLI$MAz60`Li>9;WwUDrgiu79$^wTh8 z0@1{;?BPF;ii*Q}Cb+{gZd|gKcWkjhFK4SYS&vQ5X*c)IqmJ7X*{w~m{`-jXZPbK z2JuI0N&PoZ-$1zW|qL9VpYmwL9|T(bS^LjZ#EwZvac8LNa~KO{JSGXtIxvJXN{(*kzNPTcLg_a?%mYvYk?T zD4q>K?#sfwn;6&eJ0>_51ZMpdl^8yS!vS7O;@TnNs^V};>JrL8*SzyC7Us5^#H&>g{bP2{=@3a0$S}H3HJLimG+IO7;bEVVuLE3prY7jF(jRs` zwe=?bxS6&_Vrkq%bler7BZc>mfDn2YLj@ zJ8d)9NE~O;t+aUOALH-dmEOsOw=abTQAdCxJp7Jqj428D?yn=W(md!vcBH2c3F z{hx1P?rnJGQ@!&jTJjOZm|N&}zd5=h?hr@dFRJl`4XWww+UHQ&0TRDu-DyN+?4 z093pZueWgQSPS4_G|*Z{FtnDT9G18gLUe+E`F;F5j?)l)jA@3y!{*1hOD-JmF~>&U zmZ9&%!^0;d1!|Yp2Hoeqxok|?vSQE_s>`|Gcs)0Sl1C@gp9X> z8;dJ04xJwHT1{2&?XM0Q^UoltEt~cP9}k?GxHu-Tx`&q<|WPPzeo&na60W9Oz zoDCB&NM%Uv$)g+(hU7{Fe3kdLW1|A>+~==-{m*aGK{*&^xC`tBA2Y?>suCYBBd!o2 zV^_82ac0X#<|~>IuC2ueGpw|BskNz4ss{C|Aqx>N>&Xwv+z1J;5LQ3Cu{!NvdvYGT zDv^ID^G>=vbcG3gg|zQfHEs9@QdXVPwoPuuVGHHlCOG>cq94`IXp0V#zf>U%d!?TiKzmy0U3e}ZBn_; z0)|b=DOlG|@{117hl6qP|0zLnxS^nqm5{PP>6@(jB#aH2 z!Qkx9ngd>%g+j?C_pct>qm<^BbKgaflJ><%0S#z!e z`~l1C;i@(P0>5v`U<*aQ?%f#tFDLNVfjS8irR@8(!4l}=&SPhf1<)=gNc;WyGvVxU zu>aIM{{L6`6`EMxk^`;Op2>HtKZ?8;A5kU#GPoJ&2b;gHf9kFzrz1;R{>8s)AAUY} z=C-URV|fjD#(;6Y$;#INs`^C*!TRyQmp96_kuO6*)w}HuK2lA4_2sv3Z_|c|B|ka2 z1|lHv6qmB*=lM>&YDJm{yMzo+1xJ>rw>c}Q<-9%KAB&HM zNP~9#eEM*Ps)2U%3*G>z-1RGfk4yR55%lsN(N$NHTeoU2T0VGavhHL4@!K4^knT@) z=AmWzAB?4A;xb4S@}qZgA3KjcG8NY@^VX^NG^K$fZ5D`sC{GmJ=@kg8UtHV1-hw=7 zmae;Se^MtYblzm(hRvHxb!Xqz1SiY#4whKc>~ep%ttTEtsYGN60k=;RomkEvb(uLj zN2YF}-vit6P8=*lqpB>DMtFt;Z%a#CTTqypVnV1F96V{wb>6$fr^I4V3M)s~;eQF$ 
zO>+btwyXqFzPY^BD)N2QXov^+bJ93BWF(LeikS2N3>*7o=w;iRvf`S4GHOH1csA_J z@vwDC|Gleb&Rur{Ev!zRTaopP%uwa_0UVN?^n=ebT|@M%Uw8HO83}-&Jo#}2u^B=K zwlXs1Fb$7fsf1zNv>Y2UI0|g%CcO_mHOB(LHiV`>x(NM_2VppAWV_!SEu27YFmf{e zD@)Wu#b}+|tY@jgbrsG(Fd7=h73mCorSMl-h-5fF9w=gap=G9B;aR7`-z=O$j7Z1+ z?{Pyzu;FuDo2_MCTfDWJWx&J*A6~Biu4NgaYniHXuprY^pl{r@yb+HakLw#qa|<3G zT@Y1iZ0N^(lsQxHF~_NFSi4{Klp898D*sKa+yW0~75P3DPN=6)G{!A@3MHikJ#r9= zH-f^QM)IF?GdxY%miCu$mDc=itIXAv@hla8x_iG!(FtL#L3R8CKO0OAdJv+OfY-S3 z5yZ@|9C(ZVt}J&zzRGs5^U;6&{ojYv--~aj2=x2TStM$vzQdCx9)Ld9cGlgq$5HSb zk@wO>)g))Ks?`<;I-OF@wH7A@l-O@%9JgBRqo2>cv+Gf70?Q;S0MJ2&4~@eDwO4Mu zKY^hi0)QfNk!WZbhiZ+3Na-vCcUjCvw~hgIVU#{QY^{eGg8c+_K*$tFs3wN~eznto4FVphmZ zk%W0`9z<^Qm*~Z->xzr0_*RcQakbVMK?LxxVro6X3ket6a#D*+9=jiS+3a6Hg`Y1dHGk-)n+F~oh2s0(rdHk#n-t&i&)bxg5AL^z(|Z|gwYSHm5G){q1gqw^6bs^2JTJd76-d5 z0r13Rd04f+B_^^>I@?%rjs8G10~0*Ly{7p=Kd@0eZ-PHvc?#Poi*>I+Pgk$Ki4aCr zaT0MK%Q54%ygCOpX3(ybE&9X!L0O(0QND#3YHOC~!nX8VWd|+~7voH^<1IjX3~{6m z|FSVYU}Nq@<*Ko)70JBng7#6@3{gidI^@@3>|O=7K@dh ze1uqp?E`wBG6q}^KqJ!8oH&sIz89dZx?)i?;tc%W)LEWk$TKv=C~lS|4e)!83q|wU zC?6P2r)(~!<;i*gNcXII#2>RI-X;5-CHkKnc1S~b@4x|xqpEn zJyM*PHN>X#23wY-MroX26N(5>{!I*6T|D8~zZ{;wtj?E4zldm4IeI^d=!h%&Sb3r( z(_HbTU3;|LwmHXRc21kV*Okw0*wwgn)eL!ok$q1!^H93&E-LbKMSG9<)m^ZRH!$3` zCT|1{N5+fB5do2nv*d`F2^Sa+#s#wDTB5D{Bbb3ai$}-+?~sHvjAI+PJ=dp94#sY< z;#^)KA$Q%kicS@yCoG|y0wJ7}<)g`=oPDN6?nMVSMF;I55M8{#BH9M(gtJUUS9QTH zJqk!hXK+uC}_K6VGULv zuI;&+ndMiTjpV2LihoPkI?b8D^{xFAUqMa6L?;}ZY|9=P{E&|_U*8;Yyw(#MQGohM zdq0q4g_uv!^K%xaYIqG_{r|ZC_xpgph~4+(_@O~%!VP9*UeIe(T$df&*F=?W;>tQb zqTEqG%5A()cZ0m;>F1x`zFE;s4%mBHbh`^8JO@(Si*`0%IHQKhqo06wtBkx|i|XEH}u#hF~tmXcD&hwLz-lwfdA+ zWJkPXy>pkwebX2EW(hj&DhCtyR+$F(f1-Jd&O$qCCv&z=T~FV?vwz75+{pFT`&Ve15mLQa{Q#&84NEfME`6M!SU9$dN6GTt`2_QmN0w2ZEE(( zsph&a;DP&F%lYYZk9F}34Gql$gIbPNzN~KMPvdMmD<3`KB`7`>he8Ctw>xf5F)_KP z2lhP@$g=3$I|R~mDgE)&0H|2LvLO1m^gtf6Hl2G9?@`TT+a8}S-$_Wi8K`<7K-GgJ z`4j7&0FHMJBHae-FSz{=R86XnvGkd@hxuI!dx!ze^i`#vPo2r?@l`3q>oG;~+wsh| 
zmWYqO&zsPov*%}OP?t2nY0l!A(BJ71+(~*S_aRRt@y1bAo-i;rsTjagaQRG1%S6j_ z^T9P~dog>b@c>#|H=(@GtH5uPiKn2&6kNC0+}Cw&0wW zHF|DJJC+r5z5&B&&|#_=Hqqhc=4R(@J8R%vk|Re}K0R)Q;RNk)r7fT5Dz7aQ6raz7 zpM38tTnMJdR_VSClI}HtjK1LBaL;eDz<)n&xhQO1Dxodm5mTTG_VztZuU_y|P5n^O z;b8f@A`_%Z6+NQ=G`HWs9SG?Tv`$iy+ECjqlF5gS8AbLSN=yV#6y3?KHvukhJ&#P9 z_B;Fu(UTu)h3WDPo7o|}&qJPQ)JtuR+!W@Y-ORR3a|-Q9W}!=aKPfh7PEJW8oFEdBJE~F22m98sUu%8BKB?P~gLWsz1)~$xOR{h-!5UvmYtvm5-6`<<&2) z#|~*;W^J8{=#-?m&g|7&E6P$@EF{$%R*Wi~He8lEIRhr*6Q+Z*#|NYaoA=_o()%X; zw2GfGBU?%LKE{BKO%kG$=Z`Q4P;JMAZo&PJB5#=hvfarc2ld?#AU$ebx~XLSf#g2& zPm^1q)=N8XGn6Z0uFuA?sw|xfbOiVv#SJ=;Kn_EY>Oz&J#0PiFxzNKw;acg~OhpK$i}H z5VHcrj(7$)gzG|!ZDvlX=zyVxLwi`WQ5#&=0%RnI*JDRddK$JoGvPG=0mi@4{~4pa z)*DmW7a+<&lg_zRKI)hr#-t?eEvtrFU{h`hG%VBff-p2iGcH|z=)oCwV z>rzqBBm-<3LGYG`}D9pz}2a=^1gozHz3djD2GE1rYe>yp-B!C{|%Q_RJDGaO< zrQzsBY1Q#nY4tQ(W(4V}QpsrJUW^HVohM&!Cn-=C!pZ&i!(iVtw(D%|4+37HkP!1d z^NgPG2m3`mLYe$HA@>oih=oH2l4M_6BtJj5lLWH&Rrct`-G~Fe0u&hYw7xt{O9KGg zM-=9tdz8n4*XRk@Q#(JMHXBYiEelkOPN=hE=sw*QX0}a|ao4{U4%S4>0Ng$n8%b9S zE;dKHfFIXBJL^pq(7omN?UJKyD&4+_)2Kn4ZTY0Id(B6ss>9L~RCb<_`Xz%7mrq;5 zRyWg;J3WrKTHERBFJf}}a6LOjj3YaycBI24=`W@in8@}8f}aZD%KM*2v<}L*{>UiN z5Qor?ZKh0QMfu)TeUZ>v9>_XL-7o@-Kk{`G9Q!mu%;|XVedQ}-G z!@7F!c|@01@x;V#u#8DTfaS;)tDA+dNJwUt2`UG+ zQ(Dys>7EZLE9G+Ag=30sPh_EdCW!FYnZ_D-4jnv4JMycib3EV0oU}8IotXd*VrJwC zcXLWRfXE5Qv=Ki}&eMAg*r z8HR8UUXIfqB)M$Z+yMm-A%q}rv&8WFx!}MRkjpgU4xfitU2J-sy2c;0T$ex|?!A}# zBWn;X!C}o=^X&>W8+lV-kD+NFIdlNADKXMB&D6Pq>(oVAG(3^TXtJIQPg6tdZoX?? 
zzacv)2WQ>uTzOR0TIUS%vpv)Gydm!yF3sOqvt;!C27u@+xZCdR1(|Dv9Ubl9Ip)4LoJVZ_kqJZzM*^S_x-6C)u@UBrDiz=&m%b z!Il9=2d%px-=hg02_$u@x?k*LSRZAa7>YrVp@Bgb=t2mJ*nL`CX+o~5>3nueWZA>^ zgFNP=g&K}1Mmg@XD!B7Z8RQ6`YJfX=Ri$jkq#y!jL$86sNP)}4ytciw$@P%GqmnA_ri zvj9-xi5n+{T@Nh|#@&ymbHXSE-Y*3Pw|2yYEw1L*2^z1rYC{> z(v8Fp1`uE2mQPU!Z{`Z=15{s+8S(yd-a?(sD9}Z7-Ff!t=*Idu8Oa}|2t6WPAOrPw zin4Kc#jC?8dOK_FNn{WHQRd77@q%qkLE_Z-h3d3N>-a-QOda7NlTWkTMfXvqxxNQ( zL!AMBUZQPAnJ>y-;m}v#^aokCf3$dftvJkbqpWI_`0PFyGo+6=JXU5Tssvd^*89@C zVmpFc;Est)>Q!w`@$_Y9Fp?;(47+p#0XNPO1MuNNf=Bh3DV@1iUO~ar@U7xc`%Z^} z@P*w&u})CAr0UnaEO=usOb%%tdq1-1Jk!UVJ=k^%!1CEC#W|x>8bjk6k|EE6)b9w8 zs_*nE-IV+%$^+@V5?5MsK$ZDe@!CarvNuDZt%At8{XvR+O|*o7bBeHgtxGTQrfjyd z{l(ew9?M}Y8@xW3fq`KpPbG&=qr}L~wm1@fYy9IQp`$6g5xMVMd&3pZdrL{zMJL&v z!_uQqfWA6;?2)_fmxqfQ5Rs~)y3N`yncZ~3@2l=n!^T~;t8QTEvgl6u-QC3~yuFZE z0En8n5GkfEyFzdFPJATh@q_#Q&oI)X+60X}LukCQ0AABvAN;7|!RrV=Ee!HxI#(+Wz-__&u zVQa`t0hQZ*&PY9?%QRAM$Vni`TSa=>O!hIAtnV)+Diza-dNUP(qWlUGqTExfy{!$t z+s)mlZIU$&VwQtGDO|2r${eN6Ac8LfG^gsUpjo7BCv3WkGmJK z$*=EbJ%qrvX;9lbR;W?7_;Y8T6yND+DIS*2Xid2<){fDT_d~_#(Ff{!bAe+9WOlMVNHd3YS!$H&Ka+09b@M^BCb zv|cJa0e-neHm^j1nb7ZBJgO7}Up@i1ug^q%nj;1+2u~jVJO5R@g#~+YzCRH#ecU-ckX@n-S+mDzmuy9rnXyEvB$Tx*Lm|6K_GNGtvX1QQkZtT^8)Nr-r@Qt2-FyGi zEwh}@Iq&j3&-)zDjn&A0tE+P}fpRm?i)^+l^w2<1GqFG>YKmAi+|NW2BA4YT5hcR{ z6aJR#?}M$_XlAC{JMfm$_8$89H4`u$Yp8lRon+4GV4Of8$9kdYr}vBE)zOKzw6+d= zK1=y`M8O4%qK7GFio$*fVQ+{2Qjbwc(t06M9y&G(H(a^wZcE;uKEeKVH8B4VA^UTC z^Qa|@fU9>xWlp=$AFoPW;?_Cti%+Pk3Y%;Pt4ysf|GPK4%`m#$q4YlVZHD0In!E;< zV~u@vHKL6pr;OkF8R&iVy;Ty4v{&7K`5@(VR%i3UEDte$1w~ojrVhqWQu}l1YHrH( zX#2JOfSWM9Ode2X(ejwmEaQ{O+zjs(FsFH4b=3O zrqt*LJd0;T)j)k+NNnw>46@0HUGmtvy;~ADcM}R6SEw#|LMAjPAFt1xW%7x=m$Mgo zc30o)%Qtc={W;cMSQ-1n4I!xG!EP(kpejK;=XMfg+mriQteKy9LCiLmls)jj587LK zvG{IOs1RP@6n#x69`mfn;bLM8r37>uwY0Rt*GyEw(h+)m>}M+-mo z!6Bu}O3#5YgR^8=%YkFW>4ja065R#Qy6{sXp7r4oLvS}rc&OrtV!jPW0aYUg5I@b{ zW%lzrtLX0)j$hFw=H>(0?=9uCphmK3TkSI_6*bOmW%3CAl7G7{d?G8`ig*!kw@P|_ 
z{8H=Zy-K!V+IVu|V2q{)2}~5TonLl7y;E$&MEy<9y99|gUE}@S_7nV@Ic8;EKzdL?HQC|ymJ0p8FCxSRe%-@Oc+hzCtNokER+ctK(BvX{7 zX!=q_U{mRVVA6fhU&K@2@x98^GdajVu6;qWXX6Z)Dq-{OE=;oroqJ%rIsAcBl&6;| zdu`#oKP%S(dYOwVcCl-Dd&X*C%lM}%lt`FoGgkCIPlP9ld1`xRKLNOL9F+6*H z1~LL1#lkdn`m`&Gl{{nGS7TR8w-6x{3-T15P<@A&v1h8*34pa&9-98#+G_B?*7l@& z!Hg2@hP@MQ5c(}e6H%{qXPP@ZP3GQO;Y>Gk?We%Bz(F*zRC3&$>uEOFA3dAMZ9xAV zrl5o02$5ZPM&kp1E`s!OXJgArkIJPm%osmuVS$ z4G|s;U9`TsZz2?1(kFTYLReMaapKak>LI&6$QPq;IF?kGL!NmcRATuDyAl+ES3VsZ z)lPna4gTUfT7NK%f41vvsXr&es3-Q#o9C#WVFE;>v5gH{zr&s*vitq8d0UXy15hCa z9W!kgJRV=>og9tGcd#?>yP_#t-O#-Qy)-WOgHCvkVK&J2EdO1JABx`pQFH#$kc}Z2 zMmyz>>UGx(?s91Ecyg;a^4J&quVXJB`n|=J68_QxjEL+^^D)onvq*PHsfd|=!ojK~ zAS3Ql(%o@VPs00>mzLEU_9mS|j$fF@zc~bCrVCrL>jf#-D_ItH`g<9l07g6s(^9W) z$<}5QJC8NV{Ma)P35Q_H$~Ey@I79&nFsTZa$c}xY_lZ2)ocXuL(gWLcjy_;^fCN4P z+5Bx8HGBcKLqw_HvQcH|66t7!6&muNAXPisai%s*l&@YG9sVqADZPo4a|v`QdKpBm z9)F%7q4`3jJKt;}v%67m$Wlg$RS2#53js!046o2sdo?wBdk9!p!|oQ@3V9V*fsr%Y z84#eBu$N}?nXu9&wJLPdyC*JZ;Ha2IK~mbfb&^ZzUx7hc*l8-pN#oht@dJgv`8^Yt zLhnWhc1!Y=ydBAit6vp4V~AIfdskSkcWM7+jd|_>wKMb~*jI}eZLA@IB z&1~xoTu>+B*$Wnlph8979sHp}-4Pl1Lh5ZY4TRd*eXHv+pja`B_xO7_&z#nCp4I&- zVRdP$D|;0RD}Rj?Irdu5>(FdfLbpn~PXy0wio?#g?#B9kK)(ZM>4%wi1jOJUn8w}_ zB`@jV=y-ZnAxEo0lYJrml~u?B6dLcuEQO!E*Ai@_~X&jn{0j>#dprtG5`Fv|5yRK7)I;+yMKz^1e&0+ygc7lXk~+$qNxYHaWhh z!{{Rd2^&3!o_=2i|2vz2W|$}ofx;`$5-1xy6m>`GMcofw$P<@MHE6oJudUt0k^3$e zj`Hk)>>k_*A|9HizC}f0EQ`;zl^qv$%e&07u71mLeXIoJH67og9pGM8I>j~AW&sIU{tKc{m`Gs}5 zbKm2lUI|&d1-~SJ81(ySWhI8JXa6oH)(+A9N>EUcv5%Q{XUXG>VXb!4vS3v<;aK<%2q{r#?jpO%y_K0XjnZ{S`9L8rx#gi#tC2#j4hz?t~}O9aq{D#NVr z+SI4?q~XHT2^WLs2L0S;P};VZLim9Qkgegqb|joQSHz5|TOdTgmzMr*@~wpOH3Ozw zb|Ngl+E+=5h`w7J0z6L9$!rS2V4Cs=1W-74-b?Es!CSeXU41Rz8BVFqP>LOTN670} zw7i~*&gwyDXuT|<{J*5qSJ+JMot!BoA9b*{uF)AMPx|n>ogI%Rqm>-xZt)2dDAJF4 zOyErM~p8yqcGsfOU`GbZ0V@IG4SZ&CT!a;{F=DfN%KDJ!wM1GiJpwBmBDnk?; z%djjfjf@n!U0q$xeq~|M4{cPRT@e#DYkrLQq2H8^$U*6KYPL78znXi`km}mY*Y9Uy z#beZ&?PIR5j|;^MO($kVLk_s45+%CW^@(qw(pN!r9*JHvtL-mxPAl3r{sL=1i1V-` 
z7W;V^t!JymhAmI66_i_zD7>a7c&u$gt@v#XgT!(oQ0W?27`=^{}mj7Kd8^b$6TqJGS=G{bHv~x1FE4QYX7|Xd9hS{pC5Ma!PyhMdun>`{C2aH zZ@gjemhQo0j8C5_6DY0@qV@c;yNM5Jed?h1u-6#V-OF^Vrg$fO zlbkxePWT4VWz%w_oHJpYm4U|OV_IM9)O$4#Q(j%RZ5CJm#ywbb`|fA$I1;V+Ip%XI z14d3!l|Y(LqHfZsi+^M;>QjZh%{Rv}+q}>7y_$2F6p88#ca;YMOkFrERG9{T4G68& zB$Jc$_Fyl2qXi^~_iV_SC6nQ5Tcq_gUoS^XRTg;L2Mi4jMd&wDpr$Juz0ErUMaCtK z6ie*_!LSSZ8xV^Abh_a5uZ)FxkXyB7%oMnQZttOSx!XT|;inPG`aZaqM*54N_AREz zK}nFFEX0iABlkV1KB93;A!QOea}2a}-xf4c^ZNt_afdmNA#l;0I!)OX7Um_Ia`u&S zYqlWIrm7D53w^Jwt0Y!$S3o0%9DPfTjGldU;~VRG# z7kXs3&p`IH`a!4=!0+fvv>7Yo?v3{EIPAU8@}VGQp|ZCJJ1rzRr`=)J zLy79TJHJ`Si2M^_8?XFj~ACDpX^#ZSz_u+X)wH5Aw^A~4e*#%k75QiP`k$7-_Z zZ9m+%r8<(4Y?W9tH#b+$NC14tNI)S9Cqyl%Ai)@Q{Ar_Ze<89!;;4YYXcwbGRHzmE zJd3Xib*sZfkBGaPeJ8*%gql7kbORUl(TYve0)*$UHaimS>uRz&rl-_QD{#?Hct3y4 z5`kkgq2#NFH8U2f{e{dpSL%e}!Am*TC_<>ZyOagx>+9osh2W%qi3oQu zCQPFExPpWQ$p#*GR-k7!NuJW9hHlj&MuBP7ay9YVJ7T&HMUkRCH7mr z-u#&_g2n>h;(;9LeNra%99#G6 zg#4FH>uu-pUZXxZm2*_k+MAeN;dt05?*laIRImWOC? zKd;5~n7IT6 zTD0gmQpR|o7~pA`mKkm9yrr;a>#M*`ud!quCTbT1jL3lv@KDtujDn2ymXaftH*$*T6oOMw) z|3>mbUfiNwmhNp+lfDWgIwC3dW|Li^j?V0WQXPEy-2g-h*ZO#_^?X!|5 z^2-$J2~y9HKxxbB;ez_pC;LqEf~7F;=$}RPOUklSqcd)=qHi!LL?^mJ=8?U zWprvagK-ICcKxLUk#UP+7cD&8^DZ)hlwn&^Z4K#|x4p91%cvkj*1#a&L*5vU)*?a$ zk)x^6$Ufe9G(W}7D33iwOl+c180$!Sfyw+BvDBSNz)U3#p_CiI|?D~2>Jw1Ydtnd>(qUg6!Roms9Mqj4$Ec7}N z)#h_VY#YFGb+D>-0Fnf?Abc|ecM876ggaOkbkODttgOxMlvdaTu5j^p}Rl&F@+YyJA7h9~{u1NDRuL_|p zx$9Tpq7sB|B7>KP->73F5SD?#OLWl04?V9`Sfzpe!Eb!RMYEr$_;RAYUT!abIIjPY zq}5)$wYk>t?!vQMgh+5te_?WOiPFy#cH9(CN*!`9WBuB#saf>9S+`bKr_*G_!9*TN zMM_XjXujp}Z2A^d7pC~{47B6%BlNt@XlSE)XZGjh7g*sMH~YfKcS8Jw0xT{E{jho- z)-~>re=n>-@s0iAmq1~h=-t-p;#C==yyjuP=+L7O<>(|^6tyCTF8$er;7o}6C{wp94Vu}reea9_RbgUz$E~wp%~4;c}sQ8{<ZbNL=$LeQ?7?-kp#fe#s@0=+8ZGW90-76Ay) zi`D#wIyeDSP(KwGj-fIgZTV-OL{f-R`WqT?jjnlq*t-Y6Uz#eUPO7#bK+!?|5qRQo z$wCoeEMJC0T$lZS#0W&8F-wNg6#QHNeM_l5rwJjY8BRZr}Hk!WL@0+)~ zRfw#MW{;{9S-BV)s83$ia&?|P{G{gafFd~Y%?^K~2Ex!1+fL!6Bgeo)YV4?Y$$Ol{ 
zXjC20&$W`&yI{zZvsnpy;4JP-gi#H67rH_3diSDd-B>3Ape&t3B+q;{m7RT#gJ-+n9J_5|E0_E-7->ay}aDr6PbjUC3# zi^Ha>KX`)NgmJ$FSKyD4@81H^A_`)`-c{-(b)+tW3h<_0*pg~3yQxkibSN@{r*TY} zrCfP>b@7J$VZ1Py1mIgM^v}7XlSqYmRAA2n@&J^xgHkfc#;?FPgT#Kum+M7J0_1QI#1B z^jap{5^SHlkPu}lMU!Gqi(`6{GBVcMF|r&-LJSDd4!#A=(FtT*+7oWR$+>qO`5;`L zJzeAfwHy06eyk_u1K3z9^ROzP)qMLz{yj>+#D=F{*y68J?w`b@97kk&BUC0*6gKEs zi|f_-cJ@Sb*S19dfm8jR&FEYFGinT$C8dhLwwKQEl5>LuB+tiDm~Yka<}KJBTG}|f z$9*Rob%!U*LFFjs;aACQ$vySa6TD5Q7_MBo;`%ZmASR^)rS_CPD?Q~I8BKIn*0>6k zCH|_33G18~?Is4Z8U{6p*Jvqk@jxP67f`YY1FOyltI~wSG~@y#ygq~$CDoL!-x2Vb z1)Z{kMr#NL+FmvwPplSGp!jIM1N$VTzVN3ZJKl@RstRcJL#Fb2bk0i@ROaUUsCl+s z`!;5}1Sd@ObOMp@=?^{?4m_OplJJKLQOUq>M5ZPSHIv|q)Az&j!d8h2YuyS`ii$^> z>1BvklM}I6p9Bz<&9I`h8j;1#s0xCM5k10WCR?#Dn(HA@+5mOW4}aHi<(n?#!(qjQ zO%b9SG%=^PZrbTvd`F^s77aMiey8UT30+0_rui_7sF(3Sq^Rx{V2OW7UJZuRmlSm^c<71yS2#CK?{^lYG)UA#7P-rB_2 zl(Ihk&dtfm8Fhe5z!b31@e5`)-+R6e0sQHPC!=YiiGlp)%l+^t>~W6nPc$&u9fd7o zu3ik@ebA?PCA=+KK(z!&zRgF3{xp{SRqz@+74j|~(82;fJqJ3Jk=2rZYnG17J*+zryHD5 z(w`_8mpIk=l?zeqEast?z+S#3AC)dtoh~kfx5MB>2fDr}i1yZC*_H>9@Dix-aSZE<`JnyFCV{)!-D=)d4RqIxQeD;TPvR-b{}oevpfd-?|P7PSj*Ul4|FluN#U%=;H95FNn<#?_36YAK9^TGI-Dp$Swl9wcVNEW?XpL zMuD=nNy=0SI1h2RO_yz4n2u&Z%wp2s9ItS0$f&-dk+ZRRuG!)_2APyOafuvX4 z=+4ZIpzasp z%DRN`aS#%YbwW~(C)@SpStDCT5BN`j0O?y+T?Aqh0`f&Xpy6pmV>~JV<^PJVUKleC zrVazCJ!OM-@r5g1g~)1y*+lyZ++p_f09#4v5w+U087$=`h$vCwj7(I5RZybBRR{w! 
zkR254)f8xjrvvW}HN9|iaryK_UijvA1>;887W{y#kOjvJJNzFBIh#BnDi&K~^htu` zuCvg$vM)|8K=5)x)G;f19i~>Likw) zbhs)3x~sQPho%B3!j1GzU*D=VnGUhZdhLOQ;5qpfsiCVuE*Wh_qYUjoJAY`^nKUZ&r8z2OnaRaI5N z*b~)tm=;j=rVMo0WkYM5LfvPhRkBP%RSEbeGs`x!op9uv!=|D8gxEq-d3Qfn;$raN zX1M;n82t&;#8yA?-;@eLANSU1yMn9yUlUgjXRAsO&rU3FPi*CZ)+o_j%)hg9>DkxE z*SSjDxSa439};p=X`v$AH*3u%lK`Jn{ETPy`%4SpNOw-O|BNHs8ESYu*1xAOoO>VA zdQddTW*xI*0DJjiK$|d`+x`Pmr1QN$yNUt!G*=g?blUA3|J{&$s7h=33^EyJkkarvMHIgZ4HIp2mt zMPRb-OUokEd{%z~_Q#RBIFACC{#Nhv${{J^lAQIoJR=4ezeA#%|9(Ia71oK_8W~dw6F1?)B1F&(Y6Rn=aH( zR^Cg{Vp^2;K8)1*_DjLInHO>B**7t(nICg4sVza zYT;VYXYOe+!@rjLEG}>Ja^=1L#4=2^&e<=I0`vbI6e6Wky25ORFEbC(0el4AssMOnQOP2DJLa z(dJfloix7ouGQu{J+*F*sM@;EM>R1HMAP$tyMJ%gYDrt)io1K^^mvI@m*zhBU8kTV zKTL2OXT*FjzXIE$$O0EcCY|O*)mAyzndtPOvFNWD`(V^aRcR0&?3UPBUl;7ojIp4qpUrx>~cuY zfj22Y+tm*qmW^Us@maRBbY3&=*FUYjmEIWSN{mi}uCB~A&NTHW0Wa4+hAJ+PfpTy2 z8qDcipgy<-wf{KA{u5pBpch6{ZS~j08Z`pKyh+C!INsx)W1(ftA+xwkY3T$)uvT!m zvedf(Y$c$QSg7wR6=WzcJWMIvru!b8$DbBs=UlSAfyBUCePm{OM8k)?)XSPcL?1`9>^C+$4e*RO>_L`N%cd9tcN(ziP7J2Cug;w;-m` zL$(4~x8TT2(z&kGd@-_c15tpbj78(3{b zNSJ~LYfkXE$iLS38YA*n(mDYfz%kuBZ|{TSST*0KlNls5(FHYg$Z7q2 z4@3=`qttLN^K{i1L3Ffc67-0wDs;6YGsdVs#73KI7wI@KuM#ZlZ$>iGzl~@gp!<4+dznHqNj@J=KB)$)ul+@aZK4_K4{2qAoCZj8 zoY07BEj%-NKP)g_BO|Rwf$5KLS#;^(3NYQ@veoIB(V3ll4(md$kv6-exee(Ms+7Fr z+LUwYGcAItS$OZINYXEt(csX;dJ_^i`-Vmo%Ci<>Idpo zL4s))PG;=9qC*8hT~GF1*U5Zvn*g=n^SA>?BIr_pOGH4tQfn~}&~zeSuLx((&biV% zOaP|uF5BWT8|PdwqrB)+VW%BKEC6KWskw_uUq6i0IA@lVk8&Q<`63ndTNCOjnmiTcYp z9?ZwO7kJ5J>Da!`Kd;sITpJ7kJLl0qCaKUx8>{84D6SyjQGZ@eTr`+aOwxfLF{ z59Cf!(N^rM=}d2SdYG5^Te3r!zQEU0*kQV&-3aA>IQubTsSEL{gic`Pl?9QZ7sKa_ z8$)P-CgW?-Gg#3f!feH^Z!2^niVigjC?JU;v<%N>A?t znbFam7P(BYiJ4r^%p;%C+zF@uJUDU~TUAYTes4?r!wA*a-SrJmj-iUz-|_nIRQ%UY zUu5$>%pG9w(4{;TU&y_b33&0^Z)Zwf*uEBSRgqnbA6?l?k1TuW_llH)2>-BVO5|C> z9`PiNzn4>d5ORPVH z6L6$sxO^oYnHKEh2njo##-2q5^t9kiSD|V49W8?-b@%xaxUy=(23>}sPE1hp)EYy+ zD6D-1J|-XOJ|~m!h&pU$sE)SDq0{iQVXITWaRFCUv@7JN695PKr^CPzk(t6#gLXtUFVKg{=uK1%r_2x1 
zr~d9JncTKC-4j0)90wC(LWpTD*2RnKkF8!kjlD@pKZw-%uKruINE{vzw=x)mPdmdd zBh4~m&-`O%Ai|Fh^Z#qf|J;U8LFGuXyS;v%byi{dqu3M0!<_F?JlZT46PgT=hqJG9 zDF(+oeZ|P|>M_!ZYXp-U?uTkLxV!z2%T8$TEpvWyt`+ZPd@1pbM%cM$-EwK>_N73Z z22SiP>&c2WLE&73FGgWS+iEU;JG1-cP3}|s?(Thg=1jSQ_`pu_flK1LhQtBWUlk+< zkPF-8?_Nzn>?AKMvvZ&1ZcY%%{XN+1h919~{$RLBU}e?custLElU1_26k?JY%Dpuu<)%~90GMIj6kJQ zQI8NlFdj%l;k!{eNDkVn==8=Wt&CW@G8)wgL4;-2;5K4AwHXUXKqPPPF0cRq730_` zbFXU%snGsO%kTS_{mac|mOJ{z><~(UEVGGrpSv&JE?I`DAKCV}Ya|#ncroFYqJ`N6 zSdp{_p{mYn`TMi?NS6LY7R(%qajp~y8x$+0IZ;zhoC14f<@@*RS(frjrX7P46=r_2`#v4DeUH%K!XOF2eNEsXhU* zogEzqN|rinsLIpl{y? zm`LCngaqJkm(ooF=;B33&`|}Eqj<|QQje=iCE;#F>P%ues-(lx89|!F=WuQ-71F`| zmR=sgbfo|uNgTp=rS^-SH$tRI?RM!rF)nXpnr+>m*J5iZc|>&Q%zF!2d-){?|7kpm zAZuu6uloz4JCXJ|e!b}M7ytVxgIfPsrsS_MTYW>HqJQ zga6aQ!R4I&p#1fA>WAXP7z#UUtCI#Rmrk6$uK{hK%Bd}q{%;O|Mshj!)5$f5okz$p zbXK<761%xrb{YHIs?gsLq6wj!nU_x>#B9t3c}+q2Ak@>-GwET3iGG3owgt2>rj0g8 zU89u6!q{ngM}Lsgrir{4t34LH{^Q^b}hs6#I?_Oa7_%2@~!k zQf$L(GsP6y6qWw7#!4aspyt4$<(IZXMTaWghLYA%B{O>m5*3dsdG@A-b5aPzgGNxu`?GaHm2n*smFca0of8-6U#4bVJWaFE^Pk7FLR??6Zmpu0OxBuyf z@+S<=X^6y`oqg`nvrR6?bv2cB@drXKGHNp$c)a45Jkahm2%9bL2m_DD;x{jMclY(J z-G{Vr^O zrWyR^1b#mGVZ0OVzN%t?ZrYQ*F{7XQmkWF5Z~Q ztQ4y~crBWdDaR02qpH?ml^Lxa_R#x^0Xc+7t`o<=9c=_?fjmNqyxJz1ugvaw{LoL?oLi;L<(b8Pc8wnEjKwiS@~cP zjs69QbFXQq9^om@>G!AmY5iWYK)KlulZ)3Ld(uJ72e*b4mtv0!+e(k+_{CWqvP5H3 zpGJ_dbAy#By5@zaAr9k*R^nLDbDKMP++R_qr#UPxo)AA;wmj&6(sw>DFAr1Q+0!G6 zgJ{SZ6r+7!=na{;*}SXut8|IWsoYV{8$~;c7p;Pr_wkC`!FfJZG1SW&%8TEY%KWb| zNMF0E{mJNsj~!_ULvD7Hg^P@g)I-->YgkxbIw8aqH9H{M%}XwO(wFAi2L`VNX)mXd z2ushE>tD}r;rmrETa}l04qy8xyhgH}U~rG=--p@=qs{adhJ^j-~rHq2d9+okW;>itduV7X4v z<8Z5$0fBpQGr!+6%42=58vQbVJ%qu{8V7>Rk{XNnUZmk@GM9W~%ORzP&}9nbky_zFL#A zjSatRCpl3D1_oRL?Em|?{^A|UkUHM>q}etT(uUKB>+E(;t~0MrrRNV!+AT9Oo%~Qf zIyyQnPu$8s7Qa<&pTAZnHd_CZ9vJ^vsq#q%m>yBQ86z@l=ip%FN+EeC4d!%s zC>zr~ToZG~<1{=azpvazrTww7iwvfK_Rc^=$A;c=5Sd;#`Jd6XLHY z_1D*>2b?rPaqYXuyB-!>bd(xTtAglbJJswH>(i_zhN0dNxwpxCU!Poj!7aay!C!Fp 
z`!k+RIG*G(L=e`_ZrXC1LfPt3i}Ih__xG27lc#JjGWk%}6t|_hhAZsJZ?B(oc1Ule zww<`Vcs1C~)yr#()Vc$I6J9Jm`B5CI!56<0WR90Mq~5Ezckw?X{xLX)GLi?hlyx1x z8P&V>oYW6&+sO5Yl%*Kv6xx52P2kpG!s`9qbpP&8T5l+CnNSb}E>B-q>H75?%)+o`k$B1|31+-CZruC~IpOAfo z|Kqrr2zW=*YF^m9(|a|R|Ke>A8rPqCdS7$ezE3gmCRUuRt=ymQ_eObd9^?Mi_{vYK z`Ag<=d$%)EM@t!Zd`w`dz-ShI{Gj$94@Oxh?hXgxdFy+J%99WP#$V8MOlS55j-o;}8!EdAj7AU?g&;Mf=c(*fM4%SLO<7T)xal)E@M8=um_XcFAF0Osd zG-!+ZrAzGR1^ny8r$?6;bfDOqy1TolH#Rn~qC4+YWI+9F(-+Luw2$R1Yx^z^Z-Ril z^ku()ziat>CRJqwaRJhbgVy!>OfT(ih7s^C$)ZqT8mG3}S2V-bhf(F-|NYial`EY) z`cBs6JyS1ODms+(Ri$2E*Wp-r4R3^MTuM3W{=?$F7=h~&yZ>wVnTd&CKk&}cD0R$m zpEcC2-f!vOw_M)Z&$NF+ep{Z!e{MXCx88OCf|NJpm5uA?3HJGuvb5&dWBzB(pC?qB ziXOMvDE2F(D`>!?nXXElVX?Q=yTP=;Ai0h~wkvhS^Xk=2plS|Ni9T_sE zc=q(Mg1-NJTMh5js9>S5e8YIR+WR=8oCY-IvT%n8uON3JM&`+pKV-~m>GWm zB25H4lxnT>Upr9QTiE>b$-v`r(;jI_tv9M*Q7sL64cAy`|nM>jiD=t z0rOxhy=QucSCGEmF0e0Su3J3qyB{QudMGy-CATr;w!^jfpJejS`h8qxkYwOcJ9k4> zm+0JHskA%Xeuoc6Ge1Ev=NiM6^#RSf=~f6pmRGM9=a-dP_U>AQ9sKVP;r*#RCsXgk zWC#PMQ5rTyj-|3LY2`?EK5~M%kbIzd zGJC?g@9~yP06$NB1_Zj3S;~fPaQW_$18%bK^t2rOh_BUfk#QtGJT40nuiRE9qtz(m z5%0V>el;U1w2kw}3f+xRcK;>c;&>$K?%?C9mPrpZCUEaf1ktws$uHi5+uw^|LpE|V z@XLddc>yW%N4>+P@PS!uCg$FYYvnmAXynCn4iAWI$Ki7-GqtWhI*|IEL5O*o_$;S0 zL%nKE+%M$+x7^7BugEJYljoq#{1rcGYiH*?Ln1`=6*^g6E5l;3X9!1ga&xI>%7_5O z(XBb2Es6fVQ!idGdgkTl>r6ZCJG_4}&H2M!)wn(eLvKHQbqA}?h`0?Giyr4T7sNts zpnd0t&mnTI6XrJY5PNJGd~2naLziSqlIBS{p;XOB%01f;9Ba){zn-ABn^Dq2@BK8H zF!xu2Q{m0rRj+^J!1pHW9cO(-t~U<2;s7J}g!*Buvc*?Z^I z-k$Ndm(2_*37pac5!5f@i2sc=x;k+Micb24nc3OwU7=;`@@}(nuG4}7I;9GKX#sp( z*=zw)9JT3iI;!a2iKVpYC^eMQ8>6Fp_wE(NZ^*;A;;krv3C^ueq~S3)C%pQUD(NN( zJ0{>4-b7R%914h~i`?=Y7&mT^Mq)wdx74kcj5pt=J5q`fiZNmwJ*F$fp9Q;nZTP~c z9`N{M&{^OrySXoU9~8xqugllBPpn@(>#rC7tNR1_W2L6aau+SU^L$gg55(QGQNP#C zb08hNdH2#!N6K4@`Ru`P+tW;4ssnl-AIzVzGhszwqDLql6Ya|2~R)*mdq@9+-J|1=I2LLx(_|@@Ypb*cN6@3lNPUJ z_U7m3yQbQuHYJ_5ydaco&vcnrSBQ)47-4J`4)DVDXV0bVjkG`h;_}w-Q5@;=nS@}Kc!48~aLd({+kBi4L1vgLGUf@dGy)8ZONoF@jEYOK`PxGe*q#wPL 
zf#s$uFRgZ;(mrI6UL@GgQ|{k(!gB2`jmr6ZN#)AI^#sU;=hDhec$#D&rtyG0?;-tu6I9WV=mmp#jvGwo##Drc1!bQj5mC6z+XPm=}nP*K-aPxy=oh+p+z! zzB-BbU#T1d2vs}RU)Qp*!m`lE(qKY!NZ6Q+ogFw&*>IFBjfshI?7z8vukpq$X)lH$ zCIfT1Q$-rvNE+=RI3G^fp2JzmndY6lIw=oI5jF z4mCc!zvoA>e1#)<<6U}6;0^CkNc+h`?L3UN+s}Ox z38jHv>&lu1C@HI62dupb^Z<9U*WUR|6pyV9T9S6~W)q6>*R>0tBP^bJ-|_3K<1I<( zRDP4pM%RZAL-(eBb#-$yc`0G|y%-jn`qS%#xYsckl36Gj^>`(qkx?KFPQg5XDXyeGVtK!7z zREaN<^-k&ZCLy)zt{n8b!uJ7Xw0Ubq8h@y0Gr2CpaxcfZvr6qzxerUID4BWe)EM6( zDOlG-2cUlCdT79k*gU3q?p(pC^bYZ&ng89R%8vo=;Ox-UKjDDI%F!P)0RIc1NXFZqe;TU>-h=Z4tdTosI_06|IN(U;!)& zsCBy@OntV(7st8qGImqMgKbuGBTRHLvDSU$h5JY?mKdvqSSp55ra7+3P?1@W+241r zkVvQlzY|(ry+(k1Al$f`GKeZMaCR^g3ijkQiqlrduLr!B>|S%s_Eq3EvKjEfq~((C z6q_*!Z5iF3JEqFBbTuPa`YHosqu58LcOTF@+^aZHt;(cV zrEgtj(d~nn?yvtAu;#AReWeUI1VHHP9(h zkRLgTbhbbqRp^+Qe~Q74Qnuw8E;6jvIRire%1nf1>B(-`^!qFP0d{y2q2e;P8oRX; zE4Q(&WU)x;C9VCc1Hpwr25+$$f-3--R~Sa@w4x6?lx?_BKA%zAT)ugr+P`{P*ulSJ zZWHEN7aU(97_e-;{;`|0D496Up7%bMjFW)Ho9F0wmpTfRp66KBzlK=y9JC4d`^n6AJXr5$Cf{a;;}^SvX%e2ma8mgud; z3vJ)FNDHQuA0N$<7!bI3?DU4hdi&~!gB4LOceM*L!je+YH<-j}>k_Tf@06U_^CM?l z?YqlYB5lp<5dF#m9n9crW#%Y{@TN!Cmbv2YeJ4|e;VE6Kg+q1a7kiO(*VAYFSI_!N zHVgdk%8%ir^K*0C(jqx!>Co_GawKOE(5#|IAvnCKh7HRZL}$_tA3b^?XinA#P1nie zk~GjGaVW1JF_@{3x^tG~zR;kZs>|u|;hF{GK<%;O#dUhHsq}!cT{1v^Y>Y2iug-av zNZ+G+(Te+>Yp3ImeAul%8RYj-}y4f^7{!Aj6D5Cdf3|7h?c?m zMdWa{Ey>3CnA{~=o)Hz$9g4g=KW0HmXBl6K>(hu)(UdSfPe?Sm7{YsB&^x4TC#FB# za`Ttyy}2=d?Tz$cE56;^+V}X*-OiETHJd|Pa>_FIgx#;@udo~H;?Y+Wps$HTdJC#p zeXcrV&1I+S6|$rC>AF|L<+NeRLewgs^0flx6r6uC%%`&DzI+vs-4uoB9NvZ}~ILuKnNcZ*fu+$KO0tCg!aUPePnz2e>L zhd<&KZSIv~J!0*feqKLmSfln=9EqQGpFs^5TE-p-ZlOQQK5%3&l7rsKgCy*tIWl%f z8*J()K#C6lo5K6xRXi6T}D6jT1qa2320|!@Kt}y7$e3LY+NJ zEL2*WtBG;z^cavVb5|m4TTrbP*;!YU*)&9Qmw#ASBIs1SI6v}i_KdRVm;sn0Z$0}? 
zc|JSX6*soOFj8<_mA2H$EMrG^kV7W~w`)_gBTGG5zCc6Wgl7ZNqIlmI^pfgzf|?Kz z)fn5<9YN2@=Ao;4Em7ZVGCNxLGfT;g7`FvSy^PVkY;ZCoX2epD9LfXWorxP?|LS2ww$DCFR>Rw`wf&XN%26&TNAUz%Q_EUCX}fZo-?EG5EX0DO`Vw2(+kX}I zLMvZMDp(J@*_xw%JQ|8!<84Qrih~Nly3|+>38Y9P1B)pvg3{s3dVPhKTf={J`Yw@h z;Zp#knD}D7U0?*xnNtWEAR$rNgt1MPc>-$r zlTjWdY64`565TWg$8x*we69EybY0Xw)tUd-xLfha^%`BJNlqoJHW*LK`X)b3Dbv7* z1W4{Dy&*uPXfe=he_Rt!Gnc+2lqWp1rJ2_?oB(N;EI7I`HGB5khp>1Py^X;YFqEzj zDtAOk5^kxl(UY<{GuCDjOOE58E>QVX-ON?zX-Ij4hDSY}YVU*4Q zVuAy#NGcX*!EDL*o^z8elSI-hmMg|Lvg&tGN((U)T*0n9PSqG* z;_pCW(g6UsMUS~u|H3ArG^sw9T7@-QAT2_+}tjQrKCI?rsA)3&d+0Y z6p$mSM$wOyeFg5;q@9Wn#1!(pDWo4lCNZd+6Wj}-rIqC!WKAv4ZR#hkpXhb@HEr<4 z^J|Dy;Mar-_np**to7=+q8~H#9f?$!D?GS-)@UJvFY1?1`Z{E^^Os%~_$un}O?p$< z9MSr2N?LP{U}?H(rYwiG5s*g;pQ}_Ua(YcaoLGr{_60K??(c1N4y&;5JqI>D5ujoCW?Oq7)U0-~&p646HARtfNx$rGa>j&a}o;!DL z4zd(MC9#a2sl?gOFROC;iPeY3&X~&TvwaDQX>4q?dreKSxm-2;>4dhuOJnP4hGny7 zi=|-mgL&(N-4Et%;;&rrD7(%bvqWK**Rya6jIxs&HWnsc2;J%xl69pOxfcjq z1LBdAPYgewh}oQxU>}%XmvdFRfp{s?XOqerKay#0?T0D(wvLW5^k*(T712wqy9o4y zWm8axaaHlx_qTixB2ied0hQQ3sZ$f^`&Frsol;+sgAmPI!P zF`Mc&!I$95(_6B?c@6U#`}b!?`eXO!MWQ_43XX=q#eR-t^kgEseXQJkIQ?9|N0G_w z(R>n4ZWke3>AF4&9Lc>AEqj*1O@-UfaEKdw`fc>_pW?)htMo4)=ZsI7o;kavgB(&k ztzy>xy)|xEFqfsecQBWY`T|3qM$fU@KbB5rJ!o5rvz_87_pka*LEGp@t1P*>Z>*6r ztwP%A@w44;t|5J=FTr@OMsl3qbA6F9Z{NP{l|6}%M>?>2iC-&1<@)?OS4ur3@ctOR znjr6wY}4^hn8Kax?}gTX=d+9y97?Qkq7uFSL$|EZk|+GT?gJ6p>0XyPl9P5u@ed3> zjRQxZuO@)iqk~I#v)zZO(3&I2rgF3Wh%f%3g@4q z4i4ISO+mIg<`WQ5DVsu0B#!c&%*)M9E&#c6Ix3&3o}ndCrTpdw*_dj(xpMCq68cNj z!a*+ObpX4KyuN-6rijXL)-nAI21II)>2i0TxTXtA}j_ z!)|=0BT88*+iII0u|6O(w_zoOT6-R%qT56v$kRU{De_4JSo|3h(BG!@p@&kOfq%h8 z6KK5yszYMvhsvg$_7dp7B4rqeS2^%CRH-`t9wt8!lU)p!MsjgrCs?Yx=?$h0HVpvn zsu1Xhi0g@!Y}1w0gu$mf-l;cC>6VBnBy}5OV+6kvi#5mQb1NLrpJN1Z^@EOfhK_cw zGH3_xb#um!TJ<-tErGeLJX;S7rbMEACpq5`7I{tQNUZ%r2JKu>Xm-TotA4X5M5Wkq zgr5`OD3?u5b3P$Bxe%cLpB?v_$M=_bDAcjZAMu28EEm_l%5ZtXkuqc!fap#z-X0*` zo5gMum>DQ%VVgjXWnh%C(f2y%TV9-F>LAhgiD3un`gFAKM?T(^Twx)rovQ+d-M~zr 
z`jcZo^_IrqZi|o_BMTN*tAn*Vv0d_SAAgn3ePtU%v>s`0JD6@o_oVe;Mb@1QXMGoI zWYe3AKlnB8`N|>~b*xHp^?B7$_v3cB`1VZt)W$CY+!L|#cA<{E4Uky;vNo~EZfbV# z6c(JJhmk@zJ2^=pxX`Mf9u1@|R4igp&7~(Pnp3er!WI4);ry6V(ivr6WE8z&*`yg!om_mp z^lRY3z_W_0{e_d|Tpl)BN_{0JSW(*Pd7g!~q_O>e20k%@6Nyx&SGKAY>g~-aF2yho z$9y8tmxs3{lj7qsdd)#&*YG1GXqYa`0o*{r#}q{oN3@Z?JD zi4!MOdBFWaJ^=ynqobq41_A&RR~hHyV|Y+gtSMZX>KB4HGa<_sI0tPL&63rTbESavmtH(X`B#HzCFt-S}jDJzDdIlnywLXmfi_ zrqRq*jKIUhgm4h?Xo0dG$B^Clp>Uz0mCC4OV>QuMRY?N0C9p0W+e>f{zJi5tj!^98 z)C7Zcyc@hZO!$arUkPTE%bcL6nqjPxp_Iv?QV7y$CS>+Fx>jiYn66ny@k$AOvR-OZUxB)7_i(^(h-uh$v7_s}cK|)RYfOL854p!y0PA&9Q<&(l}t$Zv7pV!3m zDX8g%y{`*l5s%$pG7APm%KZmR5Q8kc@M{(Zk%Ec9SqP%X(t;2+m0-ycPUK5*bJ|H1 z98o~DGA2C@gc=eP3aZ3Hal|0HA;?uAoJd2wM9jVUwUA#-O?e`>_^9iG61~A}Yx3sL zG7OPO^1MaA-ird(4{`yRg$5;)zjXx}(l|#Xam!|%>D{q8Eqcexl>1WsidNb9QOP{J z0=_>fmt~ENo1wp-Is_$iY)vvVGc$n|P;9ChFb&m9FYSxwlH4bt6xA7EicuC7940^o zC|Hd(*oJ9cLlExqk2>T0FpOUl1F@fd8NaG3+Z*mBKyXc!O85w_>_PzX%M@!t(B1Y- zUJT<;qCxG*=D~2&6TU2Y>gI(`?oh9)yI>i41S_8GUJOm)DEFZa?|1KgXko-S43_?N zB1kMunRZf@UE|mmIH)p_%R_?g=MW;sAL<>t}lNbc`~L z(W0F}ew|>l>IIJgd#a)dAn}FKO^7ZCSPC5L!(;z;lU<==MIJw_`dTx2cUn_y@g@E9 zdmO7?)D3w>49qonK>jCDfnUqw%i{M9WwWLQoUcGwdN$PS?FgN`>EosXj98CIRTY{!v0uN1JV`A{v=gX?-SA$r6VJ%khEN>JGN~2e^w9OUt+Kc z&Llk(2l1Jg7{(jcm>BT1a1ddRa_v&mvdtVqMmOpSC_!%+@r?8@! 
z!hQbUlU*rzEClfm;73%IMQ60$5Llb#WMwPCxZt3EbKWx_kD(zJf$ug7HX8wtUG@!> zync#>24W`*cht*0OX09*8h+JdAGw$3Xot3m($Rp?O^`Z_?BdB1H5JVHHqa=jrKN2j zZqU_ELHZAFD0kGLp$f2L#v{cl%}N z*8HUD-z%0D54%((P1mIXapH%{q!)yS%aGZN5Qdp7gwIN$EIL0Pr0&MllqQ}^sO&9t zgNXW>@M&GDj2)fo6X22RIEt3v^>+um9iJkVat|YuY0%lpQ!KPr;Rt0DwNJ-HOaBm2My@xOdysDUz$OB3P+oWabE51KrE7xVK+xC1y@~FLi5gD z1UQ}hGBB|_LqWY(VjTKXx}sk49qg7eA-(C<52Z z=@p?lcgM@10eb>bU+=#WT9R%QYw1Wvn!)()*knJ<(xZrVV29Ao#+xwqg$s=*A|=^@ zcz+fzp`9Purp|ZiJrjk<4=n*A1;r-QVg4R0{pvNQMP4nbk0QaUuW%BZxEOL_lWJ!A zj(VZ>)923x%s^u0v69a(>Cza&DQsULmZersUOrgaG9w-nFpWe&Z46bB_;YWw;BeJX z`%pHy_5^Ce)}$uI(Zkg)pUig^vz|Drv+CwE4=4s3bQXoX3K2NLh;)ln^$6X<}ob}pmg1t!= zM7u145h!dOgHWYZ*%bY-t)1N^h%et!Hks^%%GjU^J@f>)kRgGVa4r&>ShARW$@-1YA=+BTxnNJRf@q&Fo6o{IOOZx(m~? z`0pUY6(S;zCOBZ(FC!>ZED`aW!GPdQpX)DIbi88iSPrpjW{OdF&J*od0dgx*Qe zOLKqfa8>=X-6y_(7wc+m6^u=e>ztW!h2%)!>@Q}^Bi+J1h2}j44D-_}`TXo>c$gr- zkjwn>;#~^(3EwT@eEh_TWR7cW&|WJ-N)AE+#cv--((xL_UG-` zpO}Qg;s>91f4E%vFfKNXEJKisD#qZ+mi;P(OGCuJ%Lz~KgGkP9_(+m~8f&&rQQ%U{y5n)v3{QlF6f9B*P(40XlBKR%Qwg2?)A-|7A}r1z&lXqou# zZxQ7>__(GvcCGWCzanRXDy9p5aoR#Z<0&q+#t91}!?H;55LEVPsoe$Z6xn=Q0{O0(aDri%{Us1ETw zpM|j$8pG@$ZBQ@aZ#`bvYVRe$v}e`@)xC%{(>n=RBHYE;AjtU`KHW=5Nis-JFo>4N z4;}bndcv7|!RW1rz(5zFYq7MgqpNFz^x9#IYysPnuXO*Cz??(7aSi?Y$B~Hn_>DVq zRes(?LGHNY-RY@yO+hii+g3ylQ`O&N??V54v*)!5IE^5Eo;E;%zn)Y+~TUn!B7ac@Htm@s5bBn^5 z9=B%Cr~OJ-aD1j56@wpJh_2gm2*sp?jFEAJT$AnY*-$CAp!5;C67DLSe-@@??gGkP zQW%Rw-Q1t~K(Ie?_Nj?H{Dv#_rA2OQG-7qOn>Wzx8+3X~Xhy4~aLI5hLPO!_!2mwj zKk$ov(}ALt)Ms5WAE^oacb@on{&%k3VLR2WB6XxpZ(C)(B{#u7gp=vJ(_oDv(f*eLxE!HRAqGB^XeH^`JfjGap+e2 zdrf^ZA)ddo?HhdFwuZwvzLB)*KGIEev$`!F<>=CeCf4PQz3p#EAjQw31@sK1PUoH6 zRmz{*M|`|)|7nhi3SxBL_?OyO|8>~qzRnsPh@RYPsS8S9O~ad#3%f;4FI|OHvPZb8 z@sczo@=Y45T`3N@yv34iz*f=$`Q$|o2s_@Yq=sNc_04CMg>{T~G$al+*4~WozZxMU zOldc;pNSLfK5V%Vr`)!ObiWKg6PiP^r6V-)unLo66`XK4#+u&Y?{L+tSM12?I)rk& zCfFwF9gX*pG=CmC#H4q9{Rqwa^(F&X3eI*?-R`(*lKRYATX0Zg$;&yQ3_-*wQ36Y` z&sG5l9TZ85^HF?;?A;6@D8}erb-#{}vvo_TuKC*VAWoJ#|A7<3v9CP2r)6oNTVc 
zu|F2B2~y8wpk-1FHnoNiv~+3tzIJ?T`BZRVBU57oLwjOA97VM)sWn7yDd(4m_$Jg- z8oe54A7)Od@U-`m@LU$Uf3_ZX3P|PD`@bn6P6Nz#+p{4uclM$?f_Bk=uX5bAW0{4k z_tch)>7845QfK4W)6LH99MAGA`=_~wOg!_S>|kZrwQVoVb-)n7Uur@jZw zxnn_Af)wUl_HlrWq`EmT#b*fQlsDp@dGV%}J4!;Mk)F5cjVM9U=tS8!{-@iZ%gFMH z@;Q{@)FzWx`;8@QGWD3WbdH{-#)U13wx&xo{FEC(`i^39mh&3Q*0k8B)2^kFMy)eurQ zJ%m)PO(H#h28EAjkTn0~FX$?2fk+~D0w&tt@=%))Z)(i=*;Ak>$h zI|jLK!T?g^0PZ8KjiH&< zvZQy5pr5ay7B0&^i%Rnv+Ft!I;{0A$&0=de*_e(8nO#%*=UX(K0)Yn4ylar8;4&dd zfabnbx9?qx6ti6*x7eLpS|tX5cB=#@{d#kVZLT#-F-U&PleFSA%(){*iYdA8eIEtK zP~nnjAWx^1v0(0|LP)A7BIW(A3>#AOIGzUMJs*3|7|p=8XVPCKRC+>cZ-fvjI8mnf zY#t*YO4S94J@Fpa{vh5ih-Yb2pz4GRV-FNY zeOh+9s-W;Wkd$Hay3u;+Pc}|x*NfRs=pCE%f);T1La5cCIT$nmeYF|13osb=;1unGu;-bQDl|$q%SV zKQPgnV|R(f^JDuAb-J2j9QyLhsdRry%V1ju^Y9A(yXXO%FUtubedHl*1|8AB_uBil zjSV0Phfj9_(#E1^dr5j8qOr-t(4Y=4vLcL#KSOmHn0kks6#Ecn7oi|i7Fwc73Ev@1 zr;L2jkoq3VS=M<%I~>3^DA4(6#zMjs@4juv`-)QZq=qgO>*x2HEad=qSb*g42+o3D zL2Q@Lf-SpBnbUWX)S!lhF-s3*J+h-BDWg)k0Cglc4=rzIK7{hOnv^6){kz>p~5vHa10eLZV zp=tVEjzviBlF&Z__d54)u?$mgh%_PkPKIrcdu2rowrD`?Hqr($UkX}D+NC|aGbXO{ z&@}On$w^iv!ANC7`hyuVUkdHdgyGp!ZR_W9M@}mWKHLE*fb=?jF642B`*BTz-xFI- z+!a+oPC2V9S%i4GL6r`Z?(XhCTP0&ssyW7>u7onkPXtUt75%xw)8Ui7bKu4=o_q54 zhZxNd{o&B458!v3&46Udl5#snakdCVInFgTZ}{yJuE?B{gV2U`pzg{eCh{lsp!}1L zry_l9%+f3;!+&mhk0cyAE`cWindgvUhYXVnr~_WE*iPBQ5ekJ8eebOmulf57qsLg9 zn|v$L3&$VA98>{yw&8e_hLM7iGwH68Gq+#F{*n^DDBxwlVYqAvKX?s<+#Xr7O2~l7 zBM^<+Ypi!>O>%%5r7jF&`dyIq6*4(cl)#W_M))Z_@nAY$hQdx2vxg^oWcYohIPvn) zzXI2UJWV`6MnffI@YZ^!1TeRjUKz~Th&xgslTse701mn_JK``A7Oy+(+%C6%{rp#g z38><}-{#G7N@VcS*E2&)04`xoU$wp=K>0dHS3L!pJa|-6UVc1aBFB0Mm<)9H_{pdK zeX-mRmo$>wUUaSVjta8?kt(>Q0iTBf4D z8|eCwx)mQKge{?jKf_U%n{qAQt@X}Z$b|mqA_%cEPo`R=k?(e913GB_@Y3dWHE%&j z`chN+y2P~*K87$g`RV+31VwAEk2KuD;!Yzg)7TM%mpwZ(-iS+D%*MI%z)1K*2!VCazfjE5q z4U$Md+Vb+aCD-ub61S#!?$*|}AByMyzb5P>N}3#Ff$){HZ@Pl@r9G=_B9AviBb@Eh zR~7aD(5cuD91#y3^C#s78Yjl ztu1SFu^*9e|Bew$c8XQAwSeutt9?(U)ZJX!^5%yf4p!^ThnXUDiU`h>B%;ms zF11yWNG-ea`(>!J>&4k=*O2mQP0Zd>YzX=8f1T?(pn#~N7rU2P{12eaSZMM7?&vlS 
z$LjJm3~2<4zv@Vf_}`Sw5Bq+kUB7+>&?ToB0F&P&`5^(jagPS&&ohNlXq7_z*vDQo zii#k5X3Mi??A)vSdPHEwt_t|o(XW&y)8yIRnfdv19x$gwknS6o+_|jtbZeL!!eg|P_#7?Y zfIAelO@%9i!kT&**D#ADw?+GfL;eb`G8@u?c(gE7btdoO!7~u~r}Mejg9N*Lzsja<<^)VoLqjocl@ z65{>-`2=!K9WN=2mWlg4p}HCbCC@H2u;}jP(*zI`9N^ta0!R!#Wc)F zY*vPn`jwM{t*Z+#-D_f4_qVMJsioY2pev`yKdxg-@sQ9~B@BsN>m;uQPXZOY{A7KN z@-@1GW_^{@|obm(Bb#? zX&)E$P=eO%3d&`?D)AcAu!ilntCVXa2>l-;8s~#_M0wp0jEi+%W6-AV{Y{N$&t9Tz z4#iE<;NkI|AVh!99;4ior9KE@BdulUW?3Gk-BNC-QU;4xhUGL_q9N2P@LL4%lHrJe z0lG{Y2>ICqdsGN1%^V8@9z~GvBe{2uxw@s114?L0qnhJ2-ikH_ zMfbB31^ms|av?4PNl<&?HiVK-TY%cXXSrG++P=(2Fr}$Wt0U@W9r`mg4^o^aSm?Jt z7@wqo3sQwj%TJiqmyArhhJA`3N_+uBbsj%YJS;@w{ zjso-;-mbG@E}}}syS~NVg5%RW${TCX|0YFtt*Tn`t#BcxYxa)AxUpzfU)7kn_mS07 zfoY&o!(&fZw9xsr`-b111Z&KLT)&g~5_PS}BWt|s^$maQ9Isbbb^gobgxeH^4s}|~ zT7*g7<>)F6D4SiU#g9L1fYYiPvZbjnG$~+07`;QTZ%FOG^!J*X@}Po0W<^{gbGg`! zk`p}MS4WVJYL(aZ43#0RO7m&+5mi|`eS@D}IrUrc{$WlD<`l->jSC8y!?P9JHgv2) z4LLfDJLrD?tjpukhGFz|rgcy4y6w+P%TbC5Ik?b2z#ZhXq3 zBi`tsvKhW)5i@5^Hk6kI{Xe{S_7icMe(BYb7xk<1svYA)WvO4wsu1|gjU@&C2giL# z`nO0@a`y`5z(|&-c7x~+uPA-(j6>6vnPyd1e~kdT#d^A8r4~Q}e8DK{mq%A+6few{ zm8C9_?a9aAZjh-~;|ZT_#^b3j8R@s2XmGcsK+iG&^}#W=mu%`tBNNd};Z*uIbf+Je z%hnA1K}-mj->ag@x+81Ar7Q;~M(c}B$H_zj*1iobzNr1+@w#K$8JcW`VGkbNrvd}3z+8u#&hMVZAj<;GLe?((FhiKJAVr)v|3NZm-?Kc{=9$XyJfcfAN>kK z7lp%Lanop@x+0ijko*{;9TG0;4wy%4GRiO)^QK=AGixn4y4%sOe+|Y z9_7cxaQO)+hj;yK*?w}}YQz8z6bPLE{3nCi{ObPo4dbc_8Cf^FhIc%zCcVP@Jfdey z|NhKuc=ad9fz0|4dw-FYifo){v*mHIH>S2~Y1ibFg9wO@X@EII=VD$Rrks7G$)x8k zekpq=vG0M4d@9QIStvfP0|F@+M|sl(qsuD)lR={c*Gy*B4%ElQ4fpqCa$&>hh>)$L zeO7Qg??({b8jd-DOy6no=iEdAX9>2Hf+d$mPtzb-vNcQNRqK&uQ)Fm zoF4Rg9rC{p$j!FqUe@nm-T^VS;V9W_CAluuUynJE!Hz8RyVfL{a(Im)BTlWDGi&@? 
z>bw>yn2|=mqb*4BMn_kwoaWdvX{<{Co45OZ~qD1@rmo|?u=^3 zcABeaDJj)#60za!tAde%lz(u9yF`2V@ZsY=Mtw&3h6X}~Yk6F(=og!r`?Xx;-z;Ak z%79u6Kj*ZSY_@aqi`k(GcrJga%0l-Pg}umU7(7JwK0u|E_#=K{v;Q^`ibsVo>2+9J zt7zL_Q7J7KcR9HR58(DEfB||$Bua%WQBNO@6LjaO5bjb=yP{;JZC~qC_aH-&%%KQe z;*h=d3lk|KLnY@K^*ZFZ&|~RW%F*7*q?_DNs~5*Vqo4bat))&sWh71#es}d_=wl^n zJnoC~L`zNIpdlXjIdeI}0zdS;e%s}j6!!`T!A@VMC3sMX8wYkA-qSYhMh|}cV|ayp zPo!1B3Ac#rxqr`*(*~ZGVOd4#eHy&o527#A0K9*twz~uD3^jYrvy^dX;N1*Z&x^#3 zj0#&po-2@>HH7V76N-y{?eBfQQy^+EZhtSX)2LkXgYoEE=?uV4hDSK1ry7y9dbCR1 zaFpUlso`>l;gFxTm$pB*AC`Yx(6VNl&lD@j)#RcJWR!r7%8%2Tt(h-ob&hs?ES@4Y z{ZZ<|hN9EX(c!fbtNUBtC?|A4479JB8npt;;N`ldK}T$-x+=JaLXpaHcDJ$M=+%b+ zh2o1?oxgaVM;LgVWMS0*@lcck?et&8Tl!$**;_<*u37-c8R`i*X}k$EkMBs0*OH=& zxkrINW<&763A7k_N-Z6ygTtE{FDYaftj_@7X~Os2gmu{y0dB1hN9(mQE0|7q2cS8e z_Ua+|1i_H_pzD!#&{xCJ{8)*=9W8hN3}(H%zG70xHLI0P;6>h|0A6-dzh@Ylb=?_RG*KN?!ETF zXs%86#H*-p$1hH0tp-Sz@f8iO>x3|NhC&FKZKECb>9Od@*VRg;LKKbfzDN3&g zVgk*u8cWIifmvmnM*0yiwy4x5?ek5|y3dUGdwTrmm>J2pgLSn<};&qXj zMsD-6$O{p#hrD(AK`Sbz`OXdvSN;#N*29p(RG~!jV2BdPixl8EyRsizw9N2EL#;GU zifZ>D5{>t?FxQ~qzkfw$V4R+yn6R+aIN{}uYgmGq>C>==ZDg_Z`vS1&j};>PTE&>Q zftenf0CVd$4h#5WA+`U)*6gn65g*93B#(1uub~Uv3=&{=!ghKL)jU^t>lfIHU7tvcDh~Cm5QYgWaAN0ZVx}OSWKMi+gsP_~j zKJcx$E7*_DS01j(FhN-Ae**li!eYIbI!nw$8HYor#^P)biyvOu&GNB+f;fnkz)=#{ zTUx;*uCM%^A>=2v=C`oBF#x2k>8`J2#8`%uq}b0#o85h7C#_kVVL~BQz|7kOKX+Dk zqYM$TpXyG7A({-seI7naA775CD7y&#Jiwa%h7t-0|2zlN-MZ``C(R1@OZXtYNI=l;O%Zi-VQzb^TL6k?2hc|@-4FF8cxb!Om@-kKadt_+D6{E|GhZsp*xZjz`r zoCS~XG`faGleA1rC@hoZo%|!=QcjDv+u0}+$K~g8e~krRavm}I8nN3BP+)6ntF$_r zkVQjWNgPc@a0hX9`*p zQ^Mgz%XbhHBcW0c3Jy|o4 zE*nsiv6so5le{NT!U4*v-FAbdL#8SD%pa{OYx@COW)_II<2tk|Ld|q>aPs5c>jo3P zzTjC|beRc1A=Lzn)@C%6kG}&RIy8-s%`7f15*D|i$D~e`co&q(Ca|yJ~>ycf;GskVFnu4iv`K^@%cqo|dDG9<oR{m^$Pq;@QlH8Au)D~&xdcH@1%f#S^z-xlyH_B1zpu< zDmRdFpc@lRP&)L0wYy*eStzcs>P3I7CTew8TN%NzzMOxqVVeI~&Kh6pUuzBKkmulL2W^ z#nQE0VEM;BF*L-aEr{6h_7QseTk{88dbOHz$16x2T$8;2bUbc2Nc>~5R0G2dF|>8UB+KZ;3CdUk zvck*L+BHVxfPiv$Fj5vfmnqWzQ*L*92L+k(X7U2&m|kk%phz<)kI)PUOyG-X63^=* 
zg&N5&TcyZO_{#o=;6xF2Y8vFxy=IN5r$v2dwCx*Hd)z6Z`ydoh?$yps;q~D9m{ah0 zj;vAF*D& z;trMq4rY?HSLnXGHZ?YmLk+60;vHRlxm2}cD+S&C1`t2!pVhANl4B@GXN!35n!Gx& zQH}4M+vy5JQb&X99*m58#MX2;q*HBDbi>ihS9>#W+{_Cth#TExble<|dd72ewOjf6 z`i@_)bdX(b)HT!rJ;i3?UPoc5#)%`*_dMq-(HXyLNPm{E5I@-1S_?pf546#LTVa<8UeNC z`TM%~T#v3m7{v6MHx>Lx?_6d$#3(NE&g$}SNo}?ZiGrMexPeURU=Oe}y}Qg-{yf6q zg6uLq9hX}ZqhH3uWw06KB%W-7XqyF2ZJq7c8!gpaFk8=_>yxNo?GsJf?+{VjYjiCz z?8a8wt;rt82ba^_`ba}$Lk-mwMa(gd0VfOAk^G&w&Ie|X)(S*wR2TvwyUSWITW<@e z)aYT&_qOeRQQGH@2H}^ho&78<_aw5Wn@pI1 zuG`9WXyWqZl^f0X$|Nb$`v_JtQNU+HneUVG9-&;`+PfUJN7<)(?0iUBS4$J2W7gi? zllSDU_PC8IHYqNz<=nP5QiB zDbtOoTL(S2R8hQj)R2QKf|-TMe_x>T-Fv)z8t=9+A7lP6$?uRV>q$Nh)?>#Kf)4HV z+o;0A!Y9~(cBIx}3A@E7Xnl^|L~?GaD2r>6WgzFFoWyK(;l$LhowIX&?Po9pm;7XA zoj0NAdHww4P{)^w7?WN7u{|k{n~{&GDE|ZqDQk-w%Zvv-^Xo8+M(?ZEVg@?zNz@RBYJ3-rz1z&Kjyd&34tZ28Oed_nCQ_nC?U|pH#*vmyqt7seOn_S6AgM*|TA2K3IDf z8AX1ZY3YaO9KM_~Wo_WVZt{M3@Lr^c*GZK~*xXx`tEoe_9F2hJk=ExDk9x!~0oig) z_u<&T_fopmLE_l95Wj?|o)e4Dc{(LZP3U>QU#2FxVTX#JGkl-C%FE16?T`KJn5a*& z`*wDR0}lQD`!RNOALwu9liq9dAlp$+qlu>5{&u5~?1nv1P28|fS#AXt%IE2s>!$|g zBfNVuZFoZ?r{rbLuQE}yI0`*rx8kh|`MzzJ#7--UCAOU4y{CO4VORM{RFK#Dd(t($ zyC>6?t=vPqW8DUB#m=dN>sQ<#)XBF(tRMK8Ke!!z5ByoZYiaveYVFLxv3@61Cmkt#?w@ zc9$!&AY8$GE2iXWjl#gs#1}E9UOhi8UU(~!zNN%$Y|AOzgo|Ky5V@P0%gV;qCtVya z$E@rZx3gx2>D<*y#vCGAn)x~g1$N|~JiMMUSs}~Qz2dE{&hRqg9L|)azA3#LO0)AT z%R%CBblzOMPUL#pv?Bqo*RJtDQ9)hX8SPdO@aK7sEW9Zk9Ih<>9*+LA4p}{m>*S+F z50sMpceN{GAH<|1n_s~aa@FlYo!nF8;C9b+g7?^9wpQ$kq-#%T=*K3FdAqd%8b`$f zcO$p#A^5eaytE7`i{D1KyFGlm7)3=E9O7GVm3jL16jpFqF00969rQZj+l^0DiwnJhdL4DI2zQsM=b3E zYT6m2UQ~}&T@Jxep4cy!HlDciS=g-U7~}P+*fgV#&)Z)=pX8)nS9JtOgR=dsOaF%y zJ^`1hPBN$^oS8~nd}%W)`m5*H-cJ13KFwX|e{7C^&uzaBZPNtYk=JAH4-|4!QubB6 zB|WvKf2RSKQ~RJ5Cv_8WOX?xz@O$^}WpMA?w~u}?awG$#Y4Wj=S?%P3FAm$E9E!a} z!4k`&v2bs9|H3=hF3-bTe|`%R_3v8$>a^Cgz0U%p4ol!#l$8>*VuoYF>q4#~Ryh6(WpabRKj&*VAZF@By7SyA`v+;LgJG zd3*OmzIrd|i$kpbK*M$Xms*phw6yLNhMxRGb9&d-lJ0Lx_G3a>tn=E+e!%HTd9nVJ zE{5StCZv}yzr836>YyKk`Ja`ib)NAcGSjhf(3y|vK3(@!XARMeuDw5Yh#g)z|B8fM 
z-bDN^ZP%y5PdC1d|19G2TjH3V0AAqU8+H&QuaI%bFIlwR;UFep>!&eFULv#BJ00Ff zJ%le?XHFgUl>J1H@~4^U{TW@GEk#2=>~Ue|psEwoPxA{^-(q(i5B=Afl&5LvzpK_n zJX#hmqfE^8S@y)_d+>^TnI$)OjKlhBchkhzX4d;c&;00B;L85@L$1ge!Pa zpc*z!7CQK2tT;1#_1WLZ-<&$(_)idS(I^?~j%9kHP>Q`hb60#PRBjS&#C7O#w`i|E zPtVJf-px#Zh%cH$T!0Nsl2Dp^=5?LZR~ozy?fwik5%r(K4N)Jy%S9VXM94pF$+SK9 zW5eOwCko6LX7=nP5Nn$RK1@&-Pc+t`_QJM2Xy_;Q))-6ZB71}Z^F$kbg+X|t7> zmS|C7mW&lESpzQP_s<{R4>Z#jL0>rR+oc_~j(P&oytCre=8#^8Q?^r4QSrH_LH$fI~~V^l#r()>Yd*Y1o@Q4_w@oRmlu-|}63uvT=JQO0#N z>0q%qw%v;V2Fl{Cr>d%|jJVCCq~+zeoCbm_3%K!m&|v*zmm3&WNN3C z8o$_JQ`TB*U&BdKT|?x^E*4N2`imTDylU%t0J={eN6*eiK7JBSi|e+coeB`VPxBb$CwD)p-6 z)(zv1y|G6nmsOD(KNzU*0^}4_wbo&A%Dg|<+pDRmxx#-p!Uk?R#BM!MNwyN!P_=Rb z3aqakX?*Qv>KjMHF`I^YGdYIdITj~_bnw7p2ctu23fXIgHa1&PW!U_@(;XAKQ?MZL zUz_`m;1j#uEL*EzNTK^?-OQH7&*%MMHNOi$FE{VX82y4buKe`)O7~9`GJk)dyypo) z4CNU>ls~+5wL$^z8aL8C;>TV}=X4%gApAEd&ddty0v~K3S)T?Yn z@4KG`9rJML1=km$je~5gtl!!Q$=Ta9Yo?|r*2Vmr^qKS;P zAD`U0)uEu1eTe?lI(JgmOMf)W`yjph3%@O#zr^_Wo62d8rFu^hMaLsJy#)~vU>y5n zm+B6kRo;x__PYIUC_R>Dy4}m>8Pgx7x|T|4;`cL;^H~frc!KnPIlR+cL1>rHxEwsA zDC(f>C^fZ-y^x3qX6sn`{f+%Bl|ea&iT~2PyxlxFho*Nsg71dS>@d?|g|$x7;1V_u zw`^=zbqd5_G&jjP@|W5(UHIz=zyHb0KzkMU<@opN7r{OQXTk3n`gz<=x#TG#CS}F1 zehfpoHVyC%c@6PDI=jB7@Z_F+j(?p7_l>zZ(>BVcxyYn(#5dPAs64La{lV?{`r8PC zPV+vyh6VABjkd`Nkzp8NRp?MCFx8I;8Q$;zKj{5UwC?A4x8p#3cE&nm2M}Q)A$m$6 z!|xhV=ETY9M+3KlO4krK`6*_*n9J&Bq%A^|HzSZR!zFASZZV?SFw3fd!ZQ5gdRzM! 
z7JmQpq!V@SIv~e;zi_AGdbZZ9$9=fCPiJdpmf}FdJPU_J83yst?ThvObN-(ERow|hR0KPe~ zh;P6ik@C(}Cu0}}vFBD~O_tn}@goP!9&G!+#21(Xc1(lYTjp)sF22pNwbSSRB%Wuq z%VilV@AP?}`D0y=U=v0CoA;-lNeuGe{^jeD{}Io>Wyd|VG6%1eO_vC8N#HGItwtfk zJTa-ev6_aWd}?-gz?&wxr_?1i(MH<@S{-6XVKI*_=5Lbefozn3!{)@4_e#_C79WoA z%_ogh0Y(4Qkk`r|tN%vMn|cpea>I62`WET~uYKEczK9Batf~8BNc#34zY}9AJDJQ36O}%;lBzsu0nDJPFzU%uu8Izif%_!1w zDqI5V$)9ntZMzgSKdV=EZKVtO7ci+%n`jT)wDE6c7iA~eOB;x#=)4j%Cpdi16A%H} z(OatdbSEplR6@n}==B0&B9vap>f@B5NM?qw_bX!emqel%xa+eRhlYX^6A~`6gB(A1 zV7pUbtmtEV+5ck;j_kKCQvdisz3#_3;z;DSfnBxoC@2#cZaE*X?ayTR9MGU>?ZB|8 zEhhh)nEnF7J!565SM8bJuTL9hzu(B9+2nlmKUoAl`xy6J)fSbMeD&^A`k{o&FqILT zA;LU7US<3@{K-yQHErp}5!&LYs;a8Z_30XhH8OH;?+igR1;4SXc1CF)#nMd6OI^RZ z;$i)@mH2&eBNkY&*ZB@EJ6JoJPd3NJHk{|(rLbAv`zrOnx%J+0WjCgnO!pUqzA?t6 zww!Pu+qNx@&i=<+@*n~c#3UDjJyvqXWW3gHnC{{B`v05BGx5K8rCWF7I{S%bTCwXxW>^#@Z32ABV$}zGs49DJR zBUT>9Ut{0f7YDlcB)~h#o!eZ_F#r1~%qR75t}-%rr1Cac=wSVH;$_7}5VEjGF>I^P zqVJ|Xiq#LgQ4o|o-v8nHDhaK<>P}adHYY5_6dg0H>39}*m;+3k{99`1B9_dv_CnmZ z(q9v6DtlQKx;_bW{~w%bNFR`Oi5+hmKs+ygE*l5WZ{B(`4~$j+xaW6wzV>MKbl>-X ztsfxNeYIz@yZ=gA*+bm)^e>0DK1G`Nyb?VmRf0`ArnI$%df)9x}E^~#Q z`(OvW)hGty!D|gx*!}zW&s~!9d}+uzko;f&zhxxv5KVLdaZ_w1EB&|iSDLC%%L>zt zT7h*F8@p3>;H6(f`GeLSBX%NB&v5@+vh69PGA9H)K2LWG;!V7F#Z(kh@5b)@dL<`I zAn3Z<_iTw{|0cxm34URjoo^jiHU?3Jo`WOePSR}IpJH&HMhCzzcJ7dT9+UH)d;gdA109?L z+raenmMz)d%)~TzwhtD8A26%&?6W5dbf1TeOy`KS!vCf9y!g9Ihp+)eufqdZfg86R z<=q8}DtU-xkAE4GbedyBOM`^?PRrv6eUe4c>eV7!CRd|e-rQN6C|s22tzm0okPd}j=TQ; zyMA}i@$4DT3N!P*_dd_3p8IAgwC+#Z(k^R0LRDT_ra_s~4yxwAaLSQ?^gKb6D}d-f z>dyyHkAyN-fAd-JhDBTY1oNLBVQTbbea-8eUs)m2!66ja`2Y241KkaZ#fu+w+55hy zfLoH?n$$*+FO0?;0D?EsbF*Zr2_Y0F=HK=;pX&bmW&2QzdFv9bUyZM|ZlM&c`YR8k zJPN#>i2S`t`;Z*5U0<60!%Q+r{sRe6O|fDMXfY@qO}qWE zO(PBxf4sExbp6}J#0Ae#RB5dx{{_Fl%20(CYEfNpV`gA@GlLwGCT6_vArOjp00CRz zsYR8iK$1Lwf}&Z8lwqh=t3;n;po?lUA z^Ns80S>>H2<(TCSI1XDPA|oT^H8pEmOlRNk zhHg~7{`&v->EB;XF=FMSSxY7|{z5;_Cc?WIkp7YYOB+KXuKUJP5E0&*N!Gvp_aFP! 
zG_FvEXkI`($YSCkg~sPZX?A<7h)K$IV;re?yt@#j{)&#CRAx^2zwaI_?6EtA>in!G zi~t7{6A`^dj6N{_$(`je?+*4KI*_1AVdsT3<9!H)!@nwkuo*K z(u)!#gV+HtWYVj9NX>-Xs4Gnr1VnCJ8B%jcb*r4l&;3{bxxNYW7$%KXvjBP+3qBs+ zb}c8h6Gm(~P-w6hgrt%i1s)6!qY|$@sulg$a*T!j0yjD#XFv10_|Kb?ARl?V?R6fPnJbNP=SrrN?@h2;=~o})79Njo7$Gx!1YoIw}bUc)uh&~LFC)SX;< zg8zDnJcWs#j&AlEj)Cpk28y82AD*iC2g@{fu=9UiA5(18gd7#kc#@a0 z-NDkqhBU(f+0h)|n(E#fg*xkXb|&sf7X14=gA7B=}d;OxIP(%vOHef?qPllsv3oQ3EBG7R*tzj82t=~A`7 z56>3?su1l$!yA8A!OhZ!@^1bpxL#{J%=P%JuUj%l@iX`SBqdO6?_bCFTo{B$oP|9) z&Cagl9DjImw9H>*=O?H5`{fcCy6`r0d2Vj@<+yRb@Sk1#P>1tN;FY1Ch=1ZF;-(K>$^TaJ~426Up#R0Js;lOl$I-al`dfd^c+*4jiU-P z`qM8jJ!pGsSQUY3TjBvr8f4-fEhJW=R|>t<%ut1TPFRLus=XqGD1K>0aA=cl4sf z<^NTL#q5dA#GgRB0T$-3fQ~nm$6&LATn`Jd@$AqH^KSQLz=0$dZvdBrcbed+p(D!-r@%| zZhXRB=B7z^i3YIr%lnz~N>B^!_4GO_r3eH6Yem3u-{rte!WA*~Z-{8KId1<~Q&k%?T44gIBHkb+8Qg4?*CBEKfj(Q z7yi2(t$11wK*s>}a@n%-3QL*@yNEMeAtl@W5)t#J0bEEj&w{#RnOD&yOd z`*B=;@$DB>nx1|{3?WaL>+WvvU25_bKP6@Vk{|o0Dfzc|{DAOKlam#4A{_|(@uNe) z$;qvcSLmWRBlwuvnUgBs6=Z$*Fp2j6;>T1|CRhLlqIY{hJKs$9Ul)e@O+ zJOA}Fe)JfYNoq_2oDsu)SRW-Me@+9Ya8#+nMztpyBv=3P7bBptEAs^WJ9)Zt7mo|T zXKNC$m-6zrq%fsP=gLDxxnPl|r!bGZW7|vX?A*{{;sX=rF7Y3GUz7qboMhhm`Gt~4?BljOyE`2{tv zn6h7q^yVtfe0{nhO%C~#A`o|(B4nTc@T)RDS*COt>U@tAReXMiRynLJl*8L!k0_Ca zDCMZY8N?{uD)Wz?y!XHx<1~wXUTv)OGGIh0xBbUuN^HEpeYl^WP!>O$k;(YGa6RwU zzU&$QjV1`8^nw&iA6>|(;j`A4MB1H&<7ee+|(#72G89Wu`L3jPx+_HJJ0 zyKGGEee!%uka4Noo{sb??OqY|-c8x6$zB92V`_AXmZB*H;F&%DA0=a6J@mkUgocMK zphj%^cra^8Jyjp$Q;Zf7vVd5--zG+~U%dPYmSteOwB*FeX8O2ux$kpM1fANQg@r2z z%2VUyyt6GjO{V+j-kGq_upnNhH;192H+wE6>aZG|%`G^6F)ZLPyRteQDYNc%kybX6 z<}jHP^2B|&r`UCUY!*j%_qQHfcRedSLVu`x%D(lda%5w7y4!4eR1$`bKH;|bHekXV zr?lTW)bW zbx|v(==hrfXJNy;>?oeaU@a z&2Y&V@OOg2;aa=7Iq|CS@Q`M)j+h6mQH-p4686UT#*)1?o#!H~RI9A);?gw3f&a8l zS#9@?bJCreHqjyFO<&%VC!Pmqmd_2h->8{h?^6isCSsZ;`Lt~ z+@G%=k)$=-(f1lu%;46hYZvSW#=H_^`g0C-xOQ&NaTfNj;lTPl*$2=-Sas`@mEHEL&Z<;ymR`sjj+gcwly&q@hbL@ID*2S zp8wPIL~xhYaGMtv#wi_zwMI*ythRLXt}s*;$)`?^9h_gfdUBLovlHz8 
zVf|BKev&}lM1`!~+VBo-%A+!4@kBC(d%#_r)5j8rN4e^)_4!CLs?sPXMag_l*BpAE*(8!r|yC+F@hj-qb~48!b{ zKU?43giT1ffo#=bPA+vm?0=lpcOnf-*%%MIzOXO^&91VWjScRUNW;##lt2HkEPx=S z+TV9|eW_DyFM^OPZSY_VZUc9kYqxUs#%Y|I5N{=4kVyFUO59TXqv_G&gKy3=gpO`& z>Z45q4HMy57iT{-U8`^nmKR)Wat6=FVXi$bpLO5#yuCI~kMN#?kn_CsN7z1qPhPvT zSjihqs$ZQ;+KVdG$}(y9XKok5CmMoF;z#ly85&s~&b?asf}cL`xNwP>hTEw7hK{gL zB6wgWiOExgYv{%MYcrLpD=;!;@uG186u|AhYposV-Hpq>2u>zUH|A|KIEk3 zclCbmcLj2^jTlBC{|B|Gp`O}RtOYE#+d!s9SCSv|zwd0G_+=!B2$5f@c6QqFWP0-C zbNTRaV@Pz|pjl^}WE2tO!?j)1cpk^}^3k3jM&3H|FiJe`MPW`#9`R;7L_h}C(R*6sZ7aB?6^!TJ{f}uV17u}hRyFlhh)vs=7 ziGCJ;9YY#FVCQk%XUIPp$dJ~5Q(=Cithw`wD(*?Cd%dR1sM&r0rAIm4fp>M3F7f{l z9D92~Mz&v4T>Klu0PaPy>b@qVE!aRiKylb!Ju8w_d{n<+!xsz6qp4wl8kC!blf3WaagW9p3Olr z2WJr$OrG@DxrR=ZWa@L;iZvhe*oAM4k?T8+jJH);>$f18H3k++Jz!Q2bIrt(sFAYt zbrIq!tJoA2-$P@^k|bD=a&J!xbwq6Zj@<}!9l)LlY7k)w8JWa41f0_dxD%k5TyYd<)$q4TG zUFY>JiHapLFTB;`CEBVylfjTwdQ4|C>$-=FeUSOmONPJcpY_w_$<(Z3tpMOnn0Nuw zvlo+mrMh|ncK)dFFLJ`tVG17YGX2eRX!h*lWnpiT?mYFmO&RBAW`IJRp!NxxuFZEc zrXM+%9hN9gPt?NpDTxu96Soe!E$YAJ0svzY$MJ35qCNs&LBTt2y(~vX zwmm(~p`dB^isxhe@bK`LGu0cBQ0G0>avPy0UzTK9N~G0o0jisxwV+RyRoips>7sU* ztVYCXMS8LKf2S@}IUZpOXkdOx$m#=S5c>k|jXphl+4JYw?&k-3BoOzynrb~UMILC; zi%V4{(qy1bJ?IIMPtkT#PiE@itGCOqIsZm0PXdoMkLF=tMW~pLJ%+RdvesxX7#)vQ z=?rf8kzE6wOmiYX&hT~rYVY`S_S*&v?dMY$m!2DkC#1k5jACF+7~m8j$s93*jIz(4 zc$3`*&KTN3th{;I8e}=`u-#0ozv^|#*KEIb=@H2g9Qq%tRYQ>gN6zHee$=SS&XX;HIj`L^-_XKMq5l_m^!>B#Y?*ko~X2~@WTp? 
z_`cP}e|8_G?WQ(yGIcS^5V30Q-l-TRe4#)1$zZuF?%h1cxsw!r1 zUc8Tsz3R?T(cKwgt5MO?oZ&c*2#xrBpOCrg>j18Y%kY-V%*IqHmipAjYe6w|TLz$V?mMY243}2(D!|v93mCFM8Vh|b zW?y>iG1y-~Irfg3grl^y;N>^rTZ>$}?`zQ0e&xRX`QW!37q=v1)nXq0$xPO{OuBSX zy&h`35U-a@jz?1=&^V|q`5-KejB>e0Xas!Mv6lf9|0D_x))&@5jDfSU)H?l7dAyJ$ zgPfrq$i%()L$dF`erz|?sV?OO=9H1Hrm@{Zly-aQiA&7!h)HB-pwH-RQk+HPs75)a zvfeSj1FI}}x1r@|*PoN6x}03#`H%$q~bHD)Gq)VK>&+ga^P zbNWE*2Q5y())gPus(mvV%pYrq2C)NS**t42S8$ZP)mq4X`Qz~-6tSxvfR)so9Hx(z zP`#^Tp{4qHUQVQL@jUNdYq}8St3B9%ewd~+N*>1%Ax~7@*^)l<}`QbTLNU%vNM0VKg?vMKTI@Ai%yKv zF>U6!s_>Ka<)u5Zgu>1NgtROp)q4+ zO&8Bse{~qZ`bJFtbAWc`QwQ}#Eet=iH_#U!S;`#EduCJg7mGDhZB^JGr&WQvISQLO z9M0GD=gv($QIM)u1r^WHd3&}H*16$+rbgj5PvZEtKc|v)G23Im)Piy>88@YA_SI+oFWBj8wnAOHPKx|p-0+~p?(lN5uP}?_eK*GCOdsTSm%(*69Egpw(w7ss zg&XLyh@ac`&Eu0ywJ1-edR;KG>xSH5cXDm=?WKc2mj|=%i;-Ij3)@=mxL>k4r9bz+ zHoF0!sRYDOTUC)u1UH;!D?gtdqNlRwx25A)D8{N1UjP2;nfU?le)df$6i zQP1?=pq)2{e;5KDxVYr9kmy>TzkT~tkcZnNJ4#L&aN+`F$fFK_vv;w_YA-ueG*h0e zYF4sPakAWF7+PjO>GJWa<3^0#|O zEs#4VgNTuOKoKSEqL@PEOEPlNqN&C9_NyUc0K~ec6^`?1zDoZ@U^qJPUBI5I51B78 zgAq=N_9sk+YWLrE-G)9F|0;Pgi_mGGXnYv8O4?JFYhygXK<(vYQFGrrQzkI$YK1vm zr4~W3G$aqeCtO46kuXGAuvVQ|>oGuaH=>G`k=O}hfh?vPvd(=9ls~D--|_LLx5sdO zqee#Ng6ub>UWF{bRNA%UmvY^HVUZ7^GQSdX9GZl;(Itm3L_-T5-$pmn(Dd3gcuXzH zn`^3(kfg+w7;%LmV;brb-SFVrd^<-gDEU5YSb2-njZvzIO}SjIa;pXcv!2#+?1ha;@_J0 z{n2m2EIC#Q;nQoi^;ubMJHxkl^)fKK@VXgRx4WwP+GcjOb8*P+wbqAw=<|~ecge|* zx6C$PFP8dYsyM-_64#kSW`$_&35Jn#mlzrojW_pwyZ8q>ouvWF}!%n%4H*&A4s>_0{ zDiyHC-c(x?BdyG!dw^WS2NGUJ7|B_k4&gyDskgNcS>}#I*fYqiUv9uRHXL zAxFg83{l$PHUj*SvH~$T`#@^Ba5K+K+x=L&YvX(4JH@ag^et#&Kx?|miHgmJ`H;!} zZX_hDg;Ge+nygXNa#&#;-k%B9*%u;irdaNIV6Ht7wdNo${&Dp&U1&wQ>F3r+221wm z{cuaYrQQ@D#eBslrnxW8`x?^MnMjwH$MRyH50#lb?04RqxLAKir!rw1HOJ~?;dj+) z5IrdNJP7D;62X=;3@bg&ja5Q^?Id^Tu#v`(?N;E>;;qzRuSV5s-Vk*T#f*kqr^m=L z^fGr9&3&Z$CG_r&v}|-hD_UMBnT?FCtGVppp#KdQU@s>QMK6!eBwd-*AIql{l3=-N? 
zQJ_hbFb!KG=a6`Hp9EKbSufV=t+2nwJvT zaA#{1D^-`9KH=mvXE>^#v0XAUBeq?BGgRZt1WI0OuCu(E{d}=Li*DCFBnO|%kr8*J zkIzQbV1C#Hhm0Odp~pt&`(m$;0b#&0>)}X-bmVD;hCpKLeRAGMXm}0Sonl1ve&qQEC7d%r(oWuF7!E{iZ}*|4eZk~|a0#H^HsX+9S3b0!V!K?G1Z{TXq+3*qypn_Rw8X;_?0HdMpu#L&)HlX41J(YG$Iw z)O*3nNk4bjrt0VueHu_KMb)9NCN1vtp+e^Os;p;pcaiRA4~WOC{EbH6|CnK6|NUOQ z7HU|}b~627bTJIm6g@tCu;;>Tgn|4WNA6wD{HzWR?31<=6z)#KTr?u`!HY(n7BO92;RZ(Ghf3OeE z=odjx?4KN6o{ZJ*oGJB;VP`*vb#9gm2RHT7Zo$RNX@Jy5cKBKzFvmR{wH&d!`o5 zJdvU6wg6>k=8 z(HuVtB(in$ujP-5lW|>^I1ca~bI%CRwYpSgZV z=4le~lKO|9N{SVf7~Ep$X`W70cz9d?MVvMupO8%y+=mdZ&}_5o4YQCK6^UoS4H(=H zbe=?o>iVakL+%&rFUV-Ss&E+@M z3YXv+HsNKc%}TznH%f#7+JR@g6t#mKEQkV1)koO%xn0CYq()k+?cnfL7Vdm)SiY4+B>c>D4*E5r=ny&G7 z+*xTpJY)eHHwv-3PDPp%I_Y)}oft0?niwk<(HgCk2eR-j<$Cn{R{d*`hX(O3R;v5Q zhmUx{p=RR<@zxCKYgc!U6Wt>*fH%<<9kDuA%X=`%Tz{&(YyGOA>D8;*%I&%I6M3<3 z&&0@WH|t3~YL|qdk544p zWPtYDW+eSVFvjdt_Qlo7^88(7yTpg0M<^7HAVoQP$Y-=R0*w%(k2%|#zCIf%=$Q8&6fkgnJA(5CQ}EPby1LPDf9 zyUFHjqaO{9Gka^j9K5{oMjsDgeJGi*@<1}lxFht>l3Q8#CaXb44g;UAmj$-!+vM0A zX-~wlOJZcj=LpgN=^v_i(Pmz1d-Gu7c6fPtxghbL{Pci|FxfhWdynSBa;sltl$n`u zaY5MEt8P4yx?GT);*+a_j6U2U-*Cy!>Rd$In4UwS|YpTe4gqJ%W)^VVMgh= z_P{{WZzL4YlUJL%`NB*#s=MFC*IHBxt{YAA>0T~S+C=l8Ke2V%EnFS=vT7=BzlAL} z4%x?WOS8dF)X1LOd2P-9R%^;mjoY!E<9V1YL#w8`fRJ`~Fewc!xqbYa^UdU>1R(M$ zxL3#)$O-Zt{=jcd)b``y9!?%K-I9~l(`46qK`nLh_vxOOH!*uyO+YHtkLqgE!z|*< zHnF83iQG*TW`Tct)=IhEGI`D}*|`W%I>HqJJ4sFE8ad zzN0oD(sNCb!_?rbhqZ1PmD5uu;mcKCW3KGCgYXO!oR4m=22UdB=R8SRr52~@2|xOhDBIT*ZQaqTJxVSr)+pvE;B~Ct z=2sOZwjz3CAV%}Y;OS>EFMp3|EdXel$z$0*j}Sv=9g?4naWqv~EC!IUSjqiJ^APRf*dx0yvk&edwu5tL0>=(e`SNlCSM@$^>W`P_vR!7Q#UQ z6v;_gToG%pV_>^`tDDo9awD29tCeRnJ66RqGZ&>dQ5H4-ttWY2vxmEm;(8ewma79<-mq=Y+4#Lo2#%lF{Wf%BRa*mz%#;0ZQf4S6`OQZG$q$?8-;H0@vn8@>;KmJZwg_` ztgsbmAXYc^`T{Os5Ni--w}H@eN;T4w1_n|D%;-|oEZD^F*cL77&b27iD&>gOu(R5g zb37Y-xVho(YA|+}${8pRJ&zC)7(zSOt+bBmtO2Qbw7WA$jwg3-O`Wx*3syl zt|{>;;YZzLFvnYT0{ZrBHKkifEwGl4hEp@Ixz$=Enh{rAHnt~*mO%M%m#ZN3eZP$s zUB|}?+qkPH5YUYMP=^et9PfHvS>EI(QLLEp#k}+zA3I&EDAt`3b+qZB9*tWaHcK%< 
zM)vM!#La%y3odh()NDUpX~2p(UCw0z`aYENJC)M@K>nOXD><>1>1DUm{{Bw9YRJjE zr|(IYdk^orjhb0L3ld57AISUc40Gu%eFEi&0-^h!0+(9JNw?ZQ{K}POpB=rw0?aOF9VrChSC$Fz0WAu;Dt9VuBE6OC#ZFANx5kdQU z6zZAYZQ)zzChT5C|CA^lHwCpBzo;@sFMagTbJ~4b=~utA*b{nwcAPQusl~_%aBVA@ z(ejM7ptWHI^Sb4n=;#8s4am~zkX|`aZ6?9J$BfGZf#UTk{cRX^OK-mW+PYjG@2t~W zFp>HW-OK+}4#9YIK%L$jtY?lePu1Y?B(1z=qi(o|l3XyFqdN4Aeji}n zUQjMldM%+sWaIg9Y`F@wbZ9a4rC$4q&&L_kC1uJw6^w+kXv8|t(om_9lq@gGHfcG5 zmn=wsBuNmKaJjl%bdD{$KMN@IVe7tj%MZulwH&?kUSw9wE%A2n-cF^|DvmU>F?%GV zm0qeSZrN5k`J|lwDtMyMq;;^SLbY9`w6KkK>x~Dq`PcZ_^7NLXpIM1V(A_9&{M|pi zBCG(=G@ETU88Ax4!Aq9T^57O@qP|>cSq_8SW@DjPYV~~?r*-a_lP~z?%il#RL@+mz zn-h+JIIA-~A|L5WFdJ#B1->WM)ROkF_jtMb`)KR)Q;cXs77xqT`%5E_Jau%NeOYzl zh0lQGezMujIr;LmdYz?Kzw_%yj06|2NG7i~ZdPh$S3Xyw1+bZ;>lf@v82n&E+dZP1 zR)%e?nS)xG*u1VzN5~N@Nb#br;U25a*TV=Cl{RItm7bC){x-aD-5M*kSs3J95+Huc) zy|s2b*f=I}ZzD>Y()A(m0!qhJd&I?T7*h@6IC_C^K2uyY-^E-Xo_mC5UETi60=Hs$ zI2oQ$t?NS&$BrzmM?_W|jj(a=Ev7!8h#DtUL1QVl?CCU{PZ3b4)hi9pBp5VEO@A+J3eSHa2zawlTAg#esDx9WI8L~FG7OSuR zuZN%{Ad5nw*>iJ&P(#UFszLbv=P-`^&COlEH@|RcYEem(K{yQWnxY9&pNY$`K8tf( z$Oq>`c|lKtSeE9e(XYy}#`E6|4;5L7`rzHD;Kk%7;uAM2VKRf754bBP*AK^UqHT(- zE$W8fxgb>c8uQewzE1n`a%?UW^1L0_*+Gq0OIP0wMD3OeLYie|cUTd-ho{E@B+srn zEeEA8>)JM38%XBX#hhvqUo7#x7RAVGFZxPaalQX_0!(vwE7ru+XSehg=T)*t8|yYc#+>I^m)CV zS=)=wm4S%ti4 zGSIdX$B9JbYcws__ee_{_lGid#vvn!weRh)M;5#G625F@)Vy09M@p@^b1(xB8l{g| zJD(%2N^!PFejoL)Tbo^vGq*|~{V78R^se@HLZcwNA=9kME57FktE`SQwfnQ_6_e$8 z-R7*NPG9>#!#ZZ+e#deQmqQcoa#stUl(Ny(0N1 z$HENAM%OlJj}Qq$D*Cg*urfF^0tGHV$LL&Gl2!^N=$t%Q>3gnHtrJU?ySoa`IffUz zg+2NP)Huc!R8qV312Kui)SzMp6Xne*Qs#VZ;|)z$9SP2=z?A-m7d>%GdZ~F&BVieZO0mEb^Uh)BtRu`K~xZ87If+B<78ULGSEIx zur?PrahDx!balVy-v<#z)?8(h^4)K&-EE)sIh6Vo8h6bBp?aK(GH3|sEweiiXm-HuUENc?3$*9{6 z4T#~Nv@t}e>7RZ5_){{v-UnLG-s-46Y=5?1b**gO!3*Wn(~U;-Vt%#Va9wxS zRF?U8njR|S#^6jV9w_2{qJcjwAqTst#%{S(lliGeE8Beatwqw`#XPFl8V#nd`K6(M z5|IlFnFB-Xmo8xvdG6qdlb!t&nl~H!=aXGg7~zJie1(i7y!B|;(rgvcV0*s16lzdh z4K>ok$OQ_Q35#7{M99?o)vK`u~8Kmk`&r 
z@2vqT8+Neu`9Sf1~98ni!MN}c#;|KkaFvlZ20N4_Ic<&-{C_CM0$|I#gAo7l>+7_6VKmP zt2*RpJC^x6>?O3=U2pZe2yj68yc-uMK2#2N)xaHB8S6`0@5VfP~{4C4H;+(m@O$5jl z>ib983f`U%Xa_yD-3f7`@#@cbDw}2v5KXfx+TUowS#*YKYg zR|8?F`>ReVt}`mfd*V*rFyGEa`+*%lLVr6>T`hM^)FPG7m0{G?{Ogr$+i9Q3xmBX? zf%jI%9W#x#{G4_}_M~Hn@_Xg0FfpU5Kg~b0Nlx1-fu2I6Voe11Gz}E-)uqx-HnCsY z38eg*3f-~Mx_q{5n1R**`(nUt%8-FAUwNHE2qp%~CX2R&#ZV{uqkY@#-2j&64G@MCiT5{_>sJc4m~ z@1vdm%7Dlg$F=9L#gp+n=Yo0YDyKBTfr)c{w`9W=WO>A8jz=eF3&U^sOl2{-xPk10 z>G6p*GAUNyn*(nMcSzc8`jX>8jDeI1iu@V+>kCdM-Zdj$(oeO(4AO77C0p6E+Q{>x zN?_1&*YKPB`Dw@u_eNfcSE8d^jg5};P)E!*pqg+yZF{fv?`h(j+#g!f>w^pGkKy;U z;PF}KB1T45pxX@_8-s9zjl-s3&n@FlCRa}#9Xq=h30B7VA71?Z@`kX13Lg442f``` zM(ShmcwUJDI=Ibu^;hS=AhkRwtFG;sZbk(aLWQAyr4r2#$MuFKj>jBQ2@r>GN7sg) z7#f7t%b^$FCLy-;YyFucZ%4`Kl%OWxA*BF?L=j%Rg%&QWek2tFE?rEq$b+=#^Tx}) z!@9NNgA%*E$A(%6L$>lne@`ya=^ac~Nxv>;jj?#9s7X-hMe6jn>KC--8l*D($QJxnW{Y6)> z+j@P8L3neF-xZ{p^5vPYfN%!sgT7@LQ&($0ZYdZl^+GmokPAr+_26kqkZ?V}_oi|`bs-wd)|Hj+0KMrDds!d~7hfpMr3g>cC{$S~7xS1vt2$NKis=|)%KHH(UoOC`qX z62)Jpf5a*Hp4OjGb6zDkCK&=L!-QRsHq7iNiyYjHf-Ry!e5G1>0kpz4@_cpi9Ufav z+&iSUc_1HVG*n%oQ{OLMdvk7?$1X3=X&;K5t~vQG^lE0zni1)vF$Fa5H^7seo}h?2 zwt032Y)mu8yRiw|$|HxvLcSkxs2|k+a)k4Z*&Gl$%ZfcdRwad6sUQ=n0k2bsXQ?Zl zoo%&RwGI_!pv87}MRGlD-Dht!c~~tQWq#Apfl(an)Sol*)}OFAsOvStAdL#sOnm#A z*^!ZtTFI>glq$X)9U5r6k&KxQJm$KmoKD2Kw4i}e5VIZxeaLbGITJ7o@-$EHudJxg zNfqqN;ExGsa^}~xBSx7wr2pXj`{<;=7hJ{mc0BG!UQW22{jj^&L5fN0Vgi@&&*E~E zHIgyx^P}WcpW)$9ty-rc0n#jlL3{X=eZ3tVm^8>C7zB#Dvc#hfA|-2V&V>QKd6CZ# zk`(!72*mcAa|_{5#-7?_>Fo5=t3=KbnOxmkcCK7o&JN@xq+|lpfsVQen+yttvys3e zvaMXeVR{Ud6#W;8eA8E%^Nzs~UD!v-NY#mB;qx;qvs$Ofxf*o=*&BWAg3MY7$4v85 zwVj$BhU$u6SbffBMtKe<;gfk;jfW&xRbgZEM^bx}2Ie4zd zi-`>UI1pA^kZ3C|_3W*>%j#Yi21h6>^Bc6$N z>M8W#+}RRJdNjxJ?66U!*6PU<_`bNTa_srhN{WB>@wI#J(%lHJ5pImB#=>pWQ>Ul`%%Fks zy#AcntP__%CoDdN*sdV)M;E%v#^LKpV+%#C46-!+cz!mkk0AxsU3TN5m(Xlgo?*13 zN8W|H<;BKddu|*@ePQrum-fXoJ{{H-Gf_=BuXKLlC320Lr} zXGZd2tVa(uHSIFZh?KM5b3iQ2`6W(azJ=#L{LeY3*5C`Y)|5}5$6YC*+CVM?T<8S0TxrURT-__^bKy6>Im 
zr06BR$^lg;fAerIAZ%!Ec$s56z(qdvbxxA-UOd3Cy5WKv>fqs%_%qY#7u90Md=zA!_e9)cVk9el{2JY>orcyJFEEKgxWOCeKqB~wL4s4x;`v4!7$riWnfWJo zd&EKkikNmS@guOby(rmAKKnl>UEaH*dKw3whIu*qwg@=C;MS%K=0Gtp&v<5K6XB*7 z8Diys22EtX$fu!dOa)kZ(vZWEu2`Jt!8aU z#NRZK+X8J|1xC%pI>m~gW+$RB876`lj!yZwWW1Ff=PyOA5|)-lE?b?PH(V!V7Exj# zIPu*wkprCaw&L|Yzs}3Q;bm`XKig}_WbZimVC-& znV8ku+1}_Bt|4McpDPOf45U<*)%$Z7sv2OvEkGyJHFN}SOmSa}x)K-D7Q*0HjB0+| z!R`h7_HdN8#Ejc1!RR0%VMSMro=L)2tLu+0)nTG%$4%EnWQp1EjVaW9_BJSG`OOeI z8E|`Mz+u=o>M32ycVQSY3?o$M1q2ih?&P%`gB$$;=2f+ycuwGZv77(cG~4EANf7xf zv!UdZ%U=M9+pFU*_}(iqAZnr_16g`FxaucOwUvH#@>ER%ma=YwOKyQn!j*NFe7|dCwRQt0Qf^4c( z&|8>zm%55gHT#qW!6BQtzDzCl&^2&YYh0(e8Bo@dWOUiq%-om0zL&8jHG^FnpY&;T zZhu!`?9Nt|@8UkS5zke9RvDc!U5uIL2;%wi;i)l!t3T6SI||51As?Zj(OtUUmEa$` zMsW#d#}CW(^;&U+K1^%s5i?xDF%N`0Fnw2BgXhsMFQw%7q}Nj({qR#8>R!~!-$!rF zH1Uvc&DpM`?Z3yJmJ-{fAUs>?`i8*YDDn%in3U!JrlC=e8OD(2p9vs!pY7sJH+!3r zTDNw3=A1oVTRjaKS1$`qPT-|Hsp?ND2y3b6fHBfiIOm0FJpSilm=0Qit1M(bPUd;} zvu7mZDEu{cO_GdrCpYlMExbNjZ^zB0O#OlFIp10+z}$=T`sP}sZ9RoBB?p;7l3iY% zQ;Ua#ts){;r{9yyzj%tS8B(qz&YY^AJZOsOWPCs(yHu;pEU0h4L=Dnp)1nHnteHgq za|ZMV0%=8&Y8wlqRyBJV_osH^HVASir}l^2)f;d2IVam_tL^>GA7^J4-B}waQzmo8 zu(EO!1BVSDM22=$Z*ZJAvQ5-FfCQztiD>dbsfkKH-OM_Iz7lR&{^%x=XiOa2d)0;W zB72JON=mY$iSMsc*Xk|4O5eXqP1PGdk&bFVRe`wmjxIZS3Bzb>48%C9=SKK z*QR>Obt_p=EL_04{?zM__fDd6@2?5rnvT^cx)^j1>>8OoYh28FtX^lG`Bvun#}BkO zhjq8GvxOGAdUBxEhR9?gBEMNLt-)@7PD#VZHzV#sD~k^biMXb^A|OISJ`@2rLq1;k zh?xGgXL9=7J?Ng|Vrp=>IYLul+Fi5Z2lc0X9xA5sx`Y-(DO~)ei3A=_Q^Rl1B5G5( zEG|!ZA#YPsmgsd27ka)>DNr8guMxVuZ8R}vdLC&SFjDN(Vq}vG;c(~ES6HGu+UGz2 zKDGJIpD$STX-61d*dy_jmAffVe2lv)HMTp>PSf!XPt=HOojDnY>!1fTVHJ8Z%l-Z% z{kgQRkIr1W;|z5tM&oz~3S?($Ya`BkYQ=)H8bj_QwtFPrlov3w=W0QDRPK8H{qlj= z9<0~pZY|%Cf2XM#??fhNz;(~chU>!hW$v*Es^s$<9&+6f%1b@f3b3}fMi0ak-Z-+T zAG5kcOeV8>B&{ zyFpsIyZgTLedk;Eo^$S6^M`8Tvq=OFUZcWbao7#YHZh8P)n z9Crqm)b|S)<8upIfHPu|E*<|oPrrFhrwEIsLAxwnKN^Um>&ua=tu?!{yQ$IQO5QLs zs%X<`mKQGVciGEuoGwdRhjor>Nj+vszNXc#$WULBW1_1_O1?hf* z0Mdgd9^LGiPoDnmW9RjZ(3u_BfX9az1i;$6Uzz9)h}LU7)iwJa6gwrTBXsq 
zptJh((L$pX#*iLiR$>ZUjMUff3*@q7Ok?kcY=)_6g>9NFWV^U^Nk)BGG;CV@y+fOg*@6Ywuv&@^UV+f4& zT%Ri6AEdK>QJar<Ez-oKD|0VTiqV_`2HbX0>|@w45U%a zWu#zmmd3~CK1r+Zx`gm9nMzhEG@NBOUA4U6TNmp{k&AV1KS)l^8lM;hD(UZ#t}xfC zr42%*>e)F)()`;J0xN2V@Qii3G@HP38YYP-$R}3+9>4#oARm5{JU%yipa+9f5_iOy z>5Y?!eCGG5@V8RDlYNBHP7fK;Fw9mV@J{Mg>FuTYnLlF9?v=ez1f_Y&?2cg-T zKH<`mJ4T@weeulXeBKW!X^;0D+O6&}Y9ITu=q(wC%r|SyVa|?a)VKfuN$hitp;G95 zh8XBV7GV0QJD{AY%D8+eaJ$@__Eu2v71k`(TIa7$ZB~=7V>Nxla@vi;gJR)|=X}0f z@)q)yS;_{Q<1|p&hMAKBMAV!EWp;eL;1f28%?GRPKxGJX!^Zc^2^!;PB-5M%4_o9b zn?f6oK;&0FgPNR7ht~k%cnJ;kuf-I3rXk43*6Gwq)SdM42|^gzlgu;)isf#mXIgA) zE16R(2^S)#Ys{d$Fmi>@!MYTV`JX)L@uH6tmnWNCMmGYG1|?Td`{K-petaHVkZP!a zRH`x#uJm57_ggQPj+C^cvX{=ZB;QdUOvn2VR`Gd=$N#Q3A*utU~rvQt%Jr!^4^S^-1vBPjHuC5NfU^bymimPp#N$ zmt;UgO(6t2fOdVo?6_HAZC6oLzI}hA0M)2{6|#;jdo!FuqdZ+Rh0ufWqLbT|UCGag z?mJOec(y|js%3D~DR6q=vgH^fY355SScJrvEl|sTKqE1p3QXfG{F;F84fd!BYlxM( zCKK68{)S1TnH?^i?uiwE1j+7;3rDw9_HA2ek;=}>a^e`Ah{2sM5pz9|w%(bF!yfIu zdbpkiuQ0|`*`4hxw+SM`xb*xsF7dB;Fk7ol$K!>*7zxsVFSW<>AwvU6xqJEu&)z@z z+h<7FuPtHd@iMMY5F%~bD82Ub*?T<*P)BBPIsAJuf7|c9Ie&#V$(G7tgbXfGlITj+ zXD@cy&%!$0Fu&npb7fesTFw!uHG1&6Q%kCw-xj9G#d=I< z*_3$>{5VQ&bu-_Iar`L^s(tb58|{cXx4le!r!J$8{P9pKc(CEaNTH9ng-&+xU{Gn* z=15iX=4QSVr;=v8EJN7M0S{rFRgzNCz)8OQX{;cW(o(~TZBVC8XEMljwTrD$3SUYv zvBmXvG<(8wM``l@3pv>$4U@fo@o6ARu=JQG&({2>fY|MbDoZutK$c}$lfk{E^wtTk zO1ZU!pwT;#XQPCh`=xjMp;L_0<&bRscBR1=HC7TJGhCT8W28IE=yOuiSbCusodC^P zrzz0rpVb}0N_{7Q^!LK@)!1^ZqoN@+|W?yDl5jOOB^j+y;SmmMtgi=9z76BSTsN z0tZX^w1jA91Lu~5-XPKB*?f%k{wSe08x!mO6JBfLGELamr{nPwF+J_=C*_exxK-}S z9H^%T0W+($*r8^UuRLaG)ZN9n@Br=<2+K_YG{cWfQQj^{43otRa z(j>)SB@Z{bF1-GF>+tdbOO9^cJbYffT0D(ciAR3Gq->am(g2^q&gq!QYub{q}bC7DXOVQ?D1>)Zq*UlQ)nGF~BD^9>Ro+8(~ zPFC4n&vR3uBOzp->FFzMZ%INh3iNopJDq>LxM(8EWIGo=S68U=_y9}hkWKt({PHym zqu*a+Itz^az)p_Suc@dV?J_I!ZnsiUo2VG4ufXD2tblLOr0pkuPs)Uo7EZo zmR~MnhMe2@@F-#XNxMmdxSJS>IbRNvaJ;HqchVux5PY&??qKD)_Y2Wc(EMPTjM%j+ zPJ1qbICGKEe;Gkw15d#CL{Gu#hd&}L4+NXT$D;zpAMiK^bB7buRn$u0B)9fX&-rNEf>AtIrd+hrEKyFL`6e`<{aSc7qOmm!%;W0_#uw9%j*KJ$0 
z+}TSk*ld(TK+RA?&i!r$o{`9#A(SggCN226-9l3qW#8W0u;e3!Z(w+Cqgs17pYini zyM;*7crNRJseiVq@1sm7x3l8;T)l!aU?u8`N<+yOBrpgCRawZRG#UQRrP8-a}~kg z|2i9AVD*d_#YxDSg_>jP)jtK`js$%Bt<`d0W=5n5$dx{S4#tS0(dOo+!4!ETv-}W4 zt@Pc~Ri+O0?tZ)aWIY_+y0R@*;6BF*w2q=ywL2Y0M`L`lRwt0+K36}7_Z-d-d zXP;{a$rzU0gOPdXlYSry0+j-}-RO6{5=t}18w&QlEwvcRLXT*~C^mc!J*aQ5nw3n% zyWUyyjq^W$s5Z@QsVme>sNd=o?7WS9hkCi4B;AYHXW}2x^csqn>ulKla#h3NJ^9u; zF(-a$Es%vj0b1dsm109YLqt4K(1qd|auGGZ7u<-EBS1$CS^q)po?`?1>?vbs5CnkN zL5$PAkuNgSVstZ-wxJy?klTw~m0HW>ZEYW-={&bzv9KPBpE0#CjIQo(y2I**khHK5 z(icYLJkDf3`Q^$|G3Rl5)Uc0cP@1nEj_lJ9KW3^YNGXKtQ$@s2!Bnv*Z^hpnopp{( z7E^-uO@D=RwG}7^@zZO@*UOff23i092mue78U<)6ly2`g*LEmsZ1Om7V}sv;B_{(d z6(L?;XYW}CE{58_C{>!uDf@cBD64*RI*d~ln#Sx37C(Ew#5T`%`LTR;)qLh+KU8mQ zv3Q{S-FQ_L`C!_64S-$@78d0ZzCOz7Vgw~F_O>N=Zw#{Ls#B@7&DRu5D*N4yyqSpyREV@$?E7d4bJLRn-Q}rFyzN z-+Xkmh+Py<=$s0K-F%E7bb!eVVu_wQm?2+pl|Ptg1Mp;#e9l(`M$JECr-)JQn?nk7 zfAj{u$dEAdh5@;%Z$>M7>0mK;$NSd8W-}z2*BxR>Kie4RkI%@?fhLlJ|CU%E?FO>f%{7?AF96mp^DYgopIRuwudLvM4ZGj z3g9iYV$vF9Wckun`+zQ1D4s%9js#lo0&v+0?~L1(U9QP{jlVE187tIh5nwJi4@de? 
zuQv@zhmDfQD@s)#pBhK&HT(*q(T1^^d^tiz%|^SuC_KQQ3_M4;NMi}A|6DSl!SBH~ z2~w8qI(wV^o5!`lq|K-3R?Tm~naZ;ye>fcy!l11rgmF0X*$cs$Y|-m-S?DB*U%Rnv z*i-H5&0Acf@8WwKEuL&f2U<|X!Mi)3yHT;TPkpgxo2%W5&MsiR+?RM1KC%Pu50kCJ zFibl8@s;l3G=+el0pLz=mf=|GaajU^3gEc`0I(t%ULm$Qoz^N0Pc{MZBz@P96I$ELsE5wN>PbQm;9T#JV zZ?0dp+-z&Up=SHX(`A)XvKOub^ivJhd6Bh0e<~HKG3yhHv8!FTW3V|qcWw<;$hw#_C$Tkpf4`EBCW$U8N#RRhc@~>5@@xs}+Cw!H4^xwtke) zboh*AxV2##WS?{SpkD-ZoAQt$lT*DxsOEzQ3muTB#&`k&3xr^gL#gNE+1ch(H|H>h zdb<=o$SQIl!SVn9GrXeFu=>2h)uK#Rm}c&KvA3F7pcY^BuOiX;sz^K*#v7mux#A z86W_?l0vm0<8!&0hTcDdvazr5wq}9fy zti!f^01Od1_>`$SWV1^NJK?Xc;6{C&-I*=GLPXi23MTG)A?rfSBMKL^qO2xk70;T; zq$i}Gvz!nd#F{h6p zaJ*r2CJ@l+le>TU`h&%9CYrep(JXGUkh|;c+uoSL6YFg%1@f_0T_1_|*}Cahj@Oq< zh2!=1CMaFa$!wXU{)DZCe?V!aySv3S2UC);PWMzIwhTtg33*`;5?ipw{o%Q#LDX4f zzT7y6z{7`>G8iZN(q`yrwzEL?Q&hZ>s<%8jX`T;0lI0`v=23-s(8WR6=+}t1^xm-* z@BYr-o*)p1KC!SycYZ7W0^#Ei31rSYe605qj?vV;f7nD2pQCqcFSnwgFms;V9=KsI zBxH+9w7e4dj~16?V011_`TPiaaonE(nQ&b~an;hP4UMLjx=339_;KMQaERXGxMt$K zaAbOZ08{Pk7LyzSSNr5kaz#sx=jtFoegzdBxa|XbDzNyb2Nb)y;qHNc5+;7=xL?d>ioc ztL5<96cx_m{HqK*3+9{ZOsxC0K7DD(zuQ01E@DV@FfN|e{Y7x)4Mr7H*_$uaJaktuiKLhft$64@uv@d$%@#2j(T4DrY7}lvCJv(ND=fLKo8(70o+eDCOvh z3(FrHAG9y8NDq4QYtaW&i*duS%MW6x$IDi`mFrYM8nJ*DW|==ufa5aO7l&JEpFKjb zbbATOlgE~m%Pft0LB?!pmd-!XDEpRNXm&n9g%lbydV5=5*K?vg2xDs>$ciN_vAx{S z2EuDaAj?>g1l_Y|0|tXJ2fy0xFH&1TH-^XQ12@|%BI~C^(VZfjGB{N7nez?cg|j67 z)K&p6wegKF+v9O%$yuGgbXT^)vnwe>jG9yqhCRc(uij+LSSxKLousjJmPDj_}(V!{$q#~^cj zTxlLpDwQziPp`>lxZ{0k+woM5w@c=z9%~K|DF$4653}*FZz1a|ozB;84=0pymlrCp zH!F?|b=49f6YoWkmafO;Iu3t|^s5Y0bJVVGxQK#U9~q036oeIXQSow0&`0La7qi9s zs&l`GVRuIW-HslDlyVuw3%Ssl9udgyZ|3QsEB?L8Q0Ie^o2C+F>RG2hPmb3Vr0{)p z_EN#nXetS^{l1i?=NmlJ!;uyVG~{nLt=myKY(~y%EpsgmocrT*JIO`GyJ59Hz+EK~ zDz!@-rV(Um;atw4jnawl_f@;>0DV#EcfV8@F8e=M=nosPGLEof!7Sh#bU3y_YCz68 zX80_kS*Lik(kTcc<@1o=e1R6|vhn7qe>n}3FBx+7`?2Kd^8)v9LMkEc2KSh$y;+fI z+oinea$ULwoCa}KfT=nvvKkEojUrbDh)X)SJ_wDd+a&}db1u7$o3 zM(6XT^f)=o^jI2ye4@;#KDJ>VzdssqAmCF%=xVXlIsvG@$(zPTAEUO+Cli^K_5r6; z(BX3npmfDr`;W?};rO}#oN 
z^b*PUA!%0geZ{&zv78efnSz@JK8}#kuXgY-*W7dhAPg}Wk=}Uf(`7CWPnRhe`uYxf zeR19T$Um5DIr~+(K9Ake-eLJ@d2GQ|WH=L$jh2mn{6pOX>ccm`c=_S+@#>*)A`Sy$ zChMHF9QgWd!HhbsIX3{kw^z2L zvN)TWI#`ebh;5upKLkx+C{@#(Es-2TCGsi`?CsGPP5Rt;I8h;iQKyjqupKR!p)L zm5yl08TMT;8nL)cv2qOmY+-*w;dM{c*u|apm!O9SDxwETp6m4GZ93JjL20ICT39y! zQ|ZjQUjTYxtU2J&03rq7?X9f^n^ocjf>7BpU@wF{!)L_XpoL(>awCNOO*sXY`p0Lu zc=|Envl8%43fz{HwU;?aSPRkh%`F#=#3S+2D^^*XZmb=VjeH8-F)7h5E;gi+i(E1H z=#8fGW}FE6t!4qp^(TmosIy(3o{;#n?eiS7ulvE?w>@%(Rrp9VB!rS8ryJhXi*RHa zuh*AYje1_MNs2ElT&%J^Rx^;nkesDv`>2%%F)E0}s4de}jnw>7y?zih-zjh1D~ox% z`ruu(xk;^TB=NNq(U!rBs6`mEtEN`xdSSJof6cma)d{7#w1#miY|qnb{6|t5&kGq) z@@@UgODeptd&iq0+2Ub73-G1bs|XP(HkwftK|VZ^swWDTy-p*zA$B;on<#EgCIcB^ z+@Z5~t_MJPRT;9tssAjFMoB!7~M=2e)QYU+*t(z}_e4)3C$cxyfR&7=*(}`rc12jB+!pk9-E_ zW!g2@e?HAv+wQSPypsd%V3%H9xfSj1N=@4=D1z^qL18B*!we4VM*``MCTKz4r{PS) zB;iC5KSZ4Az^98WiOmj_gA4Tvj#87@lwYauyj3!v9_G zp)i~nHp-+R0$UXic1Tmf)2NAQNR6GfX3hg2aqC#4wQD(9C zW)To+f_Q5?8Dpm--`*~+aVMn=)-GpB0HkSq>i!nSApIK4I8T(TT0Ro>3Gb++iFG=+ za#DoZZYQE&D4(ZAFxp=Z@$ST{aOxeWo;Dt&U+px`p}0l%>AaqK(h){KOVT>EaZ5=v zTHfI2-yIavI|>`m?^h-UXUKfFB=?=syJ~j#U}~xSkDTbbm0?*f{;dllb}c2CMF|m( z)UK0V>&i-pcKD+ViMSsnD4o=qXVc-gq$bT=FFoD~1%VIfDlAoZ0N139SMSw733G%b zur6XP1k(ekf_&4>s;XSC13k(dVoo+OQO+oxMWKecK&T zc%!c{wh@9uA7RLdv~+hxjs{~D1A76ZQF8SCx1BT^FkYcTsO0-VX&-yCAOuly@ojL{ zi}Lx~c>J*ZidMr=2^A-uZVs~d>D1XLpNlJknl1xo?Lo!d9BoUKS2!t0hwXiSPb8DM zYio4X+mYnb6`=~*Y$E-8mJ#Xiz=ayhTPnLGZmhoPpLSgZNd0K+w|@ads!}SxU_(!~ zMb+*7o4;iZ;AVL$%EtoJP1KKW0G7)q2j0_h?W9|Zfdy1`zsy>04zrIy{Y^v*LbPm( zc&D0|fA4lvqwZ6KpfG=5>b5x~*aqT+0ou9!@#uJaV_W?ELh1BGVa?;7y1ouz>SV55 zpR^ddZq0L9;1icLOeY0-c(LK@p#BOEMjJn)ryHu0^nbynU15wG z1kx_rww6z3Uz@ASl_0{aa3&GZaUM&7t$M8Yj|=+x2FMf1&KI=Ce5deP4vMe81>npaLLcTv?{WTmqHHHn@m7EPc zK1t;x?Z87cx8!cKx@DnOTjD1J5d&hXW(KJBz3SFyBU>l$(YZ|$;fY&0Eif|Zjd|H{GI&j)^eO+rK^IWsG zO1+58KDwHtORLeFCmeurahu;<_6r_Wb(1=I{!)$37ArnKFEp>U7`w0t*4kyDDrkF` zERbGh+@og?awImJge~Fi?EhCPr+(`n6mi#Ne=zk%Uk(Hv*Z`3>Q``U3bO?_5 zaBmFk!t&YkLdg*9G5&MHP{On-fi$}YZYKFHjbe1F`8$Ec!C;d^XKuq|jy}kX4{ecVr 
z^Itp|00BwaESqPBAfL}4PKk+#jE1K2fH4;T>qk@*5c`PfbWbO&9Ly_(Z*SqOsqD|@ zB3~HAi#xg9Pth0g>G|c+dCIO^6OQvF5cO4faCog3w?N>T_)M?ARvz|5E z=!9UQdb*=Hx+(?C0DgDPmf7Ul9gkB6fM0iIA3OUK`T~VZm)7f29mc!Eaj4#mRjA}m znM9grqHInUwN21!!{WmUMF1P6aQPeQ*t|E4BmaS3LmtSy04GHn5PMP~#ImF>n@;~4 z8eMi?L&zE&psn&P19pb50qajwjF6S+Rs1I;w0bR9F>&E;1wX?;DUzO>= z2V+Ldj>{WB-#nW>-RuZUlIvURHLgpp@qGKP9NEZ7IBeegTNCE2WiN7!a2wG@pSG=* zg2IQN@8qU-XL3;N?QJeYaM)OAL)FBMhi;5#ScDL@LmdAwq-!*YAR$4FKXdL|rw6pR zn|n)#?I;*G1>Xb(jRJVoiL^TBb~_+nu28zp26kM3)H6<6J5}uan5($Zh;o?RX7}rw zkhwNCy)qLEtHa-6bVYD0Es2%p9_igz={VVes`T>WuCPptbT6{ALKDpI`p;#wTQr<~ z2li%S8O6}6?+wZS?u|S_=6nfkRpNCjErG~p_tPn#`$w>byCun}gEtyzu^&X>BDnougDG+#;SqCN323wd~8= z-A>fT$F&fL{661w-pr;*5}M5w6w7(-2_@Gq9hqcoInDtXX+YULc32$=mmoj(0z=~+Hc+)uHkQ=4&gkhMQQRnSz2hvgugmKK5; z8iHBdp&)FY2EES{?z4zUbXhtx#zVs69j&#Ep;e`Oj=cKU4zN1+Y9{9g(29Q^7midz zLEY(ZW{nolx}7Bo2ucYNPdqIiJ?%g^8X1Rm7CcL19e$Pd(l;tc=8xxd=u$`8Oq+nW zdUYNX{=ab)E9S@*86(vWS34rNL<9>NF98uX7sHs7AfPpJE$6ByR$1|xI_texrdRm& z%WP#oYeOG}`@K(qx>AjPsM18Yv8m+LH=X=~dMW^oizauGe1 zb(`_r^!Tx;UTvk}iz5SGo!z@O?k7gcKhUb*_)&{gyFT3rMzflfOwd*ECatW#c}*)n zQK8>x@HI?CkdN*=QD^p=4Zv9WY=yhSYSc^(O4AJliq+3Dd9T&q^lJ?Jv5 z)j;MnS>oaRa(`S5{C&%Y>&F&@c2ASNp+1Pc0MHFKHsK029JDC5f|yqh5#W)e@lHVY z-Hy4tR9WV0Dr#)2Ef0iz-AmFtIP&p?O zuyIQPqHRhy_SKO6bKB$fmo9p3$$r-Zwz)3~M7wc}<-CR-!|EOR`W7DSjke!!w^tHa4w9d&R2cB7YpSaejkzH z60piB*-Y&}xH8plYJB(RK?G@ikY2VkqF5h@l`z}vc>#(jTC6Lt7@_xkalq@oz9BQo zP&c2esuD;?#0tFknACKBt2RmV3yM-HbgWSkr!h1hE#Bx1iMAMMln>tDjjaF5>pIz+ z9ccB%Q9|?3d`bOzPWnK2j1CIX(9CPc4W)%&>428Dlm}-V?QM60#jg9&_Ky*RXFm4@ zBBdH*xAmt~u77ap38={4&3D#+?dnifP3ogr@krB?++7!opjI zKVOwe0cC8uMYA^M-sg}+85o13V;E;_Sl4 zUj!EBUQ?_ZNtKi+*eRIN_3!}me49ySjIss?-9c%5xmpA7xRu;(ZTo8v2MBi9Y*H-5 z`35nqlj_pcq`q5c{}6#H)otD1Kb{^N774gjlr=5@)d3z-@EF6Jt@P|~o|9I! 
zzG6hq;JFRn95>U^+a=zpi2UqcS(4Y2fS0(U!2#W?H!q*XMv-yXT`f|knID#;qIkVK zukI+h1y6{zo2E-#S9OSQID$+e>D{pB!wsNWNApX03Fcc+eS zR*SsJE{yDJ@^;VuNz$p1|8Fwvcwd4S9E|x5Yq|ILo9I_5etw0+zj)m*fE>iM>>c^U zoYtlNa-(MJRA~^>TCi0cz!tA%HMWF1{dRpF+wS8{N_836{g-B{o{)#+Z+c#MrGcAV z=?*&|z0SnVi>@U;KbxqRe7$m+bfH0}Fq8nc90QW*OH#HTPBq{9hs?(dDiRV}B24VH z%J%KT#cJx&n|$`vig^PxDyKU;M!p@{DdVVO?pOVCT3KUuk1Cl3p@AC}dD9>>KR2iNbqS_EV`%ccEZ;lMsWABlpt*7=!UA4-@Ou zQJ@AYSLbc$Fkq_!^+6J0=L60TRjHN)77@qGq%)hTFi|t<0c>tN%mCHL2PtPHbczE5 z^Me%wTG!!M>Wxau1IBcuCO~+r28cusW;sWaFW zFa-IMb%SJ{ySze)!k)ZqbfCz zCqT%sQH;)!cd-rLaeBsc7ktIrji;A_0uN8JddTmoi7&NZTTBH5r<#Yy61-D{0kb_# zyDD%jS0X!!2g(Ewy_mBO%_VaKV@f-JXk}aa-hFcR@&?!SPdIYy&4~e5K{t-QSOxZ$$CuVOpBnKl6uAo_fO9GVKhD2#_Du0a0nQgkzJ!UBLG` zx|I+tS)HAl{Wb{IW1;QJe|D-=s-en=8Pwt#NrHtset@S~i_ys<@3Xj8%%Avto8vQjRkiuo zF-_R8>iXx0upb|dElkf_y_L0vUy(?M)2gyl2bS@%PL<^*?M|Uw+9gN<7_gb~{(@+q z&slOBpHOY&Tw$u@7pE1b<4!EgHN@m?Du>**lGQ}x{hz1GJCjnAZ}BM`#2KseM8w9e zdWQyB9?oX%%CG`nU@fI`K@UCZ`x0b+zjM{zIOrEGE#|PLB~z=N%(KykkR||y=tJfTl>Pd(lLVxJ3=pW_Rt%K_ z^v9il`C29~FU)l6t!`ajs@2Q0GJ-3E2PO(i0k`zn>)L39CnbE~nsXbl0!i_dXAA+! 
z=v%zaQW*iaN>-X-t?Hy$xDmHQ6d+?w#+S{i9c;RFY`I z?+e&$ojk7Juee6Nx9ZsKTE2!z=mSKiGR z(9f8PvdjJcS3gt>vKbH>=HuI)ueTR!B8qFl7Sn<&a24cFy=`H8-wOVl)KPW+^hJ(G zE0}4Ns{2??Vm~;%3;W_J&CvGy$AZ1Cbd z7Z9GH11TGuf0-!TeRV(%tcIKETUCyn9H2jD9oe=iFZlY*u&j_J)=vMGoGu{@=HYkP z6r0IC_Z4s14+sh{Y<(yEGlm3pZEtgaVukG`d+ zNmc7V$k0=v-P~FNK!%!QE-7lm4-d(-XL~dy0mOevg(@q&`v(9^z#V-x9RasO=P=p~ z?`_5?B!M9;RF|?k4bdAm^B>Ooom-#MYr?$o^!@5y`EdHAc!*C}Oi=CAA|q$4;xk$V zJE>%*g;lUD|H%qY_BaqOF9npZ8erHhTit?M)6~l4gU2b$n2w1e;k5q6aFS3G|J{g@ znd6S*-dEpzDZCGQYxav**dd^XC>jp7$!cL7@p2I{2#^GjNnmN7BnF*b<^zK8u0&GV ze6vKl1)GtPX?Bwnsm*M0?O!Nryrwd*Abz#(4Oo&h}f(Y*jE?@!VQF0v>pUMZtvi}^Q)Mu-O5Xq-zfROtAl?>?WkDzgk#Uu#Z`)gyz z%$YpxPq&vOa_y|OYD`?t=N9^~`A_9Q;!u!Bli=?9iaBm~REU)re=MZ(_o0C}}i+#cw-lGKN5Yf3|Nn&P|Y_@Bpx|;0;1!HQASYnSUsy0(JSLTnY?& z+Dsqz=O9gT0$`;-gF*A395+6s=HuX@%BsuToUN!0ibmuBeP%72jgA2?aLKMpn7Q9w zJ^!aLZVB(hvoC=|YF{xMG^!(W-v69+;m1io$&v5gJ!R{1TB81(|MJ+0v@};0bk5D6 z#_)JSWG&z#ZO%<~I;2C--)3t3Nn>$uMj7%7u*Kr{f0KHLlG;U>V?ksjgO{=Anfzi$O)6CS=`Z+Lg~Kv2AC68^6!-XC%J zzRj8Y$N*58A;BNZZGnEjGOKwd^G=D0NgfuJ-g~Bgh1yRFTnEzP(TT3BCFYO49wE85 z7aIoxtfPyNe6zXHY!hdq*Xx77gb#jo<2;wbr!`VjI2>tkTzf}AbAF-~3oL%?f7^ww z(=3yQ{F+0O?2&}TG@W0(8rh>Ylg$?wTGzguPI^0>UA{cYGqZ#k(*6{SL}GC)>YJ>{Aua?$}k5>V1%4TkuS~RuzIq?TRsvP-~59~`}vW>u?&`N=ea{PoQ2er5Tx`dAZILY zeBjW(0x@B?8{mMHL3B}~dEz!;k)30qoSkg*wzTAo$+hcO>i;bn$WT#=*5QP^un;0sT z`wucmn**8f)R%Rs@q2P-;7ZIxHE-Id z_g8d5-P4#I2-!hJ2a@wGXz667?Cka<1qvei4r%9l*x8$XcJU@jm_BYG+gjBJw;7)p zRU>8sEv5NDbBtvdZR%a2w9l)=+A1 zlLQU{B>LWKy$#DUAYc$L_The5KuIlEpNYD!*0k)C;lD6IH4q0Mp0UNe2Ay6jR9apYwls_P6ib_ONC_MB(D&~o2e#La(QQmOcj;&^eD$yvuiCgjX zsrRe9-%_|)zOsi0o{Kw2cN0;Y=MksISMMXald=%I$LB;;Y$G;i-?h0#X&@jZ&JT4C zwg8hO23c(xp5w&>ZrjGA4-j|AyyJ3IEPRF-J5&?WluAdgjj;YGrYfH3~XpzB!n%|$=tU?+~wG~O;K-xK_XSB;bA zTfop;wy#$+)th6mjQDTU!~z2OzN!qY&VyT?zxkbuM9ygy_(AXY2ndTTx@r~6?t4LH zgLSCrR1YZ%nwj1=^1Iy5fd@W~z(XyXu$^RoWb$q#qhNPUZFLBOnM*sz+g;HXJqwR?YTTFr1}*bZw7icbAd3mvR0K_>R|zk8`{26NNPD zV>yy_(g%7lQfrOAME24XU4{YL$FV%+(D1zk#88k@^X*Zl;)e|~`}u{YBF1A5IUvzX 
zhr_toySc(ck`{c1M8ZCY2TtxjXRfyUah%4iu0FyEoq?!6-Pu{V`c(?Mm!6lg z2gCD-u(pb$n{&&X%X7;W1MnlGDCo?nlJ$|K@;8_D&V*=?i`swOGglIRr8Lq9&Cq{| zc??27+R3SGi&X$OzJI=;okUnPklJvy0!i3FR_>!rLB8wGxA6eG)#t>(IMSC|HR6)O z#xl}(LmWAtuEt~EzJ!QE^9-PQcyrtUp@r>Fg@qe(aahu&L^-=5{#qgdaYk-0m=#GiiqQk=8%YG*)ew7Sl(_5~!$NPBnem>Lq zZusSF)XiTXsPQ@bG6YpjrJ<3Lijk`K&4qwKd%i;%wu+A@3RF>?*~+J==iFWw1j@H- zID<{!fV(*1uue64i2Jn!4hNeb-BV+3NIf&^;0w3fV9;pP8Y4t;K9M`oN) z8SZM|@uP86F%R)%F%)p;rHfl(NtW_#uMj0JM1 z8)xPX4rLC!DbN4*r@{ZsA5(C_3~XN)PEGzRrqW*1kF$Of9*lABV8{3|z66BT^6pl< zk{BPa#jq?XrJ{w+0~2YDC3BDUe+f6k#S=inpQ4XuI?>}T4PUY}y*UzEurZWP&tq8a z-d!|}M634pE(CfjV42kaRmx_``L+}1N1~GvPC7n~u9(2Qf4GfrPY;woJ0MScLqc`8b=kn$m!bqA?!F)>>XH3zS;Kf%<##X|8$dji zRqs-v_l_ILLY0YSK(o{cL*-Mb=d~jrfGs42q#rcTuFBg$X+1aY>wRq5x&qqalE#4^ zpf>_z(!cU;S8eoPKa;IULW>4Pa_tsNVIL6{YIPkmnOVvvBGhJ&yezUgss zJWX3sQyD=!1`tK#OI}gzWt6A|YB?2p6If_xXBp5z2LZf-?Et01FlnUL|921n_3t0A zQ4lt*7aHSdh56*sl}e|0P|OXI2+z|#f1rhJ?7@o+(MI5 zi>EpU)D5bQrlJsNQ{hNX(DRLj)MIcAdhfQFvMlPG=u?0kxAycmi0;gB_j9>2GZBu8a)}mci8yMxw z#&(QDR12n$kCn2d@CVzXcqeNe+k+eLiU_LBTGjGpi(0R)=XmY2NGty!{PWe>69IGt z=MGNx1jHD+BRaR3^>M`SW~J3?wU*CYnLyeQasQ0Por;qS(z|g(d(OJ<0t10&+sYNWP5Ser*o}LaIC39i7bTB8DjR&&h%p^m_ z&IazvkUQB-;CqRd2!^u1<$P~#%h=BDygLR=xS2eHXPYg&Z(ruu9^)3bV_~6_daTmK z0+EhbL5tzwNJSJ$!cL$wQ?LbSbCWo5o3>KBz332Z(Z|BVZ843Mcy|qK=$2~J4T3{oESDg(YiRNuA!Jnac z=2L_Tkt8x;sW|M$6^W8sZ}0sZ0?Wd02MN2=h01yvy{t!0KLMyoaN{bm>+1jcT?&t? 
zx!S+&ts!^<(zJzZ4GoxJAomQtT(CEWdRPat5TP!P19a8NxqJn_`D28Dp2{P(?;jAP zK9@C~&4ltsP5MvUba|mhI=>SD?v!;$fflv~HD;(G!SefyA)q4e6T*e+dgQKRgV8K_ zPQ2Ka1We}pZ=oG{`H?7r-4UP@R-8-upEDNBz!|v;7eVb01Z}B!@3(=w*UoLKp?s*7 zjcjHg;UwRS%R!C)3q0m{6^vHnacR>ODuIpu8_49pyt=Vmcvb_XC&%G!PI7;$2<1N; zwoa!ffD}@#z_zC*unMmRrj`+Y?*ivkZn&tzM$8h((9vZPY^X5ogPK1wv%||`ul8pO znDb+tm68_sKE>nej3(}2O2&CNgWbBP&RB}CZ1DR&0ih&M#9LO+wt+4~;2mNQp>eY1 zM+N5PVLu~D<%oobbYlE#(54A2FLv`S%Ej)w+VFunDS<1R+iMhP4L^&3Ng$1h#SQ$P z=d`tG=st(G9ag%!7Ah@`MxqWTyvr950f9*vTdU{$ZY-65Z1V`Xf7{(cWb^`fKJ{=m zKi~Ap1o)vB*m|RQuFOWHu~RoUr^7UA**}c*B<+lipTT|hP;m=7NTbu)`2V5oz2m8V z|NrsmC?hMY7b>Hu%*x8BBPk;}l0Bj!BP22-hollJmA#S?8QJR~g^+!0j_m9*L-Kn( zNv~cx@89?K`KOd~p4W9f_P9T;=a0oiud96V+ZXCWu7q-pL+x({|o5#Nsq^Pjb?My3nhpMQ9y zt1gLS6OU^!$XD3w%@$1)DeiyiT=-MMY3 z^m=)ql80VfF33#KVti9=%NB{uX+#k;=nV&?!?w)eKI7>gN~is6szbG!Y|yi}_ZbA* z;?aE_72{Huw^<&uQ-Jhz=K^hB2w2rL&PKtWT$wZ=AY2srVpU+H?O~ z?Ey)LT0PWFR^6+iX6AP`n9*?0p6q2xCB8}ud*@?W zFlx^;H>na~5*7>dhX|rSfU`JJCm?QN?3(Cl1R$5cb$bY7ZF-s`fN~8PGnYS53}%=? 
zhc$n}+%iEs^8jF;pO0!JppPBBdIZ1FS?iYP7LHi4V@I#?Fr(s>o|Kxug{edZmva%f zD^8qHhPn1P%nW{iMYsRr$QC~3BfvN=zlwA1vs=-mE?YalVq<6y(L)DS6#li$E* zTuL=8wTj^EE>Iu7N8L@~TOEOnSY0s0Twj8M^3wMhfBhFEG@!Jb9T&VlBzg}$Sw&e{ zul&(H%sv$B-s>ES295S|@0js+4(&g21m@~rp9&N#W%Yi-Ym7Y)mCi5y;1dBraY0y1 zuqu>$JCZ0z-*wWxHq=qy?M*w<+Y~?RzAEE%asgmg+x_=8MgHjFRlUzUZ6m|!QvT>+ z5@Th6`ok-g2b|Kc%p!L~?T7oILXOK^tqxK%es^-8#ly#79UI|N8q?}`o{4=2xud%F}b0Lz?9e2KXUDj0LNEys06p@kx zbE1-9?4V^D1}MnatrqjFj>RIy+U>`aZ8gC&hUCg*mwz}p18Vu|!;!Z> zEF*9mTE5@AJ6cdql+RKQj8mYJkGTC9EO;5O!R}pxnRpQH9XQylQS1qqkv^XsBF@*pyK=dm>nkqDblXMcRi9 ze^*h~AUUQxzb0I$l(T$@Ullu<;C<~Q9qAe#OztU`flgQ!O!}}JKLk3b!N>n&PoDCC z@nf($Jz=mmL}Sr9uUfSHC!{IqyL=es*=*^h~jJY*q8W&=P>|5IoRaA_t2^ev}>Lnx9kvC?~Pbl$>BPG zrGZ+t=Mr*hQj+BUWSizhMzI@Fm6ew+GYh_gX=@7t%Exxz+w0gH%pi34@LU}3+WQvg zDd)aLx7-@b|E?Z4w0o!wN}4-elPqn zH8<8;b*KNq0iCD&s)lK(!WXT5w6d?8s2mXCH)cE5S9f8kWLQjV+$`n-{kFeD(N~bL z@%mEULzM38J+P0CyCCG(k0^PWtas4y+oc-YZT1sNFcg2&iFxuy34&ZUcU?+gejfY| z@KCR*;&}7l_q!kc`#Qlb?P4Tf$uh}rFzPo&Qk80^MTm&h9zrsWM{`OaN`Qh}cR#!;&s_q)N^JDera5JB| zE-wFwnZP$C>0?UN+6oJk`1W^Whxy&vMeiK{##sJP-sgOhwFm_erS5qUi3<&~UKvjA5?iFcyyNL)HyU*h~)3dV|?iMgO&)+5Q9FIq_ zZL)Ycq(x6y@Na^WYS0;w$(dYZ7La}h`k+0#RRAEXRTD(ja-t^EIzaH=xw|!H>VsAh zT{7wxyZs-&1pShl_~(hl-^HsnC&muc8jyHbb#qG7&xY%uawRe@oDQ>$ldBCJT|g@9ll&?Dgx{cMr?#JL15&PL4CjXs2(; zAQ1y0yzdl;kq~Kos(!Qy_i6I?X8@5sc&Y5Jz3`3eq)7U4lR%gqFQaNcMOzb&AST|9 z$(}PPI;+oJxO##!vrA4 zRJqwq*o`6W-b3&%&j#5pmiIq>omn@Vhr_({GJ1i}0{5SB*lW;Osx6+Qr9R zHH!+fDJ37UN4#GFUj&x0J&2{XtRYR6iMZH zg6N?_P*R5Q;1bKj{xy40u(R#Cd+y%F01C2;iM|$6&P~zL;7sI<6~D|548~bl*<^b$S&M_{TFeTw~FYd$n4m`-)PaAI&{)^+d*l{M6nn zG-!@1Bj7EaT%E}4^9>IY4xDW|`mAszEZOHlG-q#bBE?tNNS)04Fp*(iypJ9X?Pi@V8Aa-? 
zl9y1h?;Em+G5j3cr+`rJvlZ%|{3#6J6hkVG@BByc*paI@#YH59h_EhK2-}(pQ{1ls z#(K_J0*0PwRwKGxg&LgB39eNbA{e;UuOV=ajP702WF!?_B~8r_?Zy*=@gL8HCulXr z`8}6NKOE|A>T$`x-1lIVG7GV}lrUkYGT}cDE zwBypVD!Z|`U`+^iOrc@t_d;@d6X*-!QyLG@@{J-(5yQH8Y~d%@_9@Bal=Co2HefTY zx?_i-(-1W5(v9v#>={VJ?YW1?t$GaS_U)e@j?a>n{h;2G78Y5uaZP~dZE>@m&-pPW zi?lxfD`rnBFK<0c_LM&Ho=LelN($|m| zE-hS4#21Kn-}#|EHxS$y8x!;7$uoYl^Wrg{{hOmXcXt;Ch|d+BR+D-Bl%g0|Scbn& zRdz4)g!j*`h3Uf*0_k?AK`4*)Cv#q-yCF>q<9{zp%6yPiW8k>}+)k;JI|2qyQU^o;D$*Df*hAEQ74f> z{*iJ6%g0^*h68RETwV7`Bp9UB4~ikr?1ntJrjxEBn`X!y;a?4}h!IY)H6FWfrIWZZ z&wq=YqT$o8YstGw(&Z_MFMwQyAaHv1%mM%Z@!NsKZ z6=~khJbET}2-`N>Pj|RORr-+Tn*cN^_;rRBo^`)&AMtgn2+@w7f{IFnis%>Ux_V59 z&3I7UVi^4L`FiFjU)&jZL8r>RaY!M^P|KYu+blO=?4baedCrM=Gc4lnj! zvey<~l0MOS(5UxhjkKFV-qZ6us#Xj7Pl)+fcXw*w#x=P%@-oyFc}H_me*_Me+WI*z zJQV%o?PZsl%0Q+ zYV2e25`p8gKXiZ3;>L-$O&BJtTr=Bp^uIWxR&#J2SKkG>of9B>Q9({Rg4%2}|p1>CGiLb8d^UE%1oGqFmt6m)pj(rmXR}@@{PJ zj~9p~IpWr4=?a!gC-N}Y+JQ56Zrl-+obxSj8iu4@uGQJMaC2D`)yJdAPI)A*LwWOt z1?f2fybp|8GTfOpE!6Abpjo@9(dwL0>GYroRMZ0)=W9dcVP?tB_N2sdA}7z?k3wArRwb z^GptX176H*FQM?GjgDZ@-+3N|g!xi*-vMn4gU%&Vx5U)V!T@G?Wc9@SBw5wdcVoJ960sQ2v$opYbKyQ!-`Z2zkC{^_nboxR#z^Ir}< zcyZ&a2~QfY@hh(_!E^cX#|qlfMyf^*+a8<}klxvMxO_Y66V8polmEK%iyu)gIlB4% z5fLud&8`=kS1-=x4d1waRAY5@YISktOn1IUPh#Cl7waLB{UoF)aykyQ`*PPCW{H!fndsz4-O(j|{4mQ;Uxmj_IDGxJ-FaDr5z*gR?~Vna_QaP0fh~s|9WmF28~JeQpveOiZWqVIFYc>S?qdz zd~#2?PU)8aFzNR`>URKgLj!wF$V@MNfAjE7r(vo%^ zXnQx5FdO&vQ%0TNvo>u#D&8|ccDy_D*9JuNU<2(dS(2ovDtbCRUqy2Q@pUnQASDLV z$>Ca<)U0GxlU@(!%1otb_;7;$XFYhG%8%N;D-9o=REq}%diAV}ant!{|2{2Vh$BJk z4($ctjx(u;^T^!UGUAH~pF?BnekN$c-Y(r}QsL)F5I-EufCv*SHgYxVB9+5xSDml^ z)apuhj#qK}NM`x_lN5i5G*AnxN<}`}4(~Fi3(`kptZyE@_TS%cC;|Et?BuXR{F5w- zQPK2dU)^U>hA|yxhmkKPl?3cvRLEVdvECy&ctZswnWq3;T*4E~>>nQ?J{RW_zxJ`<4)mEnKmFJG7dT;kuk>T&bfzKbC0U&J z-r5Me+YU{vmu3FPKEgrq?5t`-41rh9Qr9%O;Q_uGIvXngpKsOg0~O%8)uan_Q3A3Z zGMgbzZS;OP{Bz;ZE0ORDqY1hKVAA;R^Lzhd-nwe_F~HkdQ+W?@p*h^JS3I5nu`KSn z*fw_>_uV0w@-Nw1e*h*Lp6AsT*8$(mpufZd-^a;~a{t!UKYl>phK+oiJ0*`I{{T9` 
za>xI|-@5VZ1oC9e%!2%dKj$ju_ZDSlW=f6DdU<(K;S8q|`+d$0N|0_Pk(F`bvQ|{;=$hr#(UjL!r?~P$D0QC}{^$#5Y1v_(AuExMf>>R5s}U{sQ+fLMJaIha`AgY=M-3(G4B#e9)rCI4u^ zb2S94iWI?a@ig4zC%R+u9q`+6GT7x=LAtq`?J_Alf0OB1Pq zuNFq?moGj=tcJORtB3Q%WN|YCu@e~S!U@aw&Nzc^3y33S1rYgee%Xh=vK-~{qLTx3LTj62#Gj$e9#CJg)q;RnC3$Q}6i zVkqkqqSUYit7oAcpB%6S>qN0{FTR??ey^LOgCR%x-%;|qk;2{u((JS{%#VfFTz3|T zT@XxYy(g&0O^nii1Y;tgyQz7&EYJ=j#x7G8e(NgZZ&c>tOfMo!!3}F%ry=@Lx*$z( zuDZvD={f2(;b7VJgnXj(sy_w4<*^rC5~ToQz0_nmkkC3huP04(qb57Sc08!7M3Mgy z%M#hB2rQoZHrdz77zww|Sju}igE2PF==72&E^Me2cyb0OJW3e&N4R3>$3sk|wm?*l zn!7Z4?y>~~-r~9i+go8--092)E+TuN_>ATyuKS6|37?z1P9nxWjy?; zwKe}B6^IQ}+)WOm6d-4(gN^S0e{b|Y84~-2OyPbDan&Sf%^j^hhjSVYk%ur{f=t9G zPs*IA{X=(wH@8_ha|@yfUc``H7Q8;CC@3hnr{MPDv{$$HUV^W`Cd3%T^c<4~CvWZ> zszz*~=Q_>&I*5LhE(bo^wW78gqFHcQWvvWMMD4bF$uroHdg(Ds9-w9L_b~l*_+n+r zJ>s^{lMG(RyG_Wz{A;6tFKCoHJX@t~XdiK5y{`r?t2{-dm`4^jv>-9aL^&GUC6h(e z{6--T#CfIW-2XGLIB66T6~XJ)aV`UKb%(zEI6}5f7N;hze#JJGU&y=0D(BOYM=Hbx zk^;W6zfEL60|zDv&YAp!=l3}kG(RuJH}T6f=O^La=IL>nbOF*669QPQ&GRiX5ihdMs;Z3*Cu&cz5X`iqhwi`an?jqMi`)*$an2We?_5 z{$KTJBVHz+rv{8ZL1I6D z&EsDxjMb!kMEOmHjfRo(`k+`k#94hvmQj30c0hx2NQXIDryPnR(}Co_*It>$wE;%=-Y5XZ;Wfhsvfv#q`76w-#1sS@42Lu6vpqsWDu9Ge#!UjjKuiXT0k8J* z6(>V&LF@4cl(&S}848PHUyz^ONy@n8dSzVNSyI$I*w}X3TaZ3CM_bjtguTnhvj1JK z_+{C)!+FLjfcj^7Aae;0jDdq;iqVF-miS12s+vJxoz+B*jdKAD(7~h`cUKD+Q$N-2 z3f+QL{GDXcM6`_9)lN-L?%6Hc)z{bewmMMgVvwC=1;Z}bNITX4f81&XBAvS&1GSSs zvPTW9nsJVqhix$StA3)CZ1*4Ln0?2TjKljZp6&kh-!rVCz~BvzO`6g7uowKzXnOAT zSs?cja|WW;Hp&J_;%o65k_XIhJIk>X>vZo6>I;`y;*mcz%CxZL750R2ak(`G)xO6N zum>s+xZec+RN07*-bOv#HYZllot9oG9jWMhk{B~VnZX}foMIF3*n*`5Y9IO_#zaDT z8O1}6c`23artOqG8iw+kW%>KR=v> zjfp3R%|Mj^F{pY{*4?N$`k*sG=N?Fx}wq4GhFr;l4Y-(%u=(^eExiz@Yw&&-#( zRg?AU{gnH8T)*=f%EKk5Xi7V8Bn~(C=F{@>bXyAvyIF)FSmNM3K&~>tSm4!5`CBc88^+|QKt?9m0lin_Mv6+ki z)(nQTsO+sE1|!r`o4|(@RcQY`U|90c zHUFhaNhZ*wsp=S*)ogI)Sy?mhTMRBss0!De{=2SBo~dkf+0xSb-AR{;QJ0Yo~PdYJXnHXi`;d zqEp{w`slxeKgB0}D1qf) zdp$n*Qn%S&_{$_3!h<6`bm?x&IA!nqvgg}T$W|=m;pnw&0t(;Y1Q=fsbPtr 
z5`JJ!#+PG(Td-mXYm&$Ilg*Ag>k+DY&*e_MIH!HA@PZj{#+FOnk^Eb9TF-n=e9b3Zm3aRIcmbX$j zrwU1pWWLq8Kg@3;v%>xl*$(fLD&&nPtc(iM!8On-ss3H1!pZ$dr{#hcgl3&QWqmPoAtSn3^_SlPKx>jYX z--w>=2p+d|$q^9p4oFH0h!}Mwl>@oIzT>yk)(Sb(^ZgXLkjtf^j*QT^=fa(CSce_y zNf67ePhVAWDmoC6=emvKaVFZJ_miY&@GY++)jx(^8bdJ~)ed4CdjvH!*RB0e*2G@hjwwP_iUt0A#_eG(jhZ68H%cASs38tv|Fnzz@5)Fj zq+3L`>&nsX;>FAt5390a*feJNX~_?{B&!h2_S|nDnAFS>lf{j`sR2SA1%U}^sOU`! z`X=DK+!LZ@a^v!9==AN%mH}!rOOS~JJQywEqpA6lo}o@zSML~V?9>{gcUNe^q^J4eOdt2^*J>MFTf%`#%WWBwe>xbWhN4-ZkF8 zPx*YO3te_S7njsWva*k?M_#h89)ET^@`9p!$t>yBVoOyc>m`E7(pgC5bHssmmzD>4Z#kg>V6URA7IUwI6H@L}4?_{|VF7R!erhZYBVcF+6ZZwAzsOx~9#1#_p z6A58~OwY(>&qYX_&lTJL9~-b{!(IsqJ9ZLYAEo`|dN{P(N?XS;VmWu) zSx8=l8p>lYW=9z=$y}eN1amMs$#FN(prD#!fcjAKbcaEWWZdR_>LdNfNgQD{cz8|> zE7O?@v4|o4exx4vxd|wz(n-y5cXpd~U1w1npMfko)$S6*rei&&JHVYe%ynsJa%!By zS}pvhFN8EI^aUl8?`fy(HMJ^U1e5h-Y$>c`N2Q|)pQA&_N}W8MY=y;9e>;n)cch+a zgr9-2J5utlR=a)4iP+H@D5xUvS+^5!mn@-%NoBIkRv<;KHwHd+cpsu`Oga*lb2FDg z>>N7Lg1QUC3ygKQVkv8S5777sYw+3I5R=b@+`;UHO{)|G7vF%f}+NgG&decUM*Q z#-_QxEf&8rA6_Y4y!D^O>1Wrh+m>t1mUE6Xo`d^t?2c*qRi|WKjSM(O`y<5Bh<6|K zT>@`Y34pFYMB>?PLYDbqk0aa-&8D|)B>oleggW+MMmwzHP4%khQGjibx2=Id+Tv61tR zjD9Z9RjmDoozGpuH^lZ|zly9?GDA8;QzCO>=)5r@sAyrAK2@mJ4ePI;V^^%(3rJ`1 zeD9xISTo$4FEmK~I&WTs^>9u$%ktXEA@}G1>nbUm~WUt-X*17CAs$xyvo5zNlH~(@mS)FpU zxvFQVk_;-?DgXXow(0^ z(m4Y)n|^U%yK4O>HxmBo>YmT+EDMR2#qIwkFq~S2IcQM8MN3e@goPs#*SoknV^#!0 zIOD=TLP<8qW%(#Q1v0%Fv!iF}>O%*0yWccZ4k3l1L{uF0ev`Qx*0bzyY&zWgKHrut zOSJcK`geh|ycqj5OHA`VlAIMjo4N@*SIcP~Tc@DG2i#f=69t-LW=5*osg(uE-#buq z5A}{A@n1kZ>vft4s;!e=NBDfJmJrlsp79_6R z(x1aMQ>pF{C^*;O3T+=P+pQf^Abu_`kyTZDm+~SzYpmjSE$AQ^fMXS}(h28B*=-Ca zKkMH@&2AIlZ9>~;=Xvp9xA*~UBbf<>fJs>mU#^7qmp3Hk7C+m+)cC|sasIYB({T1T zyV-NTgtnhQMT|Ism!lX@jASC8?8ci!l7+BoSId(OsqC!kHu zm>Cz}PQ3CfH330&jCIve@zOWC^e94*ss@RSbxmlx_c^5ei)};=?G6fiHPwIfHx{5i z{r&R*{E^YNLt8^;5*$JfqvYb%Y-C_x zcEs083uh-59*)y_`j^>SJLg$$%)s)Sm43rTtLo0VxL&_ic3Iqa7koWv6>CS^HUVXf zb*YI-Zz2Dyz!{z5nd=8;w%zJ2ETs8v`$yoqrcG!mvN)$|g6P7Xy}6o@1fQ44%3?}i 
zWnZ9gvFM3_s{X~gAJ%Hl?LnxUsd@PB!|um@+1vGm;7wr{>Neue-v$nAPw|PpjcwYfn#;OX4^g~stAaGW!j9U}(fVc~ zet!KgbIujIXD_cIf_~Oj1Q|?QMST3(>xQU5FW78%K9G`l7gdktlm!X6V|6R`(Tt}cQUB2kTl#mv& zl64W{JGb@tjsSHwE!-K^U5$K8^95X-I4I;iN2N~kkJ;rmDGx^5=;>5iit|689;XrJ zW0j3%hLfELZ?tkn755%U2((*f1aJ#A98@$)Rc6pv^={K}b}7Vbk9NQOqBol^J8h*K za^=nI*1=PMiyMfqE%V1^g2k$xOAq!c3u}hY(D871!M6u=hVvilx2E^-LiQwl{o;5{mve7nFa`By zTO=HfCgA!!d+*Ni+_h{6@Roo@?jS@G8IX^Ey4$?t@q&oQn};>a{@%~^B)urBmdh-j zmRyXc*iEF69d(Js7|?c?gU(@u?g?h5LkC4OTjsSq)u^v&YtUmuLRRKWHZ&SRmi(sE z?;on@4mjh+s}{7y1-!rhY&BF@BH+e>ucsIKcG+IQ2qmTTOq~fo^FR*61T94sj?p{9 zNQ2qW;v<4ms-AMJkTJITcDsp9d~&;fwml~8LMd6)TrrG6*7#Pr|qp zSQNQCwC=58PC6dm8Jr!N=~=B?U2G`KchS>x@)jAcQ>GcA_#%2ONt`8lsw=N&2WMCJfZw1D*61maz*mv*s&ydz z3XicDX5}TNA85a@!wc} zLpL6$l$G9JT)SdI-aBdDo)~P|B_tehZ+6-(_$JBQkcbek zzj=Xku!RLlkQ&^ka{t-Ip)5 zwY7`V^>IaN)_B@XMWLU!tco1=^}f11I9IwU@dVl^$Qw0=PH!K(dHC=&4HbxV)X)pw zgw`&C*ww46-AhZ0cUJ4=-j}}860pImzYTEMMc-Imu=wS3KCe@d$+QXczY4galJ%0{ zCadG9$1Lp&IHg*7Re*{j%W>uar-RyeQk~ntr+z|QtIKRkDU<*5RYiqV;h3$PLmbuv zQ2ZBq(_>?w_&hI2f=TZxuRxdcm}rlSzq3z!R06y@{WA2liZD7)c+A5fAn4ca$R%T< z9u=Lx)&;mcg36g?@pn)lZIiIs_ol1Stc#Gwv59pgq|0Q}MdEBsEo=^}%L%q0;{o9r zxK^-1OM_T_o9$EDKh|ya(QxbSuSXSgt-6F1+si)GEc>0wy}6%AKP%xJ$JELGTqV}! 
z^61g;%AtywR1w{+{?HZ8kh#QS>WZ0y{GsV(ZOsC+dFYtAR}lMVM^w*&EwnI=@oWjpx+!#^L%&i$ga}Rst3(Ej_NBUo@VcOUDBuM!LU-l7014?F5`Xu2CLP+f z(9uJPabQZc?-DvKcz~J9j_z+idUg6+#K76A~=v2FUi8zF; zXGZHb%O!cHlF>?ShFnirplf8dO`n2xasRjit?JK5)=3DaBtY!KJ_P?1k1LOjjkN;1 zTc{y2Io$C$XT;YzbkZ$!Jr;?$Q;QS z=$!Q8(IOd61VY<^c@kyFmLE3Hs>gD^yV;sC}?o3uj+GMF?gi_oxV_5;jWhdEvEBi=7AlWRWkVUG5&RKm` zWe5MHu+1JzA8XTEbx)@4Cf@a%&T}>ki{61z^E@&(S1W{AF(M#u0-%6C2@NrBoCj~J z-OyJ^HLVxoqKy5QSifVLgbN%~$=ZFplM9+SCB5fT!BK{SPqr#S=a`SA`LQjN z=~c{BIqylwb_Ol>nTREW0p~9L&`ZLbjz~iKfZ`F=7&TQ-vO@u+@m0TcaJ!w)C><--K0OGge7aU~(`Cj(Q%F1^EI zWolIql;AcuRI>?D_YfyiUoK{onKw@OVy3~ax2GEyy1euaMw`5&k;JqmG94c z%MFy-?tU8<%fbE?<0AShqHlzs%e>ycJvGp`;Eda4OMx(b1It{^rD_Jeazf0T8s?9#Nd%Q@SY=mpZMscxyE`o8G%+j9Q#L&**7 zTkGa!mL8UQV9f-LOx{6SW@RU92syiOb{3FU<~UfsE(AK3OTUu@w#;kd=t^^QM&|b5 zk_7ZhB#D^NPZ~yD?}aiR%-?C1m`rv~Nv_Efh}|J3_^TgFTYN)TNp#zOASHtHe==)| z4Vx-T!!56Q||ss=fLohZ-mf^I9JT4=mvUx}g>6 z8hVWxZ(+O8r0p{1;S6a&>fw~w*taJ%Hv5goaV5D^J?Tty+1V-l*YUXNl;pA=7o-aG z>Le&FG^Bi9?QKHM6{-k59XhxfJ`aeNm?9UwV_of3eU=x;ostY+8L;4eG7nP=DPvN5 zsjW(;Pfs*>DqDPDn@blTxoVA{5yNd8*H`{+gns{U-@s`6efoL?fZOkwWH=tqZV(w| zn2%G3D&HIeD(ZKuc@wa0u6*Tu6DX{LW-QW*{M)dZQ9fGvIdS5rBl*-F%U!qB4pB{C zRb8EZeFi$-dls_m4kfhC@1|Q_>fD9w<@-V`vY2T+#QJ|oM6O?y|RDKYHqPnWT z#?Y&(+iFUk-$Va^aMQkw^)!UEB;C(EoafQwq?S8IeVxBtZAnt2t$Kv>CpI%fmxOM} zfeDOjgsd!WDK?KN!$O?eKP03!(ASr53{>=RCd9EQHa*)6-mf=B6miU6ZXQ(+=4}oC zuBF8`!7$N>ntN%f5=FUZSQX+-UOeZ#8D_ZboW~VPFdu(!@?&y7_ z@;!+T&K7)Ok`9w0i3`O}LQnS}>?ec1y&e`mzuvdk=xC7lgq>;WQtbe^{@-!SXIAVJ z>t;Pef=g0_&jWfZ@ZaR2jX$13kGQ8Fuho_qMM&C9J9gsxQ5Wl?-ShN#WmoZ}$W4#o z(8Ss-asV(y@D%oT1*=tC3ofpmMT4G$E){}H6z-onstP2mFP!?W!Q>ojh@w{i z)r9@SYN0Jq?)D@I zPxp78@!C`bg8>r*jw#L{ofRF7P(iu{C$ft% z)M*)Mz5;ED+#rb^`>CcEH-Oh}jvu?Hr}-0IcQ7ICM82?J*S)n2yRHASY-D;gd7B?f)ueKs5yt?Q`=0P zE!ZK7M=sOXn9KF1K&p55yi>K0KI}3buUZBg&g2;?xQ3O2tO>da7u=7JLCiWXrQy2S zx~b6Zayi-!g+wT%Z+#7LW zAuFSiIZo9d-qEoWAF7Zs;7K;8v`y@XFlfUv7c!eREH%&@_;%{T+~(9zL>>toV^UJ7 zJ#MSmD7fzSaDMH8S%GAOlg<*8s=~4(quk#)|JqbT!Tfw#0vi^RF27>2AtFXAv?Rgj 
zL3>$gCyYbBwjEC4wBdXkXTX)9#c;bIo(XT^u&S=@s+UqQ*e!A-ck;!_Iy{l7JC3W^ z!kkgdH!1hVlfhoW$m6p>LPEz(k*$HYc5eOHoS&$912Yt;vHw7M-2@fZIs)lse4X?z zL>#{haS5RohcvuxHp>spP-`0D%RcZw5+6PJGRjazY6LPy#F|#QEaaNQSWjlp^VV0U z9Tf0j9+a))ASu$*mFMYRul)M3vko4P&&j@bZ#d(U7L7yckZVQE8$Gqnj75bU5T4)j zJvsS_P=5Qky_|h`O}rbhVaX*5upW&J_veQ^v!&Z@)2^12A?_GgtbJ&6&}$k|@^NaU zrKM0!D9)a0v<&atf`@=PM3c9m{-rDqFE<1S|79l?v%M)d!yGsshm_>zj9xV^RM*qv zr1IGP0~v-jgXu2k=$)_;B)fQ}6$7KC^6%BGn>q_u9kugf87e7>*l~1JJL#B11a&8i zix*3 zGRrOpR{^SlaBwHA+^ubW?P5q3ZDUeOgm zu;(G_fIYW$ZpS|-AB7?2Th@9ke|sJzLX06KiYMMgcmaSuRiO(oN-4P(Hkvm^8G^QP zz1II{^F~n}+`^S!r&#DMI$0otZOCze+V1@P@#oKTI1>UL-ru*dcs;IpYcqmIM8&Al z+t_4m%FX-AS9ik66*JkTSm5tGVb)j3v=SXKuFL4&W|`)A%w3=Du(@*+Oknh$jyxEeX_V;eGtp;j20rpAW>qlV#gQW zU~P=`PJxPAdC+RXN81iUMhKYN4-;334MgT`2ySLq@LN}SGo+u%cohp1|HTaLu0pc=am_0aHgHFHLKA$xkx z=pvpEv#-B?Y9XfP6+Y>jJt?Z=5Z%NH#xn-^<829XKj;Lf<@ogq01m#Is4Ko%j`@+j z98A#Cs{rg(jP=w97|Wb+J;Z12g0UXF!F!$HApP9_b-*U&cSm!Rt?i@d*(3C*FaJ{; zI=nKjdXyJ!L;x|IVGh4p$aUgFdbEo_L3D7(S_}T~)s6N*?}W=lb@7uR*U|-kkY?lF zxzyt6dxcQK8n;Pu@J}pdn>#eADi;sbFulx{$uBDd%@xaQL~?lr?I_zBoL3?Xp6|nj zC0Y3+G$0N+?@+%BM3PBKmy^DU;qHgTZd8SKslOrLxEJ8x6Jj-~81wG8b#|C%sae8h zt4Y2^H4K39%v&~lV*ny4!)V8YX_kR2F{LHCQA;l3Eu z=BVy%Uw|V2`=9nYFgp2{X+U2ln-=8CchtT-MmA^u!Fme6#t2Hj6Lw$f8^XiGXXMRQ z7KdAN*-ir*e3WUV_%aa>`Apzf^;*m!LsW2tA{&rsD<@Z)~9O(0sp#@EYRP z&SgeT!iy-w7N6StIql9ICX`YdzMBtZN&nG>W2ofaT=tT~-6+Eq84L*t=?Kz!#%I`^ z`1q@^@PA!cFk#r&`QW&^C=t>_Rt>NKn1*fQdk*~e2YyTA$ot(MLrsM5rP6aEH7j=n zmv;}q&AHnyU3EHk-M+wm6hXc!7d6kU4JS1S6{)O+0#nYNY5o?L4b~cDM3z=q<9gDZ z(I@?vjkI8o0hkX(E10npdKu@pB%6&u9jB!BsKqJ~DZ?TBZIl5wZFV`t04STW9J(}$ znv0lP7gceP>xp5c3o#sOd(1#EjoD?md7lVX$y{b%YnFtuSg?y&XM}Nh= z(J;F5E+wa|L0J|ju@prK(HT;(Nw^5)&vV=bcV!cyKF3|enMTtHvbPk=KqX^Jw(}5- z3F%cd=~{<+IHzRAW{GS3eHjS=p%AA__^qA_noz+~E3?n_Zl=8e z;|A^XA^edL1L0Cc9-N zcL1%^3HGO@ue$uo-8wt0ahumhy+m#_$QIS2WIzV@4(&s89LEm8^&)2KR?g79nG(AH zaFej0$agU8a_CJ{?9qwqa7C_U3bg8m`*P+}sGkc72;7n*cVIEF7;WtaT%mo34Uttx zncCc5v9wG{NM+uyu@Zm7FVw0R)$R#!VF)VV8U=_`#rY;4fm%xsW{B8tC6dwyt@)?R 
zb&4_r!iN>E@U9_Hm=Oeu2M=%I2PmGj6m;W4&(OH(TPQm+@P+@$x3w)ry(NuF$ph?m z148ycWek`uDo4boqE3@Pa>*b5{Qr^m)=^cqUH7OU5)#tgh#)B-BHba~(x^zMbeD7r z(nxoMG)M>{0wU7gottjh=i1=&e!ufRerKFB&VRaLaL2V`uDRw~8yAf8)<&l5HCtbrpek!Ul-d zMJ-4#fw6|>6aCGjpNPyICL*=N1Vl&g_R43rIbJCCb+o&>4abawdSalYdzQz*$NWC~fPh5s&* zDSyMzavceCK7o@LXmEnjHt_C`#6grZ`di9Hu)BW-PwYS(SoppHKT zx)NoQDP+EW%Hrr!60KJ?cw7)$PA@x7_X7qbUyH2JpEY*yT zu8qQ`OL5sys~+dqf)3`^_b4R_;id$q!oN)9Pk`+_1Wu!XPTPqDgIPf^MOkJuEG_>p zm$W(m`F?&V5y{r_PfvVmk@Sx>1(1PsV{HA6={H!vL$_)SS-I+L?`BR(n9ON7{znUs0uxzm!|8za-a5MQt7<1CD zURhV?ytTjFLxAY@Y@^Pzg<|!7{KN8|1eAllMgO`Dteb=oMwNqm zHskOo#p0hx!;}F&8wi*=928!b=D2@Z!ULA|{Q-F|+f@YY zg1h;F!RY8`>4OsASOuIyT>4Dirp5c~V$vNyAeQKyVyXU1EHDAX6M9qI_~0xA3&A{# zJb3b;b82uF9DmEwuu)e%_Q?epSF&w19C2T9Tf2+@%F_#?K>K6tCov*2>#YC{1+W89rA!LL7=UD;#f7ahp*90TC9z(``~U6#;AK(4 zXR+Xu^9_+4nav8o8sF)<_dw1rnk^_%SA-0b9Y{0{IhMKpn=DNNbu58XU??A@<-guP z!ck%6uaw6SBu&6p*mm0dEAYTfb{QODtTFFu)V-q>G_$%rU__2mKVgK-O>eouZl?37 zhL7rR?Thr=S(UuR$E>p-!XM{zg%<S*W{CzzdpGkdv|o43)#W zL-AtDM?N(ON1&MWALF|14dU{Q2>;vm5s+iRJod{{NO9KcGRfz?dq+lUjsjqS0^i3VzqpI4gUN;|epRr6WEkjU)T{+;zki{43xf?Ze+f7x;1*>H zm;K+nhK7cAvgkLvQxyz9e#YaI3HK$lQ@mE_9xYgi=j3qlOn!7=zm#V&V@XkLzx3sJ}-3^?yGHkWm1^ zf=qrFDk=fK9`xKmd@wfjePx2Ay< z3tMXgUr5!7f{Q+f#~<;`#|1(PR zQVCHK!J?5?EdM%(+rnx-KyCk$#|#UsrFLe#zi`pU_H4y^DQ zPu%!VKlgwBf!PL((OPXi>q^0C&>Jh#QuMITYzvtv4jUJhLVQEltU4BL&!(b@QQ)}R|uJqKLqt{ z+;RTDTjJhAMfU*xzHS>b8N$(iX$9F<#~*N^p_To&7hQt;PmN-`{a8S?*fbm<1uaPc z!F(7Hh_U{m=!a)viSlGT0w z59hVWps%CD=XkcijA~jUUtMwQU@SF91$3}=iRKp|4jiYd0~7u)SHGDQUVvn!>s|6I zQ{66J5+B0(f3)xcn1!1jF&>%S*jIc*;Q1o9iKt>3SR04y0V6~8w{#DTK}_3L8vZwS zu7Ux{>tA+iJ zY*@~C@d2IfaZO!A8&iCy3wv(c2MqPhZeOf7EJA56(vqGiV>t z4@RDwO-ze74#fb00H8Fc{!dEda^K7OMA;^4yfKsQsLBBbgbC82$uAJ+wJCRq;tJrB)xKK) zEi3^J0-%r?R4(@se3PqKEYXKMfl3tA6_cr|f6){AfWdp#YY$-CH=AT{*2`8L0@+TS zJV+`&Kkj*6lR?nDM!Xo@hc|iVNm`C&+85r$2zFDyR3-N@1Ii)C_C>Gb`1uDjS7N8r zG8D)y2Nw|3Nd4=c-Jcw3b;_-yGsm>^n-O20O}Wa@U51akH@6H9SAAzf$2?Vld?9T< z_y&DuOFP+e9JgEe(p~QRe#L(#qXtI6$Hvic{^A> 
z1ahHdVk;jA+n{Y)pBDC)1%R*gfFU15Gg*V7!hMTyS#Ke`dzrFaOn=BPL+fvHtYDZQ zrdb5*WUJOAXoS+E;L}dO=?5xY9m&`2aOm?rEk3BUhHmdh&K_^R9P#Yj({7Kv`pxxp zpe++Qn-Xj@zCM~s-a|}V6}T?gHNJY_KMQTAYs+dmnc1b7bvY|k)SF5AT)XLUz%_&0 zygn8mQ06+FVRPk>7UdKoNcUR{#%VS|*U;xrXB+FQ1@LbLQQfcym8r>o2P6@`oLf(B zF{sj`Ckj3_NJoID;ypZ{{F4}6lu1JWQpEV6AngCyj+oqW-~E=~p7{c}V%WM@nG>Mp zvTwd-HYoZ-{Y2)xaRAQt%E%`#?acW!NB_EMckVdNU-0~63-Z;;)v%g@QAWzvK||ST z=C1zCVd1XI33Pz=m0%ca`dyX_1&Wt@;dI5NC9eFhA|O)m7rS?NMI z;q$)yCiv1H_*eqkV7ZMJrCs0Ripcnq7 zVb zit|%}O~49SKe0U?X%al`tql-=T_Rnpji1U3)sBSR+Y~sUWyKiPJy~di>S$;nQ$SDi z+Vnk^#&?$j1@NyIL=^%bTz3r4-=(-TIi1sYe|mYQFTh_rV~nbI?%Rjz)ND%YK3Y3# z@j$iw?&oIcV(pcy^96K|7w@Oz5$^cx5ydr<`a`2-0vTz{SSmP>ttGHWv1jY&jesOszWX!`>p~Sj@ zDHqTuZ;?*FNe0d+KV@L=&RWl{>-k@@)CwpSxvcxoBI)wEyi{ngDSx|z^s3L#R|!I3jfylm;|<<`+qw zBC9KIc1}`cT$3yK%XdJATtc?EYMe#2-oCaqm7l#LD?>fG-1SnMIJG2t?xsC~lzJ-V z3{CTT_yuy80t0f@8Yl{BQtt@xq2O2HzVV^B{$cm}D#A;} zP0G4uy5%z18WnoPb=`~0JuLf*1s<%Hji;z*WKun~b_ePqstb6Bsa1mOcnGp&%Lb<% z+Y{s)_?<8Q3dr1zz-%Cd*~1L4F0Zm73D2h2sr#Y`EHlQt9#Jg{?2^HyzhN(ebBG%Z zL4X5;vy=OA|I^}1FK_**;9&vLi-Vqg?JVP2)rW>2?K(*6X*-ktvsPvhP#AL7FNZ-b zrS>8jlsAu1#9Z!*{4?-|43v22)*U7T0k!y`xX1gE%6F`Ay{i8}@Zxxu#baSx-%ZAO zm{B0_&?Sjo7$0cID7>D$wH9^o+xbTe!1HtFL+zDROH3`9`ufp6i9Rv;Rrp6W`SFl? zSlF9yY=&;nJ9$Dq`b$m%n<2-$cb!NJo(AEP`TA-BIY$1;4)m}c1T%pw_AG1ZqDw$li`%tuVkG?1;ied#u_1v6F^ylTIcBM&0V~iHXsO z)6%oHn%X238#xU3bXPKU((QBu(zJPZzCzF$I^*Rz*C%MZe7f~I*XtzN>b}0+CPCn+ zM+bp_-_@s06KqOF#kt9r6N0vm&KcgrwM8$;iVZ~2RrG5Vsl*)A0K(XsC6#odKc)Mi ze$F#c)EChy5Wy*M&a?F%{}eL|(F5q{uD`w%I|qMdWn*Q=hKuvg z#n?vSMy;Dse5q#FnM!D^Ywq&c4Bg?=S6-9{>Bog}v^W7^%rmiCV)6atHF%aRE# zTgcCL>hi}au#SStJKK;irICX$k1o0F;j!LYQf=1idP_n%+e&>|W$r(CCI08uC0dz~ zb^7jdvkR?Z)@P+Y#t#p6&D99dr$JVNPtBhx^C9jVz(>C97US24e7E@p@$+7+ou4s! 
zM_8AF`uwYtqk%Tl-3!<|kZ8?C{^EA>qB~YB6#tpNh+o?Mof*M>345y+7gO_WndBe2 zY@a#6RZj&G*6Oq2;~&NtdmS#Jmrgk|MD1fhp|flRWdb7z1b96~8Vn@exP5+T>(M5} zJ>6lgWdYvUN|DVpeOudI{&Zf|a$gS&VVCsAD(8lm9jfGBN+M%dQs8*YcY7Y-8??}p zDfdJv`$c#1jRUIZ2x0e!iSVfJxYJMbf^(b!l5{U8)KKL6;fwj0M^rP-7k(%lC#KZf zzlGN;wx=<9)t|~W(}b`)Jen-Of`7JM7u4#;O4D8SwD*0mYto#uDmkYiml4bJw^3hL zj!-i%pk-RAPlTCX^Qzvn|5?Sq`=IVIUjS}meh~_bdc+;rF^rr)m7Eb&o_t%Q73_*T z^rM9#Ln+1mS#V^^6Zo(@z&f752T%cvX-}_;qI$G3j%?z1Hwb%I?1)fBqLm&YAs?7l z)-m?vYw+9$)HeH5{KXn>#H*}=ZPnTe!eUJyM9wO4Uo1*b*U)=aowOdM39u?ZA&Yb2 zWQQPdvrAC=QtuvVV;cPu45}DB(XtF6`h?w{<-n#|mhiDlLR~ZwotGV6T8uwwRDJf@ zk5uk@8ignWn zJyXbRlM#6skxmCS3EwF%JY45pU}#a|HcTgGXz{-C4VD&@c0|)iWe@B!gu3Q7TzZAn zL^KI~^x@#JGG~Uo%rad4NuT*FZiMI2YmdrQmUXq8ryDKnBz5lH`H!(%0b{566t)oI z>ZqObAwn(KHHun%!tLUCTRAr+6fum@VAf*DI zCOm+9w2}B)E&&+7+O6^5K`5Pe2L}C(bv@K^AW;C4KJ;2fjJ~UUvg|H(;y#J|Tf_|> zRo#BUImEb90TgPEL`@U^fvZ$Pl*LYwr5{lrDD`9?z92>FivBKKh`W~~Z~kuao6zB! zQZ_?`61AjA_Kzq`Zsmt!(VE|^g@uvt-*2le6FxoPCTaFWZ_De`2X3b1r{|8+ttQY)79WC;Mz@cDYMu-2;laQs;hMy>#B&Z$gk9%8$b0RkttTu$-!6!{jx3 z`Yjme?%v{ep=LpqEak+l5Xd;bu1CcrH0hB^LDI?DRnys32@0pW(Vtkt2#Y0*mq{s- zAyr(uO|FO5QsZ9fAhdH7@(Cfr!2%-r-Ckruf8Y(^)eh;s;RUYyiX7pWzH5+I?pAYe`FNp8Y|ZuZu;g&H z|04ui{V))R{Lns5R(F&=SEh~8;4T728nRfRfak!(gqC!}+tyZc!zfwcBpU*AezUf%63ivk+lu9FF2iror=TE3&k3kUR|BYY9TXY!ZJ<%3JD;MXbN(Cn^k zOR8Euk>q)Z8x?DpoT)fdMO_&E>mjR`?BN7!Hi++xOF!e3rAD!0U0tkNyN`@zNnFRV z6NOXIap*#N1Ha|!(w?<8pz8#3CiksJd`jKSsU)V-s-kbO^dps0dxaD>2HzrQ?7xNg zH~}Nn=;^7jEH?2s=`ioR)RZDaS*BQ6EN&}HbtiMx9r0YmqrHI{R}9kG8@o##MGs6y zT)5@aTFOwAN=HgSwEgESx!^|+DCaI+>QFz4+e-4gvEttW@8K1eXA0(xhqqwZ@~N29 z0MB`}q3IS`;Q>n~Wbk4$#GDRXUkJ#aO_{okGVP`+_SrOA>>4BFYb? 
zx@U18Yq>KpV$vtaldk>Lfo0Bbd~t(X_%h*HcyJ2aKUrWzgmUiGvaVLM*zEeaJGPLg z_!U10M8^CU6T7JxpY%nO5U0 zH#KFHUnt0YEANj!$(nB0!r9=Pwc(KyVkB05pF8TIl7;TG7({fE796{{AYFmAfm}jY zM|x9dG>Z>dPjKLQPxQr$tQalG2iVIbC`F1txnCaMlTlA;txLSf-bkhXZ1oy-sVi1m zEY{&Bd&BPj0(J%$Wnh_lLVn&o=8nGj z7{09^jjj7DA}Jz1eKDvGx?AHsSBtw>Vg!BpgcKK{KxU#Tu*?QJZ$8xoz3;Tsg0OtT zqVM+R9UG^)fmHoh?_TmCibxd7kyh7^hQT%dw^^LBo1ZnOxcPG<3w!-2bl3LMgM7V- zmCE$v2iAU1br=oiK%yR{#4$Mb=CsxK7;&w%RX@|lbNmMG1?K;9uoIE953lSM<6kU) zD+4~;-MmGA4&PNmUbKCnvTgf~hWeci{JxM0!j7OTdc3rXzqjV*AgK%Vx5NPIGd}Be zEHt88(P4#Q+i^AzdrLKbN!49FcfxkMt*3m4@Gb-CHqEn9f=Mn_g^dW1gc; z(@)6_7@K)ux_t1*vU!U@PjqtL>y<9^JTk4hW0B5!Z>{SD+GNzjz1|jB;{W5g<9^;t z2N^aMuBnb9dHRh3x1}F7ET3dm#bfaP9HH!c7&xfJ74ZR)68Vuj`)#C5WmBraleqIL zlN{&hlpA)^osx3{)tx3|o(02wcauk1gQz^Zc?DiQ`ve!Y_e;;+TV-`;ajbw>omP3m zOv?aTdcT&q-g0>-e$U`Vg{--6%-6?kD{_vK=d(E4gXoh+2(qEDLo%OSP!^a{|Ln3V zZojsew-NEy19^`2DlV+faDVWRPa}aGZDQ%K<5uwO}W}gE<@X zta_e8nSK=?_)K-O`)4a1R&dZNz>ID8$s%_X&0# z=}h3i6cUQ@rYr3H%?>X?MJYM-u~LCRZ;kh7)VJwD9JtlKK$P*(Mz&7}97=srBhdx` zi2IZy|31?SH#*?N8?Qv@?3K}`GOme0|ZwvH<>?dBprIC z?1w8(i%cmg;8Cl?448z*W^cnl)&4~NXrl2WBae2%P{uEjvmQBRIaek2d0#K<;RI48 zpN4g;dEp|=M}{GWA}H};Hjn50o)%!6a8j+8O}dinA{NJz@$Q#No2b&Iu}t#)@I*FQ z&y{pD?XN<_PHZFJpY3F|F>gZ7>zjS)AZTQR7cBdu4SYd1{2@P=d!@bG{Du0(`HUQ8p+jt92&H5uSjFxGA1Ha&+6KBRPs$Bu@jmg{ zJyTA-hWT4)sIkd5M@{5a7BMD(`iSKk!^*dK zIPQD0%NJ(=@!1S{rCRpku8PD6d$y=FK;yzG0YgjQE8v2#<)cL`KaC6O*I%On zq=Si3`EJ@BTj{an2yPHD?dRR++#NJ@5b%{~{+}IH>+->;H{pOu)RD|n}B>~Oc*!B}foX?x7!ExcDH-G33I9uTyL_5wX%QdvLeG%@; zTp`4~hf*<=arQ=oZfg>9tDJY;V#Jfb`+Ie%IUgL;IPSEB98LZ*sD+;R_Ov8{+o;3y z+Fj%99y%I0y!1boUp;6)f9Yo)71rj1)n%Unc_*8?Sk~pV^QoZR-O=D%M2G*7-&Z5? 
zKuJ&mt!h1`anBv89p_tTL7eWw#G{6kewtK0|7H9?Q6CGOGWYjFHl;qck>D&?!Du;Fd{xtfqlEqtM?7A^)^Q~3Edeh)^5s2NYO<5QQ@Q}mrFui#FFnT7QOry$(N z*zC_~CE~)mVBuFXIU?7&+J11dCKLU$p>%0ib9KxG`fQk?C!JcawSND|&Nq=^O3Q4a zX=H0ef%qS`_FaqH7Yb5I&m*e(ICyA<57)lGl=3`Y;5U1g5(#(+uOwOrBff?b#)WSY zr95sMzr-P5aEx~r00-o}w{dr-`6{UHoq<#z-Bs5|M!fq=^5(JJ`G_VT;AwwV;t?5q z;PfzB<86F3Q*}ynVf`b41;tZ_NWX0qIkUDo$T=nv>-3x%`2s_yGj2M*+;ml3jzC~& zr6Xl&7;~Wy7CB3iFC+8mB$1JcNSuZ$H>EwCeu?qpFtwH8_|n5Es6s}CBaF=~L8v%s z$^yB+xM^@q0C*Q+aBHW6{@jdF#ZZTwSWx=vcq1P}>@Q*~IzAr#oRhiARfyQn7i85D z7Tdo0O3bWx9xwg=C2fy*W!)-Qtm8#Y;-ph$_w?97!T6*Im{$s3rA9z&t+lYpgbUH- zU8{{}V%#qwdq?)G0{I`_q-7*$!<%O+*N7SYOjTqR!T*NW;eF)Ej4%}BDi(J%`@731 zJqW)Pg9tOqBL1hvkB(=SHPTx_x4ENL3I^yNCXG~fi1BZ}?umm{K?|@V+pUZl=V+@5 zek%?9CKH$|gbXXkKcv;%875@8*P-S~-97nUZIX=Y3e)2=b`P*D_#TDOtp67ooaQdRQI^FFiu zmsbiU+TYF2?K9Gy!-)#NeYn<@$@1VJ5M6ZM-(pT}IrT=^GoaQSsLNGQR~{BmTb5K~ zB~)CPuJ^{7;B@$I{dj@Icl`G|-X~7$Q8t!W(?z6O&zK%=1~L%z={w?9;*%@uuD+tF zk_Z z>~GhGEJT{-g1YtB2v?F8-z!v{N){H(gaoJK6^gMjNp46z%7-w_Uw8H4SFfeDqT+`l zow@&TG#FfL7-ROxH^i7MmoUbU*H||IlhpCRb}p=yHO$*2gU6R1LQ%kGZb6nUORj*$ z*8RA?S`RljKWU2cvu1&84RX_c)!z~K{aExIMA#JGT4T~XC0w6~NySnNWJ-kl60&7V zi=Rf*2EYFn{D#If^SM+EJ^sS?2p^MvY8jW}A5ksPcoqbuciq3SadG_s*OBhNTx&hM zdt{C&;Bg&t`p$ro%E^%ZZPzL@zc1CI<64cl@u}P81dTZRvaxnA92Q~|KH{sPm}#>@ zK>^5k#>0lH$B&BpLc+XhXwoHyvg-YrjAS6aNRj%BE8D9e#}=qs93H0-)Irxc1w1dQ zg@qxe3=Cvk^7h{%@N^k;q*E9Mv!#Oymp{{3Zuouv00y!tIFo*jq7`V(s8tG+!!Yel zdK*KAWjAL2d9G~}J=kDC8vgu$$Bd*@g<5tYH?5{fik;t6G@KzzMNi_274RQ6Nw$9Eq zhB5J8HN&dNr8Apa=cbxQ6DC&{1&epf9V}iY9SzK@?MG?c zqRLx3!z@#MLAX;6b3DUML^E@U)dz2eTDy~0Eg*2ojyhtU&7SHg@3GItp61HC3^B)G z`>4V_&BTg){}r{=yQv4tup{8?&wHXLjWXZ_5x8oNZ?;2uKlFH!#-R{FDuvxg>}j6I zx{1g%d!LRor9%puJmMf<5VB0XQyA_(r`=cK->>UA{Q=|_i?@X3p^lrd8vOxKllWPd zL2!AlN4VSIa4-|bk?-74Pw%_`MLm%q#GR*zLFbeGKlK|zgWJuJ3HTaL=I{zR6Iq8e z%R~x_6Zo4;6-guV?H5NlVhZ_thB8!~vM=ig6B?cr6u?y(*u0(QJP<91ctdKDBTh8OP@ywXVoyYRXVtscifd|8gpgk z37f}pV!&weSYT~h$w64Y_D)uo%pGc>75Z-fBAdZj0tM^def3YIR@8n{R$`7gScTrs 
z*ZZPxW#d%~Ta8eaw4GYcdb6`LGz}+Hd)5TsgeqcdnPRAlJ9Y;pwRKrnE>w6x@T%6S zW~rhU9;%cSK61-LEvPvuasrmtzl_0@q;TK68 zcy^gq)q+tV^=Gh2F!Fnyk+*+`_<@=ZKQULVznINIv5oe<6)ymr?~&izVxQ~EsnHw% z6bxb;JYG9~arc4scBYoFR?%J8RjrR>j;CVjs{IpE8LLh!2@?Zu+-5O9B$A%xFjFs< z%-A3v#8V7jrPBG5@V6YMI!S;05p#fsvC$K6t?~IBK|)~?XUF=!Yk-F_R~&MZ1oC#j zXZtghmIp`Y*$a0=jh_zje(Jm-o?dytQl3W``Z36I@8JyXKHtVzLbHE0AHc5SvjTTR zIu!0g-Q@?r&uT~PT1pyOGjz*_3kB9J)~A~F)cJAL(Az@pdjwcTha8NHtbKa+u1!M< zQ!rPoq0wf!V&ly4hFeS5vHyl&QTbpS8O9TFCjA^$6Waq!j#7oadJJGBq<6hg=(e1TflH)r=XhU8mvlNM?qyyHgpy#B7msPQ4`wgG-;d$QP zuX~?_Vi~I+@wKDAIrtU2_FJ;qe1N>=ICrBP&lcmO@r6NK@zv+C(j(J|FJ9Rr{!N@n zN=TH{1d5VES>OJ2KIrWnD+X&;88BUxUYJBYeaX>%Zb>lL?-l;WB=FtIGZnSP&{}@^BoUX#eJSrH=pRi$I;)#yO;K&m`q87 zKO|C{@|mWD63CaiHCfMk_+SqkYw(?XK-A)7^84D1zdv3cQktwh}y4`FU zCG=Ql{vR#CBQWT`t9~8 zf4rlC|-!LWReVB`p^-?7l(w&`l0-D&OWGF1Se$^3a~ zz@1ax!&w}e)Ukbx$3!hJO&23vG3s4OpVI91Pzwz==6_Cz{+YHFG0?%GiB3bk_EXcH z5ReYgE+mIat;TZyA#{bJFD@5Yu_QmE_?Jk}UeZ1-{HRbg{^T5F?e`NkJugOUI{LFb zP(rUX+FR%PC+8O3WJjuqf?dV@(o5f&y+!1AM&D9Wr+&kosW@8wC5Mh7Kpr7drO{Qx zKm84Y5#wl@Fl!(B=iK~#lUDo(y`}2be4*s;N6pFUxYAX6sCP|tWea1>oNQyvC-|3qdTmwNkO08|MSv=`Y9Y-j`yO^+IGD#r|{!f((1JDRo;Y= zDiX2W3*WS8HXNm`f{@kNa^n%FpZE_LhDj>Sq$>sHw28f;CROg7+-12JJog4YTF(Y= zZ>%CyhD_Vwdl*9#O(?Ex;CBaRXbCMl>&Q(#D#QgM%OHA66JjCO(H%4tG&K9)?S^g{ z-Jb2Dw+2i9Z-XU(W58e0vdb;RVTh{(6|k;L`Kv{<)taxa9GycF%b&;93~Zm?qc*{1 z5*+_f{+K|k&_v?-s*}hHT9==c`p`S(&;t{3D4nCjsKX=cAK7C;RtWTcbAT7dM2YzH z+l3x)GV8L2d3mjXH-FML<3o=?ij2>`4pW$S7L`!_ZmO4c>pYH6TVHm`O1^)-B%89I3?9HVZ0aGwt`QaTq_n^}6Hsp$8Gev}Ab77}R80C^xD721 z^^8TVC>{)pjpCx{yb+uHgZ{liryS9IUzb5@XqqXersO@P{UQnMGr+%|TCZhuetU9W z(ooV0+JT1Kio@TCvdp!&I*Ab*9%wxP-6cc0+ojps%9L9tOaArmK`$Bb}T;-lI5VC*F^;t}dhywK{ghtofaqTd|X zHK=dKtuNzvM}2fMMVHY?i|*+8LX5JB#CXkVi;7>yL&dxE#f$Z^sr}ZF?vUwv)_C)j zw*hYhwPtjk$nlKVX_$*l5MB&4?kI7K#D58ua=Pc zytAbtR(LY^m&o(yN!Ys1gg|uw0AH2n8Yjat!s{ezO}h1kWD zn}rBN3`3fb%Ca<#mm&9agw|?R{0B61q<_nsVpFMgpGtkAuc0Cf;e>1LzDbaT!GORI z6B~R~1C9VEVI1Xug`2tvJjX}vxnu-*DT;epB|X|SF3J^^3fu;aBPkLPDiSri!@kyT 
z6GFJggD=iUa4ynviYZ>s_w6NQ>H`5iP6O&POqzz{wv4x<^#iIPZSqVJ<_9g@gU&Q8 zED9fszuLF(l1-@Arw^J?yKWt8DcWEuf3zefY`VE4V=~HU!B+oUbgjwmtu}$cU^T7{ zxwQ&{705~7$>OxCe`L|(d|tYU4dDiA4rS_qJX!n)EjeL*eR;H@uNNwYuHMx`Ik$N2 zMK`YLU03}*C4UmCk(7#aBsn{h5(6`Q0uK%JR%yl(v9Wiu6RmYU0%UTRGYl5-{sy;N|$(PqCpaG4rGE2Tv(lRod3a51iUgx$IcIGdviP zH6H1(*MzTXev5P>L|8q=k4N%_zTDPk zaw=o?LeQOQIyr*~D@frP{EzZI-kgmHTU&QYu5S@!WF3WaqlZ&n<5@1upC<081}lq! zUV~UqX>(gjmzYH~MpGw&Y(W|CSNXSA)EOQ?yWj$q4ObTD$k3R#fBhp6tjo#b`5N+T zQr$HsW`px>7L}WX!q3BB)*I?V`8H?7_JttXjQ^J4m zr5J+3lPo9OUW$hhVp{-ud zehgu{t?N;Jl&g&x)XlZnD>FpNSs2vH(7yA$3Ps8{NT-c1E4JN_j4qC`;LK9YOu+tb zyTH{#w7-f^^FzbVlS0r#xcyirn?;ve#C#%0j%}#$6DsHmw5PbSRu~;{Gwt%{CgnQ} z`K{LxT|9C4)e^F+b6Q(#`jMA9_^`z-Rr$9 zR0H2n+kWofz-!jq<+h!RGY^SyOs_&cE~!)Q3A$b)4(+Yq(#05`-^vDr z$-jV!1_wJV-U_LFfp(lGUZ~0`(=`RR7Zz=di_8O=4ZWs8Kp|s^aZe#4na=$DB3GRq zl$Ko3B^01PD&=gcba%FG2N~YfZsAJz-p7YzHIILGs`4d<(1#-9)$_ZokjsB+3Uqs* zH^033Ae~hH^TT%^Kq9(K<;k$`d~eGg!IasTS-m3V#}|Jo|KkFn=blfbrYf{z8?y^E zR-_e|In@hq8dt=<_X9|3^*@B&^SAdl3Zw#s z+1<4*m`jk7!?1!K3Q)5s9FB8=u)QYc9nGYvE5R7i?4Hm1=6g`#;Cqh%B_vL1>B6pn z4(2`Mh}of`w}N!GN)4CY%#*#cmxZ^47Co;Z2j&rd#&65tJjIi?dL}9K{j#x}ZIhK? z{<{6p1?dTB;EHQY);L6A6Gl9oMlAjKBgpoRQ5$`S`|-kERm-OeSho6*M>fBpjT@ga zU-#DUtEwsI7E+<#&#ZWyi2Ie&nXl>PBebH3FZIfS=KG-cy*6bX{>HdI*XoR|SYI5y z(r+jyrXjXF_{7gOT|Pxx1f>KhLWrgB=(lXc^PQLVKPBMr7K1<5{{3@ThV*Mi@r*M* zKVn%tdasmne(D(#=>xGIt7gF{sw4Foe%pNvfRaOzBK0;=mOBZhpWQnkJup5?svcqq~k z1e3Y+@~{#p#o9kYD~aGygJO;t->ODQ@qIAG_?;9K5I#MC&XWD&wrd+59DJQe6a{ab zsVAl-XV)hVvwq{9aa;QcKX8n|Go&tJL9f_T904f2!a{atLR45|xR}*{S(?%F*;~o1 z)=G=q!*By)l6?KFaNU+8w5tHiwv>x9Ou%&7TfY>kn7ufet8Af;2Os#N^53bkggVys5;WPK=9Hw?6xWL#Uq#$w+Wk;k;zZclE5O9C0N`$P}_=1w33{h$2cjTeE-oR!V`67TB#(`L3D#{QJ+;fJar&!>G`RL`#kUE5_eB^L* z)3TqF8eR`kP0U>jz-XyGuTetnU`dNF+02nzqIv^I%ewv87Vn;Dr&w@kJ*ZStRvj7-i!M0+K-V{uDOBI}S_j7{@R^IrISUwmr3koEc=k z9DAc9wzf7B!OEIswP-5@`WSw9bblT1qbPpb(@I-WHYTSLU)suh*%=LL;-6Br0TZs! 
zd5#dZWb#~OeU20O-7Q_cAW9r|K_UOq?nWbFAk|WQqM#*efNuE{;Oy~(=7nGD70wc( zMdw^?&$p?%Q@XG)V%;%Osw{DgzlD;p@yPb-x*~{nZY%mbbavM$k8*OL@Jo(|Mkt}L zk42QiqND9g$sZp^ypLuf%C@@%xV~Veis`;s`>XDhe(N7+JrQPv zN4l{M1{iDV68ZW0(R-jI%#T70m&y#CLQeFon)w$#E+pE~z_3m9v`AU!D=j&M+evcdf zkkFmY$h6xS?!qH42?;5zB)=&D9ASyt@x9(hgGU>tzM-)+n3oeP zMKUxf=P00<<1dzmV!3QLi<8tYb$qyBiol&x0U{@c&uS~@pq|s#31J) z@in(_9;{rMWT}mBarCb(9?IjQ!c@OiCQ1)gBo(HGr{nf#sSc-wXQ?DYlk}y{SU=6Q zn$(IMTLHwYTy5-MR8;M|2vncHd+gqSnEfq%=vk`C9gi6m!Sc{z$`kG&j=VpHutErOOPojGF5aRjh~V>|?jMn8Iqs=FR5ypqeZBn!yFyWHO*c93C&IiF@p!fq z1SsWYP|Wae#9Y)&@FyH_6wUgwfXj-g=mvelFozIir4X1LU`P@Dc-Odun6ByN&VU z$&dUUE(C0r@X7gI-Y>maYy~HY#)b`IC@|P7%J-qsYrv3!G(jDXVULj7~K^b7yhw>8!&2*W?;i-JJp;+2&AAH2W z2MlGLoW@{rV)37!e2P|AVtme5E~!~%dU?=z4%1Kf8`g_G8(@@}3PGQ7(Ub~gdFozrS7A#zlk>Tvf^tvELGa!GMb%kG zMfJX4-v9)Zl9Gm@K|rJ%VF>9Q8l+)Jfng*Kq`RfNyJ3cqlJ0I%x}}@vpx@v6ujdU* zSnxtv=f1CN@6X;x8>S-hMsdTb9hi!nWh^GQ&iP2jMonW#p`Pf>1?J_I~?dUQz60|ZkLvoo?2rE57L{soB@aWPNc$O zKJYuT@2(K+M_7VJVF=Ri;Us*L4tlu!R#vSTz164>#o9p~{!UdbV}DkCgUcnFh&akV zZ7?T08&>>nggboo;omjz8|D2_mXjUhJeYiNa?E|>;_dq&A^(iExKdz^N$$I|lhZ6d z0l^BX;8u4eZ9D?D9p^4Z*;LHRN zZY%i*3#HKR>dn1@KPkbeIyF7Cy-Tx=9=K0f==Z$zD~1*5a+$wna!FNQzqE)MN^1eB zFRNB&BnT+W3dBx|Knm9cYkoF+N$C|lj?U@@Kx&nWTk_6>0=?h2FbWucO>tD!;{GVwe72hzn zzMlnfRF$SzFy>I~0RwMOC6rA$tTkPQSJm8I;4`^^2ZSHaPyvktWi?b{O@E_-8UzRS zmh8J+St6YKNQ2)}^!J8O(R(MkZQ&XZMrAW3a$}Hz8%{vZ=J<5}Megq$`lH0Xy^Or;WD)WLv-p-tlZCjWN*0#`cNY(>j4)v* zX`&M0u+Mk4nw^O(l_&eq8Je_0{1i-0(!tIKc^q?@G2=96?;A7e6yvw;_sNA`to6dg zX+$GWrRCdRH>B{XW6X6(c3NCFubFnRM8H)9l>_!`+56Z(3)w|z$9NoW!b@5%Z6)=nveN-#m zf9o8-Xf*8Bk9NNpK}HwlDtSXYw47{1&pI-)>Qq}6OlI6RX|ymu)1_wvg4$UhASgb9 zG#B+7zW9U}?TWEIRhTTlMQX|XHlz?STE%O1%$xD`v{f-B^0E#^;ysIjXS#io$E3^m z$%Xjg(kUp7hv?YW@0%;-4M5u3x*Bot=2{s_k?v+^cKhc$V**02T9lVccxApmFLLiI z!Xod#imuh&RJHt{Y zIVJ6ZJne8r85IyQv#L94${SjIi;~ErH|Q&%4w5h!=xuN*8H$=Cv$^WZ9pv5k4YBra;0@p;NwpqEn;U%NnL^laySb2*PdZvzDXT<^ zgS7cGRU!pyeuc)l=2i2uU-T-e2|rHbZbaLdHweDBp#UuPaimZvE+`ahONR*cD4VA( 
zxi}OMP0|&^V=H}6U%oG``Ds*M_P<$fN@O-(958@#h6?nE2|sA`t_Kf(MntK>)k-96 zL@j0wRT#yaB(_BW&^28+9+J@{^Fn)6@m2MZ{F5u~>!utSW$pOKvpxKPcVd=-Fl3|Z zy3B0RpI?Qo6QA}BI}LoepyVV z_NO&#YP6Xb}ng z9zY3njk5-;`sNbFDJJ24N@{{*YOi24^eDuX;&h z{QAh}I;v@B|JP9NQs#L!dp|GW%N^XSX{0diLaXS2&f9Gv5THU9Hv16X;~*b>!yL2u z8b3>3Mte7_$6_GIDgNS z&A=DHq$5(4fpDYg&XT5TZK>xkk(Hkj3$|yJ^dXj%qO56f#;}#BIWH8d}EY^ADNYGx5E_@n@~p2%BBR2VyCJkp!ebpAk- zY(Txb!cL0JtS@FBBsTe?*wqy(otYZ1{6JcXh$A9#`b|M_9O@NAxJ3@Go@h4p$^Q%!4BopE>m7tQ?o4Ujtp(tbN%Nib*OyN+PLr) z9wJT=f-O50aBD(D>VGmd^$8sthVFWl1tdbo$K1uf1h|2Rxq_14wi<}d8fk?=LW^ZO zCAO#fgUNN&-?jC79T*lXs2~BTMe-F-7G5vw30X4?cG8S_k zjkq%=DcXusY~@4mCe`dpM!~er!H39wO#16U>J7DF44v0>l8H;tsih7-B^eJ7Iyy|-WPrV=k?Wu87WU~6Bk`;$I#=62D_P~+pc`K4 zJ<0iOn$7kLu-4j!KF~DoQ?)J5)T@HzYhdbz7IPU}!Nv5|K^xo^PisA z$OM`F8E?b8yau3|{Dn4|$amznRt;LF%&i%k3@e3@TkSno)o!$}p_izHmFf}g161eE3H zo$3Vz*p)3^Giqo$ug>Q(#hk5Uw*AkqGzI>$Lb>6kC4%w8n9VlI8AHR0iCw5T)Bpen z?Bzvs)HwGHv-Eevd;8R|d;1iSnHPrth=&57E7QI!w&(gc!zJ1A{>>#N;!=a;8{Df1 zyQ~Bx$5!Tpdf|IbF_>B`Z(G=N=rcpHxi@xZQ~AST1wVgxWK2W?q3Dcn;1~eJqlT@GlJZ4{rHXuH{3S7Jy=M48 z@(O}G$=GShu}Eu_5Ljud%guy7w76Q~*!dm+3^k+e1k8a5P|l-NU?b72kwJwOOfTg# zBQ;?)EGjZ$pu;V^tuTI*QTFpE^D&ET_j&G-w#^7385VFj<*NT%o?m=TL+s~gllWf> zx*FYYsD!)h{Q(TiH86VeB<*voCUTJ(b;_e#x>y^o3y~_t|J7L(t~#=yH`RDDk0s!8 z%)3>mS3E=I_0h#j73JQNnCJiDLNiW|g7cS5``(MLaCjJxXbeWjKLM*$reXuMvp<_2 zpb>ly2>$Ec)B-m@{n^oF>Yv}IduR&92yKzjA)+*wPB0vFC>Eu?W0pY{YvQX@Rlm{LVaRmnvU>LP{ zwU0!@%YivbB}xn2KMhl&v#;6vX6EyJHRctckLUS5SE8$~ACOI8@{@crNfuwLMsE^@ z2MRBM6(c^RS>U0JV}uBPVa9+B>u!(ftNJ5!)M`Z7B)%qN=2X$h#H|2$KT)Ri zP3i}b)6Aqu{5?&Rfs*;3&POyQ<^mAfEcC5lGx4d{R#W{tU`^RBj}btkt5?^I7d%%r z(FzARZnP$0K6I>wib^)XtzgJz&8$BA3FF>=xg;`>3cjCzXg)T)tsG0gQMxkJh!Gq{ zohx_E__S)Hl(GNHJd+f!O+*9nYtiEXQ>st_KLt^2_<>GD4T8)DFL}AVeH%V3#R}Ti zQNxQz_gI)En34*az~>78i(!0sM3KNf>8>Tm=m+pGfC3}kOT~iTpM0~Nzws{bRUXm# z4>2eIfa2nozUlkUhK)DkCDFn3mi+F&IHCQs1M9;8E-K~h&feh9TW@k>4J-%w8(l!2 zP6aMlg+Ul(J8oQHt8D?=+WnVnLYLGoa$>KxS$otwp)w_#xD5<*1W$ue$?CMiR6jT^ 
zGd15GklYJx+#@9h?*W~=?KvNWg7xs_f-tXmzR$uv;67%WGwiM>SOAa*a&A1d-K>CZ z7m3+MDFoAEdfCebw^n~vy=2-Mf+on9C;ao#l4+p;pNJb7i(EoQgy*tq@AAp+PS7tN zi~Q$ifi93%o}zJq8mo+BHSFU7WcY`JgBq|}wg;A*Q?A6IEwgWV`nu=$?tygugqjW; zeF}}2DFeD|Z?1To+HQz#2!4%ok+3w&(cJ;)w_L*ytC8Dq+x!X2rX=KbjS{Q{T2rK% zcAB+xZGk{8ol-TE$loaSA{PGb7JtUpr6U4s);xgkCEBh{Cie3`B_|AG-%D;~>3r+^ z*!Z8jVEYMpbZj3~zF`MOGu#N<`|Q0p^WQmbb@0QMKYWC!xp^^kIoZe~5643(+6!2v z#5*KEAsm?_zd5Zd_?`so@{e~+2#Q*Z*gh4w0BqjYuCs5iDF0JlFt+E^A_LT}uoog= zZ#9QVCkJi%A{=i3JhoJ%FIUtjr3R{v?&A?#73@HpEU1-OIL$fylK3H0&sgrdoXXiN z9v#j+49xW_14YG4&zst(2OBNYka(60)f{R0RbU{cE}r-**kdjm4~8gyJ;+;=++TM` zZbr83`Ak05G|e{a2MQ)$GDxEhMRuov3Dk>bh5l+d#cAY`~a0 zwx$7)&eEv@Wg*^G;Q8AgQ@jkgxW&3^P~pA8C-_!Z-AbvH$ASqm&bF6K2>_iM&m>Ds zYP-oH12#cl+`RX)iG{-o({DEg4JV%&fY7IZ(fnf2r;yX-&~twNSlV{P&IfH29wM`C z!n7F`i62Tt)8hk(p5NEd8)Y$s?^^ogfb;7D)5^6QEHV)B=RwN-ZNum&+`k3<16W}o zOJ+OwjTl)1L5M5OKc*i3A7L{ogyb>}(`rfy;VA7l=TQoBKpK;d4G3dHEYwEIBC(V-1 z#HhG%XbMcAwciqrMgOXgECO;>&!v6dSWVdZf-1r=%Pt1(Uw$RQvk2ffWu*bJH_2y8 z2NuuKr@-N4)k@#MV--?(n+pnN1;Y|w5t~QYOwDuF{gHMC_YS5aCo9apSe2O(I1a47_Q8O2NR#wn?h7AOdc2O1Vypx z$rTjYM}aG(#&!IGU;m4T{`cotSk+fR5KXT6C42uj6uzGnQf*oRvOjaE`Ti@9P{s>A zqpW4N3$+R-73S-2^oe5sW0De8g6)nE4y<9rEec&x+hV4?Hb{K`-u!7d!Xg>KdOV-hQU{h-N+_=$ zbx|DMd-CMz8O4E+2~jOM+gd9X+|;Jd02s7`EAe|wB#&yAiBi!3{q#Y|#giPm++-Dr zKR{x$lmCWZ-z}wAu*DVSn;DKqP**ruXE-;9(+fh8D*SBgc8A~X!VZuhda>_ z>4XB6%;8Kge*|Fxu9`TCBsGER(BEWiI|RHI=Dwx_=gBk#8Z+b<5HOoEH_)0aIU%s% z2yFFAUDqc7LR(WnOP`w`KqG^Oo9hmx{uJDeOPpW|klQ-}NO%#^0#|0ccFS^-=38cG zhDphWC>;fV8ba@^M~+S=Xqk&Z(a86jHHA)-@YX--TAVB^1=24voDBgD#=M8zT`+XU z<2@h*1kbX0614MIcRnsq>!^e0FJCfdYVN7w6PRw#WMq$vC@F$57kq2q2!x>gG5I-> z6pj$`;Q0q0<@+*TT^#b{4a27yCjPjbiow5->0=fWL-OWnSE*OQIDz$Y?G5RmPwu{$ zL$$UlvT)It)kZ98at7$JdsW+Ry79qwgb=qm_QSv zj1f*3wj-8YX4El!IoAb-8E0NVCyXjJoD>{-$@S(GXhKrCwYjN1YFl9T`JKLQ;|s@U zHp-T|Z{Z2^x-s+rdQ0)jvd8!tS&MpbupPKqN0r!sc6Df?{NE-1wg$L8BQ-rh=*M zeJqKEX(rJvTq#W~DX9`UCi;?mn@qezAkAHXOo}=_VL~nJAW6-S^a%;z6ec6U=`;`Q 
z|I4@ZG1B?iQ7tSI_C!M&!F`>%K>gDChqEnL*>BWwrJJe~QgxG0->2&%tENbOJ4GCqF4-^ywxmqu`LtB}PUSQNuQIqIdA&AJLWjsUfBK9Lyu zU2ysbqGWx<{JXAHH1u2WbcFM(bQ=<2nkg2fFQk@>1+RAGXTR1YTde;LT4NJ=1Papy z0c8ZJc&@B*J$_6I(^V*Pl=pDgq_lWM{?x(%*230Al`FTc(F5R#!1&a`;&ffJtR}rR%;S{FU&l61kHY`mjWkw6`Q8OZB+-ZDUVLISEEV!3l?g0OJ%oMxjzPI zpm)avY_)mPCtirMLxYBo7hg^F1u@@)}w1pl95A^_aIIk^O=g>R#qup zwC&6Q;8;F8%WV20<#*o$*n<9C4LJA#!I=EEDj+Ofy~nOBj>Rj!Eo`j~BVbLw&1BuP?Ct-YWXZ^A2K0To%G_-9$9-dF6&ki7^qw&E04Ma^Q&22*pr<@U3mFPey{ejH%TvO4j=`ofKG zSafX)iDphVTF`)MvWS|Ia#zF6h)IY<>|^ExJNXQ7+u=-TB)c6*4^b1up?bYDLUS|{ z&9}kp*aVdjd+|bKfqWuB9|&Fi6hc-QPXn}qFHi>5J6i+U{?o3A&-GGF6$HtIYI1!W8rexIDfVdG0=w>dio}I@_EZdKMg&Glo-X}*lZ_Dd&3+(HS338n zX7#vY!H1yFdLRt}q?U{p!;a1)Q24zU07SuQ*d4{jaHyOp?hA(&%!f@<0^N0oO$?gQ zv*qIlPv+_zmTL{MpFO)~#-WYB`?(k9KC=EE`foaedVCeTPxM_;r%Pi_Z_S~NYj%!+ zOJ3#Tt?y-aF89m1&Nti@y1241O20GfX~q+PGhB`L*vh71w$X`fjm87^myTM@Bw(-g zkK|Xx^s#6d2W2))93^ImQ^GQnR5=4@-Og_DgS65zWNR~zuEvla)Ih0Ep~ZIfvFdK7 zluaWr$WFklyzcE)D1?_Q895~IWB=p;4!%w^UmJHNIJIT275$tG7r24|ag zO;E^@Y(nrf>yu?&x-4UUC?}oFI4zm(L3&&8=F^bzU%I2aOEuiHyzhy66@7VcS7!A3 zb#WhTnK6Q5Oy}xqAOGqMrO6DFlyS)NMs`*rTC=Bt!NlkAcz@CBc_nKmw~07CcWII` zn9yO-8+^iW#0^i+-p2;D-$mZ=(TElQ?Sc733ratwAa_zqhF9%l0*r9;c@-S6Ivo}6 zHK9@-uOqZ7e>BapK2ub zi62#}CD?uwMbzd7IcX$tnYMn^nEIS5<<1ZqVVFu*?$mUBlr?Uwz|1MNGP2;kn*$9R zTcgBP3S%A!ME3!cb4!dDK8_9_@hMFAE)kZlkG4&2ICwBYIuu=#$B02X`hs3V4)|Xb z=vp0o*vhT@=*76)_f#(ZUXr?41m(&yf%4$(aA11}B8ypd2_hvi` z+sCa}LB*3VFhZ$IX^B;&NcU@0w2)Sf)UU4HK^6zV3BmDDqF?*?zeMWW^z*9 zFCjUc0>}TzdVXxR?92~I@io%--f?KOy7$!{?N+URK(H27(QMX9dNLxeH1|IziO^1q zDOMAdN5XcZ%hmxz9n1!F6Sb=83;{1Y?i#9s8^L~uzO1s5kis{Db(~^WfU9c1Q{A-e ziuTM7Ddn@DqAT?kQ$an$S|EX^7F&+652bE0SbW8ZC*x*fDk~Yyr*H;5^|GkJ-fzqr0yNG~!;Z*l2z!p4pJH4MV z&_MSB*k7a21b{dIqmb1E9L|#MV3V_@xW-C3s7RuN&%?4tfEkP~UXyf%PI2mM8NrgR zmZQNX2h7_Q3K{B3kzY2;`->t6B(!Kh>x+&PbDDZCIpa_^{U{7w3Va+xHv$3^ZH!kk zK_R__%iWJch1h+8(l<^1jaS|$^SOyps)f>$%qk~?K$g>_^)RbVvLI?6fc|+H6Tqx( zTLm%9#Me8%4_|Ov$M4LqquWiAty`Z zO!bJ6CYf?(Qp)+4UsO}fF_to#E63AICF01phPU4~=+pRfz2vX76qy9%-m1~h)MkcG 
z^mKq)Jishgr<9d=K$ruPbgG=TU4SSCAKVn_kyP7l6*Hri^lGHQfMNn_lT0-}RUV}8 zVJ@nsh$4{aOJw;t_!r@~!^NfB5?JrYFj7{Y>*z2v!$ar)NI8C@^0q#h#DSXjUS-2$ zs>HJSm(SC&ai}l=&cE_!p8>zR6X;Mg51o!CNQYaDhX%0gvh)O|EQm`L9AmEH+?#(^ z=b@&Lxe3qlUqe5Q-b(TfXOF$sk$0i21%h9Xg{}Gi>@O!-0Rr4Fab2;y6AC;=R~O?a zfC80ycQ6oi*?y%31`K3hyF{rG=Cs#;a)6L7r?)Im)mPR|no*31V{oue&`#3NL5S|> zbHHFN*l0lSkv^VJLYIf>6~G^i%T;>b9yWo|)dX?!8h_7tHF38j0a2fay}>qkCB6X@ zw!v$>Oz>@6;e1w|eMm1XoW+#8^=1Ezbbu#Kgocf007lgmGvt?&)l)y{s8gxH8)KD=fF=4 zhd!Xlop7jo5%U2b50s%ms@lFqyeVaZQ6l*j8|2y@7?=G`7ga)V+VgBLx|riKS&%wG z6)jXvRJewxr~H0lRFYrD(8QRp%9Z>U9xz>b_Y=!@5rzjGs_n64%5B9@27ZT=x6o+S z#+-9?Idn&}!oJdOpgyLJBFCD{X%etAV~b%=?K#+eo1J${TWbYU9VNYDvkC(XVrSt1 zl{Zo5Ac-U3ZOflK!LMuBb=~e{vE&kcn7W+ikv&8S%ZtK;#%NJjStzZ87G#$LMc zfpl#+Eo$?lnKJY45xI`q_nh=-$2xYr`7bc03MmYB`hQ9vS1B@{^o{Liu{4t|Q%an* zEJCBH<~}07}f!Zs8V1KizLPm=C_>bn)1gt2As&N1GO zZub`c9v>tPT#-P*@CX>4adZ?OXSM-+N2Um&P?pW8!Et~F%#xMEs`YSGx6Hm}`z(@c z7dcc3Ya$5Lak`KTPS)kuF2Q9<$b&5+Yum%W-2o5ZmIcv{xfaeaCy_E+s87cQYHg#+DZ$huzXbz>kp;<7GU_$G#yW($`pnI zWp`iyWF@<~VUm!sfj@1J0@-RANLkv%^)9?3%=Wmum4pNmY61PCS|3#>5VEsF}0gOk-$9_xvR4_~@i8KMl# zAJPIOhP%SSi@y(;!hL#2p@uxD70}`SkM051U5^q}04DVnzPg|Beemk<4tNTAQ}Y7# zCH-dy7M{K^-0Y7k_%Fr+;iefa@h`kkg^SWB@XcURKAvP@cnyUMuzM zIBC1O$1&*ClxFC*sStDl(8kY2AFTjNk-~9v))kuK$#3L}cQe6f-C7ozx#i4(V8Y9g z>8vRSk@kp+Ii5%ellH95wrAq@UOCwR5a?91PUNFcO+pZguXJB}OGJz1n0<17$ap?z zPtL$l^EG3`H}t;7bg^A4{U%k={Lw({OP8P>e#Bx!T=xF3WqxRP-S3rdl8|Wv$y@}crtC05Cld~nc-b&6mwp!p z-3%ArmF_5g!Sztm6QQ2ZgJX4j4IuzFjky}J@GsH_j-S+Uhc0k3Yma#5urbs+-)UBF z5!H&jTP-txyP4@ym70WDdhdH36J1pLquwyRh8xu zHJ<+lMI4|%b%c>%0Yy4(Qw~dx$14;CRt&3>V6!Dem>Roo5xJNbagE+NI#$-vU6vm5 z%dO2nStLh=LiRokQ9PXtW6AI-&Z=+&;#Z6-ad`yUH{7BW0Z&$W`gabZ8zK@!9F z8`~qZXu$nmERseUN2vRUB#_>?6lo2GjGfb)m}D+;RP+nK1)jxMFCKc;g(Dzx?RpNn*zSrQu`H1Kafpb-ZrK^4~uCQb&$mQKaZ*!rq&)~pZj_Iv7rdreECT`KWR6>t*!3C770$vvh6Yq)7c;bE zE6_F3bCG=!q}q!0WeqbP&bLeS$4{_Lw$vQZG$;@>ESRk#Sc-N^=^c*>2S3Z|NHu(v zVS&qyjr(>lm`zL`mNZDHaONz<15R z{gDB_;a%b_M6r!^!F!ol=JNu37q==oS^H+Y#5z6(WQXDoKV9g|Q-`SR 
znmrRviBkRi#8X6MIhiYuQp=T$_RMR($pFK}=IkblnGh>sqee_2ga)?5`_Q!?xc6G< z9ub!bOLSg36RhuXm5AyGo4LI39LI#~NR-wCy%#}{lw6FbRnJex2Njv`oLXDzavMl9 z=Iwq87SDJNqN^v`@k9AU5}AyprAXHKbW>CXQWvVumF{w1bAG;w!O3V0NFHwT8?CX_ArGc}TtZ6J_S10n_*o}VXyRBR zplg)|1?pB$5BISGL%AX2@}@GRwil_MJ@#J)JpXhxuacETQavO8^beg!xv zA&JkuTdhl$i2oIel;1I62KN1DU@g}SXVn7*H7=TN4wwJbOs2#p|1 z14cSi{MFfiOO{l=dzGP#1=ss1>UwFYkmMmgvI{H*wyx;;lgr`g=yQ;2DS{M=VcXEM zrfV;tNydEpLjk1pDTgSi4+fAMv3+LhJK*!JlaJ{rKi5)-343CpFI0GbkYqu18oMn1 z0ZaJ}(KpGmNG#|op}Qp>bB?o^xf-*;{d)oy@!KFDpKrR$?lZWDC#vQ6bq|AZ5uv_Ql()uAN^mS>bIV0gwOwKwovFX&*9G&UB}OH!!j$9N zDJPH9Cw{w5mC$_64P{oh*GQjixbWC$#+jXiPN{5B#sqWOS^2X*9S8$Rtat`=_AG%V zIKHIL8KFQ4FG=>2hV?n~RL|A#R>iGvVqCs1XnsYyWnA<6YxeZrj9-0qgQ-8I!4I`Q3^G3D;C?axSm ztW!IM0~znhEGt>hPa%$H1*}T&@1t__q~m(0yV9HEyXNZt+U!9ESTpBG4x`lz0TDkK z=7WL8Q|!7@D}9G`fqK!Qaf+m0ZZCCfl}hUuKQMM>+FhBMI0)nnzOPNZn%n=_L&BE& zS_HmI3>QpA;y}_<*0&+Z$i=hiuEVVk`r1pPp6KJN>)m*yKRNrD+a#A>q7ZTIRF6a~ z#A{q&C3pAEv|(&vMXM~fO1UnTma}jOT3U1tLj))9F%TC>5}T&pT#&Z^O7$+!()SL$ zk>aGCU}x)sF534TzZ5wAMr*5C@bql!QM0r^8oRUDmSs_~qtYncd5hbvzdE1`ttP%m zFwM?=FlWIilA~n?S55u+mPF`fyNjx@I0V@fx;Z=&KxYQ>YLoq=g})x0 zF~m1KTCf=6Si>SO=8PbVg*m?cBx}&M6%&`;)wQ zGg3W+O|}!%H|+I%MKv2x4{2i+Ec_*b4?!i4ncii>E@x8og%eMXdB{XfX31|v6XV8& zOdgUJH49xdq-ef^nJXJCbE$Qe8X#AzEhO23)$5+vx2O#A?|u--H4zy3snY7Vd$vn7gO^WWVfpfIpC?;ed7ZA1#gq2 zpVVdZdW;^=M=HC-o!V{b##u66=lgE#Q(-Bj>d!68qe!tX*4=&TH;Q3&P`>yvVxI%b_~Z=$)){pn*5#Os3SB|=H6dhlz(8648tO~!TVPY)i|0YS)F(60UME6550lmYY~yCR`ntQJiCAqO8_$Wa#Bll z_QGmyG8dw)U9c2~G{_XBavzXok{Vg$+L%X(c`TlgFAfwEN?sZ4<*h z#dWiG45QYG+d^dMAl3%?n`~^XgrorsQ5U%0X57fDl`WU`>Q18)Vz0MV7XmfeYQ9dj zeWMRD&@e)=8Ik+go-4E@cav)EmAc#|^kt&(*xg!RXA@l{-r?9#3kgvMQhAF##=>8d zM0#q=(&>j@6I)j+H1+6*O;YbdT{0DRyV#Wd`KFeebsK1e;$^$Yu|pRbf)DE@_2B0R zkI?H4fR|Fu$!vVwrOjVYsmRPhr~XO#5O~2L=~LiCRGuLUZb0Ry(lyuph=N)PW>x!M zvCg@LRf`|VnxnLRo=>t>=7d)kT}MxXF7Nbjdv>U@r%Iv~f}MY3>(Ck|Gb)STC0HYa z(Vk15RC=VO7*uiZ@WUjO1uCRH8CrHuJF(UklTMm8N;U0;clHqGb-OrXB9cEsaPHpJ zP3GYo3Dt=ikWMLMqY%I>hwbL>B8G_9di2zrYm<}NY>&nN@>E$fhlsl%}vlFGB@8 
z7rPVi*kC+7Bnwv9ZaMK=8*MpjB}S7Jafw%7*d5;(_TE&=gZL{QZh^DCab8f6ei?gJ z8{NLuy0uMP7I&05Uj5Q(zxl>#k$xyaX_k-;EQXW9E*I-=slm6Psi_rY_~&M*@_75? zaBZcGoP*a<(fu)rr$2BaVgCilTYZWI3 z4%YX3--{F%Z*B&f##^@s1_(*)txeyE~%z_*k1K z)IA6Tcl>vp_=ATi$xpwY-o>O${ocTI9p_pzF;|wUd(KGvNt_%tS@<~ImCM6m42*_V zZg`;#Pql{4Jgt>{;5wHwk(cmn<7hvSK`6RR3T|1cr&T@Vv6sWK5_s4Dy5vXCpu}>m zmQRKv2cq`L}1gbuPrebR|a<%y^o!SOtEibgN_ zx2vH}wmD9Rz*($TVA3ES;d&J&W_E!irgwY$sz^brXMEyy3c2eS_t2qI8hoQ23md+7 zMTint`!vyly45`&#F)v=Wc#xs!Mu^c&w<^Av9w2{(WTI0nkkvo1nY^W@6ba83)7QF zz%Aqq9D$sSI5iSq+%tOLr>Fy;Ds}RU#XUkf0bYxI=$*xmg6qis?rf$^)ZcSGc2BJ& z$OA4@fw@yHTvr=H_CAM}E~RYPEm$Rtxy*>ea$H0-Mn213aS^dQnmKlq3sXp%mRBlK z7bx6i(ml5x2!Vec^VlDQ+eg4fuccj<`Ku@*c6j+mgSlo)_;lAdtNK-cJFx{1HCCqz z^X7YXXE5|(1|{S6{zAACx3JA(3n4k7vVV5t5j`~iF|MP&>Fcu>*DrQoovtDKR!aRv zxov$!*F|?@SYDg-HwN_MZNV4u&y2_`zL%qU4tx_llu8S&qH|*|FV{Y_S9+Ho{Nx@;0 z8O}jvin^eEuWI3978gqitVBwWemvvY_RjlrEgRJ-ensA*>`!eleZFN93k&1(@nB3= zU(lydL%^ibY4d4DmBa9GJov#QPJ(l{zWvu`=l$hWX$AM z@8gO_X6)3klBldWZZVMDZlKiO`F+^zOVE&Z0qx}HE$;r>Z) z8Vwmejrg_ml^7AA#3X$*T(Quwy)XWg={dgAh;5PT45H9zNRQE%@LeDn>CjRjMg0j6 zOQxvj29;{*`LxK^Ch;0LlQq|T13cNmIrXhH6UrA;NQi}~3EhAzSL0xfZD6}d!6TnS z)^TiVyHPi@*piNZoccWMLz*;ICXbUXUh*AH1r$g7}rntZYss+Ow|l~wDyf0o;S>lwPt(?E4nb<^Q12kirQ!XTWoy1OeyzQYxzXxxsPa$gves>K z*R7}zE8Y;x`#)6MJ@G| zri7+7u!{m;{`Fg_i9b{ex_JNg&4`z88Y|LC zA&pNPN@#(UZeIBrFIyk{;(}q;nY78nU-| zCzr@04ONXO5m%Y{ekVlU4NCS6@X9Qe)o*!Yt8HC1<%gE@r5JSDnAUEy?klA~H}>dtBh4c)?JfKR;=QByZRN zWqXy0X|dbE-KTHZbSp$Trq3{ew zmZIktite`IT!mFMmeNDnSX7c#x|IxpRxx6g=`{Vc*K_^}3^7_t)|d6x3rn7^NvobE zM2?ziQ}R1|TGj6sX#TkQ3oQvs;NckvBX2Z064?(6oRxrYL|Si*54KHr&gCx8#HaPR z9N%315HjjrUw_~Sv#Khk^fr%m z1h#eSq)kKb^@0Caq`V^IgDQXT=O5}FTs^NXlnXqZ2rVhXe$7CHj;tnT&sLrH0%jhI zGCpK5czck6oRz!ch0%_RE|_l|`MPBa!L47sy<-+kpe*{X!R@Edm~P*^ZZOi*sZB+y zPj>m8J`-POM6mdgzPzPb;35a@4{vtmM1}sZbD~}q5-nLfJ79B0CcSCO6!NhH#n|6% zOUrVMA}xj$oQ@nDXQn?5jgYHp``rjeO>Bj0l9IMM28W{Lr3+N~LSVDP_T;5?vz*@=>F5Y*}I^DpGEeq+6kt6IS;nmT4LFu@3m1}?r_f@~aMqK($*;Q-?rNxGeync02 
z<7oZavkqESvVEMb;$sa#YTZV7#FM+`-=q+fe6g$*BKQ?3Xy{(XTaZR>g7F+Zd<(8p zE@`AOLg-2h+d#2LvUa^t`mhC$@S`8VX5;{GZe{rFH@*TDdCgs1TOmz3F-Qykq?4oI z9z=dWEdIO8A8f#a=_CXX*sE?b%n{^P)&i| z6_(Lz)u7FzW0l#xI2#wroDtulX>6AoJ6MqL&ATT8hXL)NS=5_(VP2Cs1%wHz~?kWGEPl#*h45yIb-&xUl}C0ckmy7OVTp(xvJOqX7%;e4y&aJN_S4F``d zfl_;cbF((9x}`T`(Z~7AvWtqgFD|J&A!A`U^V^oGu8reDP43{ej9|WMy(PYtstX$| zB<1EpQ}H7ctnCzPhn8`Xm^ta(!E+`1#-o&oQa1OI>+nk{y7MAr9G3gBX*&PgxTa!3cIm=mUWp05?-Et!p6>8C;=sSfqu9;szFQnU#xQmN27OW^dPIsag>ssj3 z1`X_VebMZA9Bo`*UO>IxJ+w#q-zp^n0}8qN$Ol6{ZJi_H)xKD-5aQTtM~U$7mSRL* zxdSc1dlDDFwx3FrD%qux&%a@b;~;Y)(GLI~Ot9Ribi-^qQ+4b&P1F49PamP5%L89& z{1HuY@{3-%2^eM=KR0=t$q#z;J!<8~a_`uxKU#x0jltIWy8&Vo7YW01H)faT&)ha^Aifw$MmgAtc^)f5 zus~iia6b&mn?01{?LY6>HTR}Sy@k+u-*GhNd&9o69?i|3+teS}vZ15$-I0`*BAdrK z%`y?aG^^v_ReBj93efBT*K>MH-K>!a?+WKPPRrQiZl_wOhElV+>gB-EdhgKh^8BtX zJ}#&iIYxF7Lu+M`k!AgLtFJyn#R?!i2;}N46rGXgleh`6B^>pC!2kahPjL{+Nl5d4 zaC5Qk@pFNvW|x0u&ku6%rt<|dgXw2QjuRGc>I|q!V;uW3oSACKM4+?eNaV>e0v5Fy zW;gp0>*gCO>!piww8vGQ>(e2z-C-%}?K(<}S$2c|S6O2^oYu$*6Gy?JLgbbXg&=XG z@mN{q?BPc1iSx(tY%=GRtgcT~@a|@* zzs1uSq7*XjBJutA3nWLwl6@$b@4IPoB)5Xcf{WLZ+As!DZGs1dCLFlts}tmNEd_4> zA6-`+5OvnIC6tnqZbT53Qb1}*=|)gGRZ2=4h6WXsZlybhXVauRp z=hJz%xlR0%s|C(yWh-JH(IdR0+V@)R+&>Q8mw(pn$rUB85t%wE%K=-a+Qcj3l0)f0 zS#wc@9hp=bLNXN=uV)iabC(=6>l3}KQu)y&UoYLa+qAy(lEAL~rRW;G@(0BsRHH_hu`l5|gdy*8iUgVO=T z44=Dhzy?kBeywZDFfygFFdSy?Afq?<&$?O?lk3HgPo9(V}<7yS$1(Ee%&E zUa>iOz}X(T+U@JiN5iRSeQ!KxW_RbgDUg{U&l|X|HP7zpFstc)=YN?~J)#pLIQo!a zha-E^mmIavMF)jNTKep$9tjP{&wOmt*R^DZ@qImyp5W1wz|{D`(tA}Zu~$+9gCr9V z9_f-AQ&-&{|JwaeVAM!Q%llwVwE-fB%SOb@a zu_r_mr5~d3YMRR44 z+DIum?g?bEuKBd$!O9QrvJiKHu97O|v_v6(bBS&YNc>{Lk%(R&JNy+2w$E$Z#r03m z9|z&&c4vz;_Zo&hn}nSd8?U>MR68af*SaL;o-GZ9AZN-N!_J=&r=?Wm&(^2v0Hb}!Xkp&Adq?Q!yhjkjc8B?JEQ zsC`(Pu}-g6oVERKf@8O_Ut+CeU;THYxQN7C3#WD4R;Imc8qM2&d}`T9u}Z`@v`%K( zyo6WM_UbX0G@QFw5tY-RL7;`q$@S~lMQcvZ1BS+sQyBF5oblG`JLe301!_l&?n^S0 z)v(apef1nQ4qsBErJKt$U-c)dYuo9ZQTD}uv-8sY;gW`<(_LoNC@JUPw%jLY2W#(4 zo^<1LaBx8BqoS@hooBVw;!`hj%i=-IbT;46hwXO?L1WbWv>OTmJw;Do 
zROKYRtI)oe4Kp7keAgB;>Phl~d=p{}mI@M)PMF2Ey7ySF2<#Ru;^eC~gO*Q(naabe z#tSY$f%!9g%_N#p8i?c&({fvg%h(8S*%#hJM`@PuE)mX}Rj!@rylP||2a-Bw-Td&v ztO+h_YJd(*i2=-%Aa6fWR*CKOsW29vaDxk?(z$O0Ud}`A6djVAv&W&a(6^e)LRYAn zUlVvDFqjmB#P9NEvOXL+lIHV#6Q5+%QlQfV2d@d3s(fINqX)vbyC5wNFH&ES+P!LI zP}E{BP)_pfE zz0@ncwKwU)qLy0PrK?qjw+CvdACHK(c-P%(wAaYk2R7-Q4PI6G z!%!TxX)_0{>$c+ua9*g{3u;kQ^b5|_jW`$G=>MG9zJ{m%o-eNP)~#$dyrul$Wa?Mf zpJ3OBLo0OMcvEy%R8VrrXzb?R==N$&x48Xh@P%pdK_%*J6oCO=viK*Jya89>fu199 z%0mtN#ir)wgw_i_Dq@L95P=;AxA@q~q^~W?snE-k(JQixhczB;Y!Jh} z1UKdU)nu<%p?4lLFYq*Mg9@B|zhx-{zEV=*rID9@xH*4jpCIT{4?o?L_qhf25;U$q z0~u2nYTg4%2Zwp!2V;ONh34hlNOkE1zye zguJUV=89O!e@=ftoQX`gU=n}wWTT`#u38foe<&|CUGw>~uR#ApK0Qj9YkoX!G?|2d zM%f$;DbJ#-npv+>8e}4ay=o}3hwPoLC$28dWq!@5{)3Y|aABTA zPe8R!r?RCqT6O#Cp~ANu0+O~8w$76Kl9{&eR3;b6-U-#+vJIG@NttHm(ipx6saI5G z`#81>x#h7AW3phSG2}s8;nX)FIxL}}FFhiA+s|rU)%Sr_ILTjnl-JH+lEl|=7;^)5 zw$DixT{2v(exlv(R&S+_C8#^FV1MxBp>B88>L`hB?%wOQR*m_OQ&`064!6Ex9CDBQ z7qAiP(2utybFBwfE9_JUqacCCgfD3RI%WClGK$o8|M++f3H6%Vd3;R(`R`md2@dP=n_3n=_mk2gxpM~`EVkJ~yusE3zDS>3*+oe46*op{rCt!-rDTYJ+)Z1kg} zOsK0$Mw>a*S}P{4!aC|#39-dFdA46Rq#19%5zG8BTx_4oFZe{Y`^}K_)9N?*ua~2q zF67szZ;ChNh&1z8D%^>i%nh8O;{FxuAX-UW#v3S1q>& zz2y}9wCqVDEV`hkQa!Nr`Vp|<6*+=r4Lv9V!87e@?r-=8I5bJu$f4YXlzI455U680{!`4tJ|!je zSh5|+qCsu-P49!^M^%`_soi~sz}ccVEaWJwiycxbAkU{0apy4;4NLv{u`DNQg~bt# z7jJ%I#cD)gtUHlgM!C&^QJ=A(z2XPvz!gTVD}I<1vGwlNoa_5FdF6UGB080Y1GiJK zY?f?mZ}+LG3szXDucMNk9S?Y2^6i|6@7=NPtFp^voY-uU_hr2XUFKDM$90?7e~gH$ zL19?NhojF13ehd>mtQL~@qevrRWobxo@?bOB&Y8njYAQl8lJ{PGp9NOO+Z>j^E!ssrgIgN$EeIN)YF18nR5NLR!KY{ z5wZ*V-eW9y-=dcT*D*}ig4xnIQ!i976T#GgX|)+m+#za<4pbjdtiRiAG(q@mfK#Vk;d@=AKA$$vZOzN$NFS<9D}KWwn$Ms%@FjR4!=(=MJ^&xAB-1Q z%zfDNA*-RwUQNA>gO8Iw$3Uanw@&s6@#o(4MU8csNe&NWT9#!I20$^%=q3O|wj9{x zymg0dRWH(Ifa>5mL@9StzK;=JPL#o+B;F?P5u1;1R z{Q?o6@$Pv|r+#lOeAraa=C8&{dqcFiH?{hb(-*#lMsM`xgBMCm*FfP6I-k1zZ?Nys zW|6QtFNyB|I$YW~m zEnsaUv#pQ_?OccTb3>)lG3RGSjj$i;CW2vjF`|wpoVxAL!VSKSZpf-kxTDU}E7gx< zQJ8j5-_<{MKRU7ET^qiEJpHBE5H}QbWm>HxgF==ETBC>Gvni9SR+ox@pw6+``Rcbo 
zvu<9wmFOOgOS+@0|0H8Exw>pA-=av+j$~+E%?bn6WqZ&fJigjN7XD!#W!6dV>y5Ov zmdNzAmd7c@Hp_>B^>5u5qM01C;vQB!7juX#9WCpAQ}8+;e#A5^RvvEiHX&zLXtspr z*}B)4VW=*eY{`QhUQXb0`GGGb0>E!ap!9M4{g@cbukJM4 zD}Zl#iDr-0G4c9uWRbKT1y%t7*#Q?URT_>D)sF>JCL8b5+E4lP`gxQX>;SY)67~KyB(PUO|jP(7G{33$! zZmLNDo?1xFGxmy;>-J4Vzh%>2xPqcX=`cqS-B#2r_Wk__LEnO*$hTv3Kf>ou%}G(F5Kch(x^^CN6d~chv@2+r8K`1EJd~ZKGeAxTs^32t>AFl zKbg#}>ljojSP-7q8Q-yGJ+A%QE+RO@1CE!WIP?St7~YuoGq2 zk(2Y({G0El_fZIrBx#5a=ic{m9UZtbhMRq&v_n1u8FQ;f_P}ww*yLxzZ(0>PYaXjl zwU{DbYeY`Z^O+uXwoT|96pNJ}%2k4t{(;whIC3ES*h58+1R>KfEQ&-PdAh7($OX4T zN3~jrDC%tAjifCyn(Fwhk+KPS%;EUD+MctH-6zQk%8DI6Maw1U1WFsJCBOSEcf``Y zim-)Fh;2;TOhxV8cN>RXq&ntgMA!zt>c;Bwe`ddwCo)peaCIq*1!O9o@5uG{}?-!`~kV91G4d*b?j zWiv!;aJyTD19`oWlnOyDvh(ecqB?{uCirG*UWtcS+X6v+JUmI0dar~7@Du=bCR8^y zez=bc&waVi3!J}_G9m?Bd;3JBcAViLY^2Qz9{Je_I-kDjRHV!Xe!)|C6?Nx&{#w4{ z9KNsg3CD-FWM2u(CydIq>ai8~@ow2wJ=e;Do9S;><&eJ-%yJx?lh1P7TCsN&+)tj9 z(={3vU_0qZ&*4{aa>>3*BhY0x!>If;n@w!-Wv~KY+)$cP+S~Lvbo9mAB({ck6JuX< zde-wmho=Q_xR^J?WKmzNy!gauy_a#KJ#eZ>G== zQAA}#4`Zi4cQW~mr6VT#xQBlS;=YjYlgyZdXLt@75Q_+4OD^X_L;3CMY9kwU;BkXi zy8?NT_T8!h==EHH!s205zJhYDY{U_o4;w%a`fwet)}SeI%4JQjtPh!EtU6@vMGxFt zP4+S5;kEPal&!9R;v{`h-m`9{V_+F>g!fq_(&Q1?M|XjWvm4o}-0Y4Or=Qx0R2UZPh!btdX@KT5u=yl5$f&v^WLf#;?ZOgJ zyp%43&lMnvcedSPcV17OPbPlq91kneVHMH>Sc05GPf7m_DyXa8H#G6lc~G|mLGPj9 z=FtHTM5T2kf)df$@sZ8}Kpj9i*DOyuvF{vtLJR>ZBL%m zx=aA?*rIYnsKh)vAq2K5SD9+n=7);GDYgNB8|ZcEW#__4p^^V_p+lQcwZMQtc5#5m z>GQGc54EWLrlos^4SFrN(TvKXR;oFp*we#Tb!Qf9YV60-#WNSM3zn?ypH_kbTnxqS zeXVZERT16~HN!0xll>14;rw02=#0sa+*nV#bu7}|VyhS&oJ(>+?Wt99hOdsgS54zC z=#*G6vc*P_gSX0p!l)(aJLEZ?>u7Z~-8p0Es+(o`vGM8xKARerD|@x*6y)n-_X}IV zWYL5PDUN#-J<}yW3`$fL$^E=*O1jnSkl9xL!{b{86L-sk|Caf>Lgp;TNuFSkP&nuf z;w<)Q7aww#2nZh5x)>G=J)4eRG1ntlBCyZp3~?AvHhgeLQ>aaCas#S=)td?ccq(EG z;Q$B7dxj18T6v8$YxL1&LKCHIB!)pnwMR+rG}^Hv-Ruo!uQIEV9BRI&H>|6bh}P%8 zA+hIGbULV_)t~$7k%-i8VBZF;uxpA9kyc?8^5i&xuw+2M7)6?1+W)AfrE_<15g+t{ zoq9Au)Bh5Z9y+2Qs_U^LOC}&Vw2QKxLN(OVU`sI+(F8Y>I^oD^vu>tb!ipAK{n8*Q 
z7`AD}f<0>n7GS%eCMjU$r9--PxG1L0S#+!JO-v@!I085pO!_ z^Jkl^y)2O8H%^ScDsByI1$wo>01%tDCjB@Kbo`aq1D*#lB3Y=lzrkQCM8Hmz9mOPbic?aKAt@KCb=cy=YMO|c}Zv87vPZB5$Vpx;EOSiv)ZFw*&h;v zkmV)IXSfq?58ExBCy??*)mEYp{J9nfDPm;1^5y6JY&so^Myo8MU5j{x?)u&{Jj86{ zg2OQ&d)BM{zopOT1jYj#@>S9*Pm^gUe~i)_zYMF5!?rGBxrE$M_lz(Lcit~cigtwdsl+NFqxx z&T|@vgRfX0r-Ld`AdFWac9NnKBi*YnuJ!lXa59gV&$slZxfgTtte|E)xH^ydOB8^X zK|A2K$Q@2GX-x_!I2s1@Z%?2gpFUs_?=7}Owiuu|clf0#w!LvGX)$q<}p_wrE=wDeJ) ztKHOO)ykuTR9wjcM&vyJKiBYCOLxN-Mc30gid}nD;(5k4`U;c_-F@AmL;ZdgBFC`oUdhl{%FrgnKJka2(yEs(301c0NMo>RfYTJJpB9r2UF=wNmo zp5+{k3Yk}1h8B?|{^0@0bK)!I+r1uXV;}tM&YML=e3`vDW&}OL#_=)DSHAa$g`Mu* z3F~aJ7_)w1PuVFszp?2fvZ6!ljbc8#e7{6 z*1Dv8K)nkxQ7Tv@eW{!fZ>D;-FFRFYY)44LJt4mpI-S_cWW`#nQZvtcSTVlg%lDS` z`1=m1I#8OujNul2$7v02x{!I&o)G<>*@F7LD`3ROT?22J*J_usalRH7bTy z7XIc=oD_j-&d?)EaG^@$;6W9~lV}P(?yLSIcjoFDm-fDf`#?~?PyV1D5fuuJp1fP{ z=>DyiX^v{t0YJUvWo}KI??x=@UrRUhQ@L8>Lkt6)Exh;<{U;UMCT#+B&w-`n#1=-0 zb}sB{2ut6h>q+bbpX%f*M1idXdyxFa&1Ipz^izey#XxHOkc^w$?CWg0O6vK$>n~^9(2%!YCBxi~cH0taj|{dqow&WUv(X$DM#ZzzhcPk8VbGn1%A1Ht#_|456+ z;PvCNc=_YV_SIEk()LAt$=X{7W6V)W3G%lTu(UujKKzmpPaGrpLB*sD+3RynV*_7xUR@U?qGHJiSveQ$f(<^EiK+?N2#gIPafp*HPI9#98b0`q;9 zXIk%I(q!b8HR%h^DalFfhH-9;W+7`I1Pfwc_X#E1*y@WkCM zpja3C($M8~=G;IqlAhaN#*%w3B=mk5QYe)_a?j&oo%TpGl)yLZJ}hm@i5Ukk*j<_N z|0%C%zep1Iq{}HNbP|W6y&VdRiXq?mN^-J(*z2E(1N;g(D>;HNhR}}FYfgZ?%kJ>` zb5eY7BsZ8yS}4Hjqxj`ag)e!1176ra*Em9^1amp^&g16}?(TJa6^4FcG+#$aXtRtb z-*P-Q6I4Q z(x^(XT6k2InrMz$o1{8vAzE!$$+)%?>xfT}<0Kec0gCZ*Y{yU6s+=QCl@s)w=L4#m zB})|;U(L(u-b7S~;j$^v{7G-9c#)kX(?rWg>^fJ=a#_9Km1KwG-Awbz&CEx+N~(5h zGT0red9IIWU6l_$-at9af3Gba#6MG*Ri#B#$7PjM%8s!-0jo&gS+Tb4e!{8F!D{@b zW~b?a7UbPhke7v4{I%ht#k4ck3U;ER*!o|)ZeFbq^ZkS z{ADk$T;B1B0>XcTxZ~=vI~mvWAcOrXryDza9g)!kSTBp(A_eH@xK1}0sL8f9^8u7) z_$Hp0rm8y6uFZthW(1lHBYUIr5l`JJaJeZDN$r?Cq3!Px33bN%H!s&IDxo) z@!y_nSrD?NYWr+jA!bPOV8}k=%5_iA98`9f;AqhIWB@qDO|+wR6n1v{wyA@}9KP>; zO(143HT1H8y5j>#TC=ud2&o?^DV|+wA)C0b%aQ?xRnEh-a054Tu;6xf&0S0P?o;qb z9e4ZiFMdV~MfRY+U0f729|V=7l(0fyjgV973c9u+_5e65dcxZ`G|wv>@fBE#HlWlc 
zc-DEIH3Mp32z&Z`Rw_<(uXMhv4n4EFo0qO5D*1_~77U{RDv8z$48H#Y1I?} zaJ~U%E`wzb6fmh=7hc)?hzw+A>nV3w*^_wVIhzY8<(iDtPo3T96}zXVmT5dfiG+@s z+tf~(t55f}PLBcj0YNcTnsN%$6IyQk<%QS~5l9wkKT(O!!|076EwK5j0W|16!r_EXj!o|+}dG?o(*0J8@ zspb$c0v4!qefSO{ar^lraP%Mu8B=@;!&nx_3tv5;@wwmvq{sop#oFhihG!B?m`w1w zrG9hpOtYt82e{dhgBzE|LKN{f=%T`UX+u<~Z4nBXsS%4hhucw;^W|uQ`$IwEjyo2& zxhYnKiEE=sqjMp&uzOB24q!vkKogfpp>r0UMNFE1hxG(@cbqBAv-Jt;GowC7#!lcYQD z>wD#AxCcL61D<)|^xid4CjBdkl!`!f>l~(B>i0uAGIHivS;6m(LE?h0gFk|ibnCR# z>28Jt?qv+UTESEm&Qhl|`At$mXoC4fIn(lwKHeBCtpw(1Z6&o%Gi8ot_(PFJen=*lNvq8$2XQ-H_4QfT}R&aOEc`E z+C?{0quk^@?Ci6y0bBqZg;@d}3cQLTm(I88p09UlTpXfmwx_v^a=AA4ngH=bdOM@+ zi6&blVyWVn1H7RUG70S%`zGJORI$n3B}^a@Ip7`IvwR*sav2diyXPof_iUHkYLwie z-+w{>A@?u2?Mv&!ylK+g0tvhw-XtX?O2pLtmfJ%@brr>c2S~5V*sV#$Jt)Kp`b)gP zr@`R^ARS)<^G(?KWY)u~vRfbp0|3z{u#4Hd;^~aUD9h1a8uoVUUyv{t9Lj^g2 z^Ww9_eokR5wC)45$|vZJnABv$3RT*=LQrf!<(8bmQ#v+z^;-n61NM~ragSEiP5X}N zOb!q?$((R0>>eKgVF}S(oAqJf&@UH4v`qZ%k?dJ6mT*|x7}~`H>{eY$27ELVT9KuQfW&v{ib^>17+C_vcE(bEMw0O9!=0d+%c%?`&Iz z4R-Qm?R_~Rk^P7LVXqeudxf&v%aq zv>0#^aImcs#`0H{#47w>&dlqw9nlvF62YgpHmnucdN4QS>E!z0TI4MReYngY*)+v~q@B7^T({ zdbTRuBwO=fv7ug3sO&?8(BVc_*&A=b)wzedv$whxDb$8*R`G2Rz-7DOKQ=I*{kbt+ zmQlL0*jMTxwM`}+(-7r)?^~kQE68cL86RDJ=_6aY)iL#j6asy733(7Wz1+ZYyVny* zy2{GRIW;vkBbLu076_PjhGD8zsF}b>vi5n7c$wJyifx+)ehvu4#7WRtP7UN_`2bvI6Uh~lH4C}M0&N9-=%2^v%91o) z_L_v0?d`#i4;7*S+9*Sc|7|aL=<>iJnOJUCy}baHYnVaBmXHcsbP0$37Zn-?Yj=!If}4QM0YGacn=H#6YB zlCx=of?y2~xVbKC@S^CpEb()bUZ^CCZev>P7K3{tM4hH~DK+;#$a-V-xc#HZDxkV# z*p4FcUW2JPQ9+i!?orcak?5U?eTYxer3)N2;Y_F%3rVJP7gog``Wm5^aRsjIpW7go`>GKgpqM+Kwqy&YVMsNe;XeS&p z`*X7Z#^-^W;BFAa&jHOh@Qsq|3VRyN`&o!8gYh5F6iNiPrO|aOLPCiRGmDVyk!KP$ z_v=_-wD+S8Qoo~8rNrF>qpc!uV||0m5h3RMPxFa*UI@>9xgvUXjC(aXsP|xoLwn4f z{PpjX!4pC(v;;TZ2rM+q8XGLM%9F_lDiJ;^;dB3-*3ehW9Tmy-=FvEziQ_NjEyV>X z68D!<(qDQs&a`n_O4Pfa;D}AJ|I5pbk0ELP@fXr z3JD?Mv*wD({xvkP4tG-Vvt6&G1ZyQWk0s$V9Dae{s`!L9WYF>@!j+1cK@UF#*&Hx{ zaj?5rA0Yl$H?4x*c2OPEFRYrWSV=^wPp z_ns~ny{@(DDUGT*9Dj`QLA2tR+XI@z3IFEkIZH=5jFEWeDrpWVW(`#Y8|0c~-I>nq 
zV`8uV(XKU;7v%kIiJUDb@mtndAnZB4SRw}w8)5(bCKklnE*eu`&U>9^HA~`$mt<+4 zS^rKt(8VMgmOm*1=I&hG*r-?ry9~nf{frYE@t)wEA?1XPP_uzMbv?NqLXm)$P#{!x!_$)+ksY{Eg*Q2bHyICY z=+rOa{W2~P5uK1#x6QuF0!~MQ<3}~HZMBdU1P%9m#r+_kaY4_%Kc~P}29~fW%a*`@$KqP{N9kYl$Qe5Nk^+iBcLOgT7>$(+ z>XJOZdi_xu9RnS`shJBofg?=H@*EJO6RF*Yv4JXVB~hx3pkFqvXIEE`fFUdMMtE3) zrHd!x;*=S&ivw?Xl%(Oikfz@ll(aypvV^X2EbSpJV>7>E8o>OIgyT?izYo3T2A$=& zE6eo*e<+3a0X==PH`LJa?P5NZ&r99IZ%Ic?sVkO-UGqXq*GN9*djz@i?M+QS`-GU} zJR|LWaPfRvYGhRc#4RrO?wsWSfAFWk!S@M6dQR7~7HW^ei7z7M-eu_IZj>Cj(ZfIz z=3H5gXvC~LMo|CZjpQ1hyXQHH>v~v{-TTNxB&Nisu<;0_2MNS443!kcL4|-j90}hE z0!jTU4#&dpVEK?heQN-^%QPIs z%{#DKIsac)-w==76_CJ$U_%yK59y9bU72)FhEOQx{8nw$8M?c!{n}()t@~ErQUagk zv#^Tp6vzOL^|)$+1RDB^4J0B0U5xW)Y*^#Gs*vS*Hmb#1NWg_GE+JFnzx#EC7K7$k zK0JF1aWw_IVHkfeK@3PtfBAPjCg8M+!f^7a|J!Bv)*|TlpAe9e|Mk05R$dn~+40)H zqR{E<^cS~<-tGGe|09M2r!10e*uK|z^WCHL1l?hZDP=cUo zKxl$;@nAf>VbCHgwwx`nc@p?mxr{x>#YXENH6jI9f!Xd&7!dW#{<=(PY@#;FUKiw; z3$Gr^jB{NB4t5t6er=CKr0dt2({3RH5?`p|LOvIZQxD@w`c{#oM4gR{*l&NZ5hwgB zhf=NXJI~_B1}ozT5m|hXb^0R^wJJb~&K+QtE>B0qU6h(Y(d5nb&!YbLN?)bnP8Rpi z49y~{$uyK0_)yb_tsK`tpsM|U0u=#ibcyWovabrHN%6XTRU!0w4j3^qVE#vhzjGYKy10PD9eMe-&lK1OYN97_ z6S55=drHxPYhC&}-#~65D!??--@a-FnPe zzDp1cxjG-bmy2bv0g1@3eS0Q?XE>YLfo%I?*sY;Vq&6ZVLIVwl(!|uaWre`^fl^oJ zU0cB1`a@V33(W_@S@WP)gjQF*Ljc`KhgP>kqT+RlxO{c)bWR}kMs;rf_Vm9ZIMw7e%zPE=;s-~ zke@CS855tF*hM@#E&`U+-~qN!Ij@>kE7z{OYS7!wf+7z2M|c1DiU(wgz%Y>i$1w2v znRqkWD-YHMGbMj*5zoWRaMChDVM3S7;#C+*R1Q3Cr#Jcz{y*zdDl4-p%Z1vCr%@Yj}Pw?=;_*MEw#K?Er4S{*^| zO`PzsFwOb-`3?t1Hj{V~#N3NP#I=e*RJcsg$xmFf@GM2$ENTo&`CE(<)`^|KC8nf;w4csvGm*d zj9>?S?~4D)vi=aBUlJ&{(4R;o-^zI->J5D;tk_M5vCOd8;F2~5mWM+qA`m=Y!%6VV zPw_Ku9Cg;u7n|vYJ~s2X-Lng(E~IBm%$)KSIsfthw0sC`ca}U*Iq&;eDJboFLNeum z$rgh(R|M#e#D~3)mj|Mp75V##79xDdn%0-+Ga-Od*xV(X2QI`)?~$n~zp1R$wd_*= zVh5}dw%|$qOYs1W5W#A*w&rL2 zZEcAAf9!_#n!B(|x5p!adaqwP1c_=gg5D?2A~W<%(7oC~DOkGkRG-$GZ!TAirS z1B3Y@t<<^_%L$JF!A5$^&co!ltM?S1yPW%d7}M-oKFijwf^2Dw)sdC64;Ch7aeP$A z0nlboJ%n|bJVSvjv*=Nmih~%#sl<8t_cnhkG)Q~!0fe4^mEwMLHytBH3qT@8Z^JzY 
zzFZfn<-rcX%>fu*qX_(A^y(G(@y2KWfj#%>LV%3)U@NHoUh+j60^XJ%Wa3?rMx*UY zCZK>0Jy*Fk zY9Tg+dx@4{+8mWpDwun>z()1Xi5pqqBANT+>EHmxksW-X%x}rgL`o210Y%tXk1RwQ z@KLH^8$2_2L&Hf3grpuupNv1v@%PE;U)Lu>)NmM$Jm3H+i3tfm?jAT%j%4wnf@NJ# zvX*WK78;p$GmE5TEcor;gn);R097E|uLoSmJY0Yyg^0??f?WCva=sh>C-9j)?wHHi ziJa1}<>N^0hS7FRSvA~ZzD`9YVg73`zXehVcu!qH%Ez5WF+3ort}oiu>Oh4bbETXv zE7Dzt+s!HW_OYM=V_?8%{Og|Z9wX)OmVa&q_wRgY)X#a51A4T?U|mr+83BVe1?aSG zj}!rT(eEbw>k^nz+i&87Kr7CO%R9rUf3D2#PiHG)K#gl5b}~TY4LzgoDt&!BjH4>2 z{>ppUUzag;4@eM3b{FyZbGP(50^ydfB5JNnBbG!)0FnNQzD%U>;Qm!xQBx4eS!;s< zvDNt^nPwZuciWak%4Ina7FnZa`^_Q)9iQ#|K2?LknG-)|-d-T$`+45Yk z0+o-&SstjX?R>p@!Iddbk@K;aN8))XLqe}>EG;yNLsr0N!pdnBwF&sPl&kZiE9L>KL4X zuwU^1#Ark!D4QDTWWkWNw|f zEuB$-0~s=dP?S!T5ZVDoMn+@*Zzx}@wLUz`9yvHT$a{ObcaiL99BZq_%7?@Cs_xt=O#;k?;p41>NgEryaIr$!unaEFmPa8=Db-N0LJL;-y# z2?X^W3HBBFfAF$DEUe){gE?|26@1+K2)Masp8Z-P;5!9B!`2fYgCYqiV=ZS`yv8hp0n&-^P3cNX}&`$fz#1l+_s zA$0PKH?n)q{+mJLp0n65ll=x1er!Dw<1Qz_NbBzb^R7LL{--+qLyY@swCSgKjh)!% z*Av>EIlyiRfL{;wN_h`}^gBpuXmHRuF{0vmk(B=aU7~8nO^~K{T?zcVU>@o3XH~MB z_s3mZ0Fvnn6jkaHcsjihMS2-7_Uwg-h=@Z^9Yu%Bg(v!9%y>vFgp_l@C3FHShMK+v zIOLk>^S_4Sq>tcS_9h8byR^&DR>5IgrvaE0PYX!02+oy{&FOf z;EovEvk=Y8aGA(E_aQ<;^^r#|53oSu?KKR3w^>rMUgTQ`B5cv=XG z(SIf%655*llM>ply!`$*5zy+>A`YeE3OL;0P#R6pfx=B)To=Uahw#(^t8z>H`ys4j z%(1pax9iUOaK(w=o4A(%g5y1$QUH3?Fib5cCA7@#e!?g_ulV+&K8l+K>QWkPb*nYX z8UIC$sIIgj${v0OZ14E(nlBj}8T%{H5)1tnDMVA6sRmr3ig0abz_memn%c__&U7<< z|2WIvn}Z=L+0)Hp%;h)Vro6qq&G+_R5+myPBGAGnftYFCC#&TN`>A7PkFS7q>2C@v zymJ|F2;H@xF(0Oc7I-D;&dJ{{N&_aahFO6H#=DP+S-B6_K(yF}MDG6WE$CA`dt75D z6SScA=AbIKpECh8g)b@%10uBqiu^_Mrzc0oWJm{_h9m%%Kju-oATj*j$NAoN5vwrG z&Q2ll`+xLeK49sNDYkMAB%~BntbY+M4FXEuOS%SL_PhDOWOpXmMRXt*c)>>R_^ThG z^}E1gx?d+HKrAUh20)5dU*VmUFey2Fbiz?7i4wm(*4lq=9W@?oeJFo>@)-v3J6nwM zTYAO-g0T|-321?ue01&y6}fd0#6ePRJT8Xph}v2uPdLjnh1hQDp(0#4R;{95(^ zaAfth8=r%qZ|ofV)cz(6HZ0PPieFxy&9Q^q0=z|Ra@q~xLq!o@zW+Ke?}2BiFO8lZ znV@Od)mWZ;CefhO*$5YT{5Lg6e+P2>?WP|F#dk-~)!b^wTea%vh0=bz`27$k+`C;( 
z#us?wzlH0F@oblx4KPk*Io~EFxHPrgIS>7Z{BK=2O))^|f9ttYPuc+tKxAE-3~u;Xc54NL~Km}o@KDiXmV-J|}QrhY#| z`n%DhJBfif7?Q*SiqixnkV_;KB{6^ih=BiCDVPv1criZzI^17t5XM7FB@*;chzXBp z0fw^w^&!YKDK`nSo_j+vIESf!JWsg{f2|lp-*{7j`JN_l=1`PA^S_;t9_EEh+YRL9 z+PlgNTw=gw7GNRk=pO~MqQk=2R{)oU#V7R8T2kPLu?)0onkoY18IEhLh~bp~O2t$0 za)9bS^Qg{E{8^#>^IG6dH!{ej?zY0#X5TO-4BMQYo|JgQAE3-19M%TXFo&4JBV~{`$3s;|_z?bK-uc z#~=DvkB^U4b=PP>^hH8Vxjr=luoVxC*vs&DVgXIfp6q|cp$geTs_FIET>>j=Jb#uZ zAn;}psCa=)VV3JfHL!7?8(%16$JQT>`u=4jAyUW^{!b7?Qs7o+v6L6~dLZgSolGbJ zE&%$<30P=G`)Shb<}+WCSY&?^N~#J6IR62L{-_1Kz98_Ycxx`_LB5YkNwi}Sf#h^c z02?oS?@6|}14@wXSy{GoXSdJ)7u^Snii1>A$t5IMV&@Ar$pdjFhC&T-ha47A9AnN; zKNhIq?e57-KVs_b^q)+Y?NI;CReSMZUZuIO_Bt@;=?Jnze+m8DLks|`Dac)3&1x2y zLh=&BS|;l#DI(24vs8^i+&}ogYYD+aHnO$g4W48*EsLRl=|jS28fvC*Ajfu<&O+>x z*E=jHuB?TH(wfIgY=HkF4M;EBv9B=ydOpI!&-ORi42N@6SZ)I!Ud-=HkirDA+N!z; zcO`IQ9|^`mNM4^p2kHA!yv)(R9n4Srx+dPhoE3d`kV;3fLXY`s1!OW^a8WT7P>{bu zPQYG1aF!IPRYX*lRwOsC>Fi6VAZ}Ct&%Q8jggAFR>yf^l7$@+w7H^)Pld_y7nkE z=GA{B0;B|d%V0l2cuSUteGIs4(gT+?@bLx5YSZdE%Z3lwm{35<*-=PboGYz5n{9Ck z1IzsZ$4e5EXMj<}-EW8f7bOIUK%-eM+`edUQK$&*``Z zo#TgdE?_AC@wbRK@!Gk(LF3w&RrZ6@y0Zv2#L}8QA^aMBL7b{w5q)BPUYZ zOdzaeH8+V~rTD+Qpe?lbnqYG`dx~8`syBzXLJaOF`0BXTS|_Vz)|o(AEp6j-YT z`pzj$bKK>%Hg08KZWUZO0rfrxZn*iOx*X;5w-u6qiw0~@nC-IQ@q^v(OQsGBu-eaC zsBp|TnnJWzxu|f>1)71&sBnq<<1G#j<|4K7X66;JeH(}EK(%lx$yWrwCH(sfDUGS= z$GfZYTB%!t{$TPm4%O4Z!dYBVxlj@9nD!qq0Uwd z7O)$J~^mQYuhuOeRG}y@rR^zj8oparlY2wfx=i_9qqU z|460b1*-eu2=meW|M7L*@l@~sKPR$7W@I%`Sq+qV5NQyhV;(D!9a%@lF)Oo0NDd0w zdml5Dy|*LD-s9MZ-}`9XzNz2+!^6!vpYwXZ#`Cq`Cn2k3%RV`Z)og+DJ;5Md8;NU1 z9Q^u;-d&LIZA!_XX*e%7CGnQ1tLu@-e;W6TA3bl&&E3~wy;bZ@T?`91mFGVf4XmEA zA5|ximRLpt!bb_8AD;T56>4d!M?P%?-S`IN-=7cbtFOkEtw)O$hgPM{cnoKR5hyE1jZ zb>@Nt#;8L7+(1??}_z`7rNu-(O>JI&a! 
z+xuXfyh1uc8og4(+xe6Tdz-azu>+l-?`ZSj!2@^QV85cS(6tLLtGnCRLeUG0PFx;u zkNo1*uU{sR<&oAnj^gF6A&Zo|t8vzDOG3cT0(!VlS!mfsDkb=hkB^(6&)hi&^k)JI zZIedhXv7zeEH4D>Vi5~b?+Evs@G#kscq%dgJ zAf0-DA@sD`8NA*H7T3JF96>3R$SD7)UHk3)&ZI~d&V9#xx9s+IHo-h?nO50TVTY6Z zae3N7U8MW9Sx4;6LgO%6Y+xB2P_Z_Z+rWb1-XQ~sfVcwA*8|3-wJE5#x5cH8E-JISZ8W+wrV z3LSgO3FROF*h+Zl^h=Eu>K3~))X6ph2U{b?0=q7D;c9!(SXM&<9^ z8+rA%KSN$Q-Am<3*~fT$5p%0W-=LFFeH{pzYHD_NmQ_bbXJF_euaUEuJ;v))#l79p z!CLFx70!K@!U07cMZOwbdFo)Q4PWRAWa({;i@TSW{;Ce*EFhU}suAA#goTh0do!?< zd}eMAqsPlY965DtN=qBlJV)KpzdK&mwHBkw{ud@nlK>`mNgfmcTK8b+VR~nTcDnC1 zU>Yk5z^1QyJ%iO%QAvK1(GZw7f}yV*QVrj)$T-G=tVp>L|b(_1CbBz=*m}jV+^KdGFKL`wwzrWw*{#yZiNoyVnrmv@WT~o6Ll`j` zyY5Gu2_L+uW8Qjx>27~+l8}Jrac_{jQnGSW*}8%DolhzGRIW$TXL%xUgDHbE@t_spt0L0wDG+{il9Kfn$XZad<0MJb^c5KSL5~1t@Aw8m<32 z-{Vp|3jXmR*R;FlHT~mfp6yo}s5N2QOgy8V9zVEG6D4b1!xbCyc#w(+d;A#d5dn}0 zSdV_1C7}Z;XbHf-BMrT~8)f8EmL}JW&g>It`D4PTo(Q~)ccuiSG=`MqwoN`nIJvpG zbtQZ5*)AXXNp@hnMu;1^y4bvJ){&+C60Krb2d;~kcPI7HaZkOlt3wi!O)@iQIwIVPtkhh!aTcM%(9lQO*Y5?d!>Pn*+Q-nVjUbE=LH!2y%cp>RFwso zX4qKigO>U%J;8mbg~)~Tx>rg?{ML3>EL5~v1vpru6^Ut+mEh`P31a? zd7#K(DMOjV6LLW+x^N2bGuSwsi(Y(5a5sdZ8OBlZ`)RB>t4!C?zg|*|f~Ru@f*-!J zz*J9ignT?mDkqIjAiq!QA)@j%G9jTi%4N+qGAkKQ+t&BXHU9dA=~a1zT-j@v2NkA? 
zVl?lCcbv(AdjsBHI0=^yQ%>JCx;REw%ho||6rJCmq>HWD9*mjauO^n{Gj5C8ECSEA zQUFE$vZ_h0(&*bdhq*vp$dwp2agqmd_Tfa=&u}EiuMYg>Pj+j+G^jo~ zM6P(!l41(Cq^h~v-fR_P7&dSc>>^tMWkkJ!9+5`Bmiw5A&McZhz7V8b)%za&+g_+_ z-G4yB8lH^d#y95#8kqahpCxYv$8N=YB3#+f56!{&r{heGvNQP0Xha)8pB`PS$I&Hz z3U61pMBGA+%XYFk9rU9A0&f;_D4s--tyYol?o)~RuNLcy6H)Ma-z(ApbanIvrO_Eq zObqQcE1z8sPPA13Jd2>&yuRE|w158v9~03L($3^c^|WBW=aHEB$$JeYo!M0cM_b>6yyapKsY5Ps51-5} z6>Y}IjBYm2X5Efkxg zbY2v+K1|QY%irASx(sdR{;>2H?Z`PElHCHM+dR@{ZhXZ|NbBOgm_ACc{H%^<`ILF;I?+n@futq|yL7$i# z2{)an5;!m$4uG-w(3Q);9N-+`C=5We{P}_6Fmlmv)m5?KYB>c=O+)U^BE?%#-To0v z_J0v=6$1~+Bt4p`-U@+x!zMRe$6K#7w!HB`DjB9;2go}ej9D;e?AW>+Ap2S@CBbr2 z9JjH~CVwQ)vaRIuoh6)%x{o+qa(~4AYa9@IRps>QQcld_zsd~}s zmKkHm^fCvRP9!Kwd5L(o)dSqyu}hn%4!fmCwmBJnx0jqfSl@TzZ}J(#dw-wi%-!$U zCkzV-Wu~F-F!xqDK}Zsy0JsSkj=}^xov;-KS#axuM1NDKCW@6L_lIQzB- z``8JbnRhih<5T{y>ce2mhBfvrar37&d>xWeN>t6@zs&)v-1nzB62ry{0^d(5F53ZJ z-~+BQ`LGeyl*Ft%$)r!lTdMuqL;z8N$Z?`uKFwSmYOtPW6u8&htQS!9)~ks2lM6xL zBgtFGgWIA)&w`o^NB#44zXyw7Dx^uK=+UF+DJI%Qbb}r~G-6XIe0ZdbWs;)R@=vK?)hvJ%#xn04S|YkxA=J?p?p5#dGZIr?t3So(LQhv3d&Qr^smF{mxDUVr-QK zc<58(dexg9Y4>vvge7?9X$=FkMp2b1&s7G^UD0nB9k_e`a%AH@zbLxl;{dD_6gWPh z;H@mFVRIl`^Y%q;+&01-VoodCU9xgm6H6D-^I-UkL9SebDx}|vYjxs#wib8`Z#pB4 z-5f~XdXQvzNs`a}$X?2{jGR1wbK!7JWPJ|fxe!!6u@QhES6Oh(5$Q7!&XY6VqEeMoBT?c#T* zS~eh2=Lc;Ekfi8umVONhq8kfgv4P&+LLK+w&7+$)&Kbd-HX0Bk^EN7d351Wb^pv0di{!dD$pZk|}%nSs8jFVV}jyvcTfv;x$J9>Wa?z+KeuS?6LI~ zM7{&jd%UtCRA zuTTb+Mqkmas>yDUZaF(`wk%wgA5K8MD|`x^5kqM9{tG$9*qwBa~AV{00qAW06!E$Xf(xFkibk<2V z#F|2i{KUbb=n{^LGf9jN-OGO=QqUG(pg9ZU5Pip}Mbj}oYjFhpXyFOK7eh`S?eKrJ zUYUk~alum}GvL8xMz8kA#~GR&CNu)8(%UWT?NN3bYjBRS_g0v(-YtWhrBG?~H$R&N zRvxk`%gs#<_|v1sZmSmycv3{x=Z~`PPpLmKKJHbt1{JoN3vTN^>qe}-;pO;ZfiRlT z{zp8}^z;EIkmt#hAxg^gt#J!U)BP2~^XxeFtxR5y&~R&GHie=ezQ;Jik$0bsA3)BM zY_DW%uGZg@aGbRfSZtb{9F?ek9UKA=xGfd*1l@f)u{(H?*_J+1MoBYNB0thdMc?d` z(qS~-leUJeReSJEp644*ol6G^StTNP*uS4DruH=bc3Dhmo=bI}O z3aQ~w*`u8wPkV)zmxZWnkpk0l{5L^i0Hq)O5;vBN*!4+a{8<0n%kRzpmf4p=NydGm 
zR?@sz>JF{&&-;A0wc~FeL%$tXU?8PE+WfK}D1`G$jmlO$2WE;TxVAJIKGWCR9_C%T z@F4W>juZKe6rXn$O7P=m3I|sZVqQyKU*p>+hrtx1rnNte39vDbC&WToV2Lxp0w`U>F0LV%7gTOZIgo^$29#n5)vzYI5WLT`*KF&`Jh{D{aC z;l=*-n-~*qM}5D#HLpN*fUXM#?N?hINU06b-^#=u?&B#G74WNkZF?&mHxaq0Xr(dj zZ(aV(lI&Y*H7GpawdJv7z8?wTW18^!Cb8|}M5|%%lLh&i4E;MR&YSn@V-H?k$7WIS zlFkI$JfArKr+%Q)AyPPt6U7rgbEK14`w)?CdKc^V;|2($NJCKHJ{{CoU={@S$DTz+ zMI}tzB+D2%pT0NKH_|q?wC1-vr$;s&INg8EdY=P~qkl{~Irt+Mb~;f!S*>mV=x&A9 z8%ARXaW?*|9#AdF=R;VC-8YsOvq@nWxQ*CNor@;cYr?W4txF=hcwu~hlInr)32c1e zCFewQM=vPjqT1VexAW_+Dx^Q{TWx+afchiSfo)|s)lWBoP8}KI8MBH3;lLu09=&7(s%MKhR!PDM@c<7>WrPmb0!dRmB*tnQ{sa;%kYVg znW62->`tK~j6)8xvMw!Bf)eMx$*s9sSxc#gBHf4e1P~mJ>8W>&MJ6KX9g3xh>p->d z=pE>uR^kZ)ht)o!AezX?Rzqrle%(AcVnj7a2Ym&1$|;67blCkRM9@XP5;5zdobb6l z6AN>M=DE1dxZd>l?q^Lnr;EP1p4O0QbiSi|mB`T1ZGdiK%WJy3c_&|LGcG}(-J~8xcP90G^B_u{u&oAzq>^F!71&Gg}S7VvSl)67!3ef)Y`M56a za0%cD;=R=SnfS0Bro*Yk#+8o7L97ukb}V4}?N-h(_M}gwD}iP(w4hd|c}VgJm!mx{ zBkBg|StUmp_r8*gDHN3(0#9tTLQn=MyC~9qBL!~Mc|hsL?5jj$AdwX<4lq@&i|q^7 zQ$Oeu6H!m3!2TCO(G#)l%z{vF$IxM1V0WH(VAuR>w)?D>!}J6X`7ZVvjAOO*{oqO) z>j>Rh0ou~Db{0<=%;wU8a=;UvutN8kW(>m~Q{#v>hVNvzAn zAwAvj1>JBXVyXnMC~}Jkci6!uhfWta(aoBf@u7O+~!B%oGA@l9f?!N?1N`BsR_ zWY#A7uIZ6+J9z>br#$-bYE{YGF5CK+6nc-44D+S1H7Bapqb@u_&ql3pSGHv7^MOF&*c zmCDVBP(F2!!%U)q&W_+Oi6r%7Y987a@aSm>s6WmnId0TfE@t{Um^BP@9dwDTBatrcywy$R)cIv{yCeBd6l2D) zrvY)bq-&*BQ2!2zmz4{lPW12myifX)vWd?n|1^}bRPv<@Z++CVOtklMTST{dQXr+g z(+d-N-SG~3o;@=kEP8WGKMERt*%XGx+9fvB%P!xGDStb!@+Z^q^9C;gIGel#9Y-^4 z2 zkzd)?Pjjc?qvlY)q0eH_WJxGgOlCx;(-je<^k20(BmfFmu*#*Kc8Ji%A>gL`$3$L! 
z(P2DTOfgP6W*;+{Zr-Dqt~U}BVL}u^BB}TtTrT7h_A&Ha)Z>2fxff(*I_Oee?9$b^ z57Zhk3l z_M{#6X9^D)aIlxq=ezp+KWu4Q9)$1xkEw~ZVMHwWDoG5Ore1C4>5+t z)bV@1lzDkDV!KG@d&Cu>#34+_sQ=XS!SL%?n}HFBdvSJ3AFApFF{yEJ^;tg4x0%uR z%mXQ9zzH=SQ>#Hgvs(MI?G_}m(h=nCF?|5gas`?x^vDH19@}aj?}=8)JW)n5>*Mla z&b}t;GT!a}uU+ZCfc6e9ht76jh2>oKg7j&p<4w^Yu2&u@f?`VQYZeEQPSLv@Uz5j|>%}A_!b_H(Mpk?dNCa~0$%7Vwy)<3yQ#Xy2 zSnVq__sGa*?MG(L>~Tsz1!76e1Jf-gQ-!nE^ElutTZ`VQ@mXky=`q|*E-~cJhFyBf4=ax^BE#d-llMM znO(m`Zk-XZp>ZWZ#ccKUXun)os-)BTJDEm#T8a$^b=5%#Q>*oBgvsm?wK<{+ko$T|sQT3LqBlSVrZp`-a2h^HxJ1R8b2rPBtNm3FbrVM;||Rb4od{b;ZK!_Wk>BAc{mg5oG6mRjBVB8`tSD|NSF#_>}2v z{Ri#m%%bFG^9BWGX(wJlpSTpuAY7RXo^^r#Z5#4dNxztw*l@+u{^_D~(aqgy>{b!e zF@bmp6tTmUytxERU8-u%Stzudho?Hc*?v-f-s96ry)S0a7qHCoPHOYTqd^+<)SE)= z|0O3DdK_GYUH4vb>UWKD&z$M*OlU0Efw_`&YhqycPofwdnztrRig|}}ndx}6{9O>u zYhSVfYB70s5=Y9#l?k~k&)3{&$n=LsH?HkjKs=a-6V^3XU;c_4lVeATMK2%XSKRcbl0K%2I3|CEsyd%!ZxZYdQeZ&_je0P`RXzzKg836t6^%KGeB>anRZ>;1|$CQ=s7}EjW*rGy1iS&ZWEtd!wzdmmq{~ zv^)r}@$11L4Z$oN&N7rnOCO%*5zW0nr^NLlyVt^@;4r5(x1_*FUJ*<0)jH?P5x*Y! 
z^B2&8H5x9l(Ii0Iu1iQ20fJB6;!Nyb&cqieadZljiHsWl4xQxEei_;DnOYVpqxZ+I z;vL-w(j597^ApbDFP>EvvSFY_Xy0LGadSaz=bAS<+%^Y1 zHI+u08rV30E}_0gZhZeaGz~QMNuMV{UjY=+6KUL75EA&9BFs|RT(+l&xrg5DZ#D`d zkkk$2#nOI*9&37isq>s~U_%*xd;l|3F(XkWOLZJsToT6)ouc6ov_VR5j}$)cQBR-n ztLuEg(8aRe&L_4TZ0xS`#~^_n}uI+Pe(Eq6@G=q}ewSmVXaY(p6^TyVoe`4#Xx)DR;G`LsNK zaPjF*<6v(uY0PB*^iwh_Iv+;EO^d4>Tm`UIBK~uhzL-#Fq*`y-x$3SAR(sf=0qF z_c?$p!uO$s)@*NarI&nFw6GtJ0>EQRbO=mK#})j@$3sM7joG$IW;GO!&93(Lm~f@NOM)l0BxL1h8CUTT3{xpic*d2ZMGVR1>85`0>9k<1 ztiQmcW5yc`e%Xm5$FBS;L!z1?pgixOg~5gu6=HVk#YQ)=^|z{7%WMah`Dr zyJUI`RZ$RFadJQu^*!K34=Wy&038Y= zket2kkDB+Mks1Lb4XV&hqeqsM>$a9v5Q!}rS0Yj=!sSUR^iVvXU@O@7E$&5<)awh@ z(S!$#`akgOEch)yHR%$Yt;ll6Mtrc(3o+)C|E zi+qYG7|bLXBzN}930HzfN-2m0{)kopZXg{94H<~TU~ikpy-LCcgHrAkGk|eatpRM` zkx%{V0`*w~x{5y7K4$vaBly|!*Bp8c=aN}_so5T-r5pDTir(oX5~s9|9}K+RX?<2f zD@lQvR^cW}z~EhJR^aiIov=st>Wn-rlK!NnK4x_}+G9Oa682npIw)>_X>4zw0fmG@ zwAS)-Ax<>1@!%jQR9_6@v%ut|@_@ZBp)(vQBIhQZpQr__2{4&3+M<4Ax4$}Evr z_)Fe(O{|uBGfS)B)ZWD1J3i{{!}Q&A4^uNv&5R#-YlK6Yc0p4ED z79^<*|K(y#uXBKkolW5DK_wq-1T*FAkF!OQkMNat_FNDDgLol0<)B0@BvjChyZr|HMtf4*YUX+3-^*66u-b3iU zQX~9zWGWqnDX4MbMFUsySy%fk*)+Wm&ND7IH?x5Yx4*LnAe;}`Ga%c9)8Its9lBZ! zrZ?{BgNq{tiz*RuQ6r*%vsq9XtUQcxRJt|$F8`E!*sMI56=E@gO~S#kOMVojmy@27 zPLGyU6mA_Z{>uOfP=_N&9=odC!`x^GaDF~Q{F;nxvKshTMy5`w6yqucJ=^iZH-|9? zBGw&!miAXZNT?I}sJ9L^zao!n7+=otdO?MJ2R!_9X8!6>Y96vaA^I-zQ-klj;BVu1#)btenXY@Jn%*xJ`p}0v^ff?A7?QEovG9v8?81lS;I& z{_2S{ps$1qfamB1L)DRIGGD-}jNV-apQ1z0WUnlsuSPnpzJX*+qpw5^+bvIgy7@zY z`>0>v%v-Y=H(bOC7pWCD`uop@cksgx29n-PAP&sU4fN%Dk^a!OxB9qcf>MZ%_YcM| zlt6n$oO~I$K$Q*5uF$kyi1XzDUufm6(&5L1d>o?N(S)Fx)d?Lr_4-`gSFnKdzb1x% zitia^2TFeLWf=wQ5$=>`SCGETV%r43OIPIdSAf+}s{>j~vZS^{z1ZAH;AIYq`so5g zL{oUk90C=;XrZIW%`dGB6}st$q|DnXyAqOInQ~fF^@dXx&tE!42awkx>gUN+ z1SE?KF0GW@d(Zjc2spuYFl&)&W6#<{9u~A2qhY4Ny4R!D?9|up71Z`AkC3u$xV3k5 zWhInDL8g^W=5>7)h0FmCvArDYIc*|yi%VN;;(@ESJY;OS;9D>f{oG$h1(Qr011-of z?e4(OX;&LQaG_Zgd5O(YQ4)&!uRQt-vINf{&k&#w&^fe{3rN)CNG1c#_$NVIzMr1D zLwiIZBY*JP
  • %NlNGZY1Iar$sX>Qa28jU7H+xA3+`krcXHf;o> z;9JPrK9I9$UAYZz@*!y%mlZvyZEuHCv(sk}X7=&i1rwULzk|3zAebq*JZE-0mqmE( z->QEjLbOnp4lx~8=5g$Ou9WpwkMEZ$enO_9d~8W#`wyXG{eThSDpbE`rd$EJ?wUBX zv5#(E@=5|4H?cTmg7|vQ8dKiRATBw)emU{6SeEDlK(1J70PXqLwx&NsxD-9#6=iLTk}p=#_bo04QM~MHyB%j3UGf=W8i|x(bN4) z1Xy4uD-C-Mgq>IO+RYf~^Jzk*4*`If+gmrZqrt1J5Z=_(1S}bH-BO4*UwodL(rh1K zXu2-|1bNM@Q@O=V3N5oTQ(AmLL9yUP7Vmf3*mI$VijO52MF~nNK3HzlovD)5l_ukO z@W^~m5`|C6rMws*j|Zcu5cy|yg|QO(B1i#Owq_DCncN4%CV6@oh?vCtg^JU z-6ribuH6q@g{9^eNi9AX@`0uV35RC@dNs6kMYc? zar0BitqnHKZM)NV52mQ}M!Lt=7wNr>pZM*!s9FCgMm7Q8UAT4{la@YT9}>#n{H|LX z{?E=AXdi_R)o(u+$UaYQ)>FzrFL{xZzNa_wka&_VK~wX@)B9~iYK?4->x)+b>%m$Q zZ7@N+5o&3zJXcDgyVyA3Pr9S@u|}YnnK=3L7V+y68Wz%fq$ex&3WFK1WFj!gf z7&X$VdL|zxuf;tm$bZ9hfb8ixaF(&DOd8jAHrnQM_;Pdj@5ZWM3%B*?^-GW1ChOa+ z>|oPuOTvQuIG2Vo)8^<>H48T>| zvUE4pt0M!VNcfc7ENpTiPiWsTY5D}07s%9Om<@WKxP4MyMi%{4yxt+s_;D-O#=qF; zdC}_g`6Fa4wEgIg%1=;F+V&PHZKe^e+g#r96rYPD<8rnTU#f|ix9bqOEFBYpqTUJ0 z!u_tr9`iUUwm?kna(b`~f;6F0bcsVIA?NgqS49ldi{Zhnze$0sS0yc$$rEN{qfAK#N3D8`yo@O?EB~S${RT7 zHzs*2P7|csj+jetu~$^*t-SAscn25Cwyu=-g(COHB-_|`Kz~7_O8TRL5K)29_7yqJ z)deKmoykW(PWikfVccvaw}DZIu~zwfP=x~l{@tX!Bgx5h+VwA0VSH3i^z&lb{-;YMFOI-n}`=;eFB>6Z&jortSCjEA|KFF}J?x(jYA+9JG@! 
zIz`Jgv*tHRnsHMVhKo;VU#i(wt+dKk5d5`@j~SRh_cqf8P^g9tNm=`HVPDmE*Pk^T z>8=dCGLeH`m}fRam9^VKuj)5MCQxLLBfK_)JF2O+v&a8>gnROo7-IQjqO;J;^Y~U* zb0E--vuJpQuMO`e*U7G4AB5jPh15ixC(A`^B*(Wsw*BP79!i(hp1UUz;!rNlJT1{w zcuS$~GR}rnFqWZ{vz+qRa}~^|a|;Buf$klS_j%(ltw$-ku}&lqoRu zEH;)Sq5m2>KXeXPbo}lys5Qgpy~*e5s^-!Md)}nXnOEgjJj_HV^D#Ybw%Jviu(3$B zFi=3pR97KfK9Q;flAkJr7PWKxv5BCefUr0!%X3Uh&zcBMaII z)_cxd6SlmJorvECDGzX+pCf`Vty*R5NIHrryEk z_z7c3cUV4BoMiBWajS^+yTvYG33vW8CF3ci7eNhUUVEC9=y$+_PnJj3|DRNbDOV+VggH|@gmh( z9m)Hrl+USE$aH&#Ug@}oGFTw0sECgQ@e{xhDl01!D^1!+U>kkG$&^9w2qr){`Qs%o zw9jf|jctV3FhNbk+!Tzv?m%K{$#h zOJv6CbsYt$FPd3(GPb{aqIev%QSJOakF6iiRmY~|A?Rqg>@klZ=}?{={S$L5mm*>w zF}Koj2VwvY(r~0qTK;t7F|^$MMB!>FdeGxlEM&ws#v>Oe(BAWWq2;~Fw^@sC4O2Bk zs88%M4=pHClNGM2?|y2JXg@tGKa2vuW%RKs+AeLJ)Jf@ECx-o)^ux1Fa<$G~v7FS< z7g`cA$bm@>GZ;T|-y=l1JKWuGs zj*{m3*9*}H62P}pEeD0suruwY&G+hD-IXE z**3a&eiZ+(&2j8t0SR4EaWij|>Ig-u@EHl#iZUk%~)Awh&mVMUPv@ z)o>&tVTWaxIp=+<*xA`=+W9o*o2%>mmRnb@nyP7D-}80N6i`2-WH81YtTYy@of=^j zzBj_^<+fuEEZ?WMyqzjSt=n#1+}mo8LEz5V&bhfBP>$sZ^Zl9noL@GIo1HI$r$w+h zZ~j@zE?*WjD|Xk)Sft)5+v2woQz(f$Vl$N_zBK8y275xm&)fG>bD!;2PxETD9h2|#HwHEJGN?<7l=w!9%#i)zk4M`31B@3HuWR(_ZM3A$V{#7ELEqZ=sIJq z+M!|t9ul#jBSun%5cA_Ur7-*Lr4llQ3$4#HB)6r!pX7QD5TyD=hhdgJY zH7(kRZPD?RXm`P-yHc}%rKu)s9z44(r@dF4nxRFySc~KB#`vO|H0IVrpr5#MVe{}q3dGX9%9LHV4draqPVu+D=wYrJ>j?G52EUAO3eA+78(b)0b_1mk(~lyPUFjNm|B_}h zH0^@axX!XvRYznMN9I=lJmI+U1mq%x$hG$vg*?0>vt=70)4K|1WVX5|UFDRdXlc5o zl$frKMNy*4MdVRd>@A|mDkSj|NhvHQ(HRd)Z6}?uI4oT zz|`%C-D>r*Nc#P?M24Ke+$CFp#lb|BiL%e2;(q}pLVR@D^IdLCavAm0tM+p=jFa^B z#lOtps)}I%Po7km`q?zMhZJLs^UrZ@wV=`yLlt5!VwBb92A0U?nL4Wk>S~iQbGvHx zz3`Z~wBA(a3PUo|LCiQYSOs2$JI~&V3MchsQ#_Y+C7s_{4j^PycSzmCtMkBt5vya4 zEOrC8wM=5Ks{|t1gbYGduRA)G^(jw(svzCs;+ijK!YHj3g6JEbMcUCHCw;t`7<@nlCEZRtnfRU`uvmI|_A{7G?0Sm3wCVaT_dip+9T|HHh`%a;+LWy`eY7 zqSLhZ%%55a3RjL{%jEwd>n+2&Ub=8$L6lOuOX-sC?(PQZZjkO$knWal_|qUQAxKDr zbPCelNT0#&e$Ia1^T97(T$l2jnKk#iV@HEo#d(fWR1MS5i~X)Aa>RpWW7R zs#(#f>5=#Y?I~=M313k&pV%^MuoG{*!uNVDQ-~q{MkQXZfW}%{m{KnO6Rz{oj0@|2 
zqv$vRIp;!aVVYu@3PTpbn0z?>fSdzz3A>-PifVa^w(EAnXms4(t6s3!m%n`C!4|lz>CI>o}97_nNqd?(?+$Ql92yNoTdA zw2AC0RG3KLL!J5JWUKeNyoS={AB6Me>Kg9?6Uh5Y*-U>GY*3#R|}Ye*X2KkZ~S^U z@Lw;0Da&#{6B)hnM^%y7+f>z@HqSdItGaZCD_kk;((U(kO;g5wc(Y-SY^7`6>7kLB zcR5eqO_~LKdioW^KsK2IeqljJ2%CKk5=O&)`Y|N%PN;$bdHke$Sf7U#tQ#1;R$t`G8gw$kCKPsz<5i#3QL>9xGvpNi zQxwq8`gvurSx?1;guu=2slAzS%DnuzCH?2#K|K$?;i3pwHtFf#G59Y#rmhTx3@k|RkDVks(-(jKDpJm2*Bf^CU34*7b^$yFBs~| zRa5%mlL8UY^U$Uk9z5z$uZs-70wq&y|z) zcEn;>?8mJlq&Rpkr|{mIBTIBSc9*cHBq!ZtiRF!dh>CDoCSj{^y+u?QWBACS` z3-3;c`tTioV9UqBbNMwf$M%aG)^yUQ;Lm2#nj2d~xIv!D3IzVDvP_Qc8OJSJ zEZzV--vBdoXz{3;#2dSd`e?_j>I67%6F+?~ZM2(#M{F2^2_5xjf;77eV#+q4w$qlT zEY_-kT0Ai~K+dYp9!_ccNOx$4`pdzezaJJ46n#>y-#;LIG$hw0+&UAW@U;?2Xn%44GI<z))V*evis97~j)zNpA59)+BD8Xw8pCZ$!k1)Tlao-OzAu_! z8tH5Fi$%M<((^1?1_Gc3A6)HA3IhR{gc>jLihQ;B+~2L{Y^&}Gbas(O5Q2whkCQzQySCAaT*3W?pKDj=9VYUA zny~k$JqPuL%6!18Qv2=RrTw@pIjYogXQ~ipvrWNEyi*?avIp&PLeHKrY@8kM+{9>1 zl@`p;bL%A&h1<;+KQatHr_APz^SgXq`~CNp&(+R3PrTyPmU!;s;YW3exd|y&g#uO7 zXd{DKUmG&(cAqSRy(KrLvEYdkw7gLbT)F!tz-M4w=97K0db`zZSLja87R#bv{h2ap z(1w^x>=1R=MC?7udl>TSS8{eT$7V+QUv^m8X4cW9AV&px;vr)o*UCVjX=Uc+dnamU)o=-_RW+56zP`jA2o;~K zURwfZhLZydYsaVQ0Tj}M%cqr@@QeW?FfY^^-QC?I3e82(s(J&1gMaKku!FW*IpAyT zkHXj4w@=Y&D~in*8yNURi(u>ES6fnR{CXmdBdEam*=yeUR#*mEqloOfURmTUDMpb_ zTy4v1kKb~GJC%gH3U7J*I?0hwz9;>*thdgEV!MB3CTTQg@6 zizOFgj$O&yv3=_1aM0=V#p8wuOhI9U$5ttleSu(sJ(xU}P^U5?PO<@db5g(@tc}DI zi>t3~IRzcG!}OcjRM2HDqOD7tV!Iw5+-)8mVwpII@vguRIp@n@ol>wt`5B;F_79YHFus89OU_+ zlpikp_4Ms_K%y-pR14WpU8m+zbIUZd?xV#~JlCe48OLVVfoIuab;|oc>zA5Ud7{hD zxyHXp3-b;4?MSu^SyxK{0_w(oS~SpeWo2dRrfZN0IRX8;OziQ^8uZ+CatUH^HjSg6 zBAzESWpaDL@&%XIg=I?&2J}gCDY0} z87|&sS`DVttluf#%a#fLh)ZOhYyQ*acKsoVlCp$EYcxGcGH%8-ju}E~=WP$U1txab z_}^59&<%d*{rsLM2-EibOx^d!GPC*dGt?hp*wT1?p0B6irg3c*#^V% z+Z68lwz9jw6mX5t7Sgw7{W)T{sL$fYXf*=&n~N!T15i0_;;zb&87dC95B2RSaiTJH z@cPrwLtm}U#bE|BS>3YF@(0&n3N^FNbyta>{P1JILGQZgY}kx*=K)4- zx*vld`cCn%c5v&T*~~nL2k)vrX_MRN_Y`+C5uS3b6*{z?Cclo#1WbDzQW%+7)#7Be 
zIXFa&P`hY~rx|JxgbWuxbq;)u36Bpj94H<+kAT*rl!u^4c97F=S-mL9DvI3O5R_iqOzRw*M(;fTHt)4T zQ!`5tOvLBpCcYo9(CBsa1$`WDV{r*92Sr9 zIEnEA+WVFwOkVJB^Rw{S{0!d9=+cLB!4Z5n{!Wnnlssm5olnn^Tzb-v@Cg({z$0^zjSt0k%ic%)LPW9E~C^uro0vbsg_LrVJ)7U0HAeLf>gRfxDr6cYbaB#rq1&!?Ey&gC)Pb zEDYK--fW-gOF#BAN$#s;(OS=-7>c^K$uDOnSF7vVHi%sIXpG_rf&v;nGInUc#@bcM zM^c}bd7ikcYbOkObPg&icJt@qhf?0+V^^oj=89KxjPsi+ry5$O>*7ZI7K>x+J3l8vDGS!Cc~8j<=2$Hrwp(Zi5F-E1g*Qp>PrW=@ z+XvhFJibptYEKxmshBx=Ozty;=t?O0JrH(YUcT5tzZcVo4fU1JYWYx$eei3161LI# zoY-#B<0F`_`<2#>+nX<&rp7Cu+4HLNV*5_Lblp1*WjT~n&H+lAekYNqRnl$JXDt)T zN8k&^TQ+Kl#p!AyZ1K2e8$xt0Dhr_`87}MmtDw^s8KlZZ<)%23zE2fLtnv9tXmjfJ zSX8@ne^JCNe=g>ZB#vAbY&Sw`y>_Ya=jHnGi9a5&1?aFyx@>CQ7?u*QXLZ{=Pf4s% z_Pd1nPV+fS0=M8-eWL{Z58lXT8uJs|O`H=hjrEqn&7tDpRDaxLSR(NYl8P%V*?_!(F^kA0}dgnkUbuSn3gOZ3Dgfex~U~1la0#a5D*tBRW+|?cje% zj{(OxlZ+XzS6M8u2yUfYVG9-q6CSC})(zmB{;^O8#5ibGVJ_+^F}jyR zjU`OaZ?C3^q`8eUt=f;GqN2b12%lCIZwA_<1(92$f>+YJ)-uurM`uG~;C_aWP?l&a zlTk~!PZ^w5)G8|AuK?rx&&>{jqstQQxfjI*Vj~tM&~6`{rPLx{=(!PmXkW~?df`&o zPTBhc5=G&HC`?~@nWSqz<}zv}b#s=^Wh28dd5X{E&xwNEG}R^sA3>x#6tZnR!|Qv3 zwbP(Q#*3{cvpYUF|Cz&T4uk{b-Y1U=F#NhCEHV|dCu7Cus7b;*{di$J=KzmH)MB?q zkc3wxZD;!lQ{oO!zq^>*vSfej7te(*)b$?I^oG1u1YY7Pf?UalRz`>rTH3P_>t{y9 z)5GcQ;dn!v0~fd~jd)a+`Ay=Oyd`l(s^}#&l8Nx8H_!U#Lao|e)fiqJXh|tO!C5Tg zuow8*x6C2S1v_3;@Kp@Nsh)>W3hxMJ2*5RCGz zkPPQWuLmw^^#`qS?Q1U^Kv?r=MLIQlfT)A4{t+w^VY}V6uP75c1SX4K4sG*EN2v)k zyR6_DPiZTEL!u`UUE-Qbi2&lc&2Pq9Yk(-W#5+dQHi1xIt%wXZxt)31OQrx#Qb8?M z>^o1m;HaR1=dW{lH1h>15~yP_$-~|K@7+#J;Eilw@-m@UaX?=Zed#!vl&H471ua4sZnRi zbqLHzE_C&DFr(7+Yd4{zxiM7vx`!{C6ev>Jmmw~i?du6BF>Sj2AKve5%(Z}`TX`-# zyKH}2#40u!))chi7$~0rzg+tpIZnoKCE0erCQdP$zMgP#cDg>@%s#~*9Tnvz0UDau z;L%mLI{2)i^7ZFQkUwDdT>Tr%ED2~!aBFFS<{Ck(=JB@f80P&H0gEWFkRK&bxY07+ zCW_v=JFs!J7|vUs;Zdk>XHsPM%m0#?f)3qFz$3S}Su(}#MADDn^Sq80l1!J(Xq2_o zqpS40`^EkSiT@1`cvzNZYoSLw+(XWIgC401`G+ffbiwIpT_gvByQ!%B$b?yG{ulQv zEHda9JZMp^u=J&9@yg?afOrYKW2zx7|5bW2t|k|eu06~SPwvydCb+$-B`0-rvLS2E zronC}5Sb}5)Sle%9M&H#D~uH~%6@x87uKnUs(t1mv?}iwFoIaaVL3%4oAnT!zkRYa 
zTWzB*Q(GDR$ExruhCn7u>xSc875HAEo5lCB!`~b(^Jm=32A?alOGED{baZw87`H!(&$D9UA9zW_5v$;4mFgC z7Nzwr!qR37$XQVX!&FVt#%vG^{CVSbZJqq<5o1J{{^<;ARR`7l@iMxyUUlzAw0k)I zmu51}U6|)KrbYZWLPlz6<>K~dKJJs9?2lTeE!W@@BNQd+>+6h3y~7Y{_`_WL24z32 z9j;mYz~lj#^aRk>+t~an@Z}Qo=~$hf_5KtcFMS$&+V32Maz?yBt|RugfVL z!Q&+*d*11d^pf$+)g^B=u0Ko3&KjUoyfZd?JQ#xh^isU;XV4sWd~AGWtuC8Rg-V`> z(j38rluMGemuUsr?P^Dk?S9#~ufsFZ`8kG*o-4SJ=SZOut%&D3{-a4Ax-?Sz z%o-C;ZKsvJVSgCMu`Z2z!}dRrbqV16R}~35X47qmg;@gk%`_lOx3EX1TB1r%*`M4W zrITrCN;?Oe&WZMXp@0rt2 ze%Z`gjrf~0U);%M?3Y#>8t1s@7mKjJpS|g-_Ottxp>Z$^2ft$V`&s{W%Xx$RQGbS> zgIuj2m;bTAq52->WB_ERvB^h%N%ELOM$E#>7?wf5wX4UrNRGsTl(t1VNCbjoxt)sb zQBms3vQ<{5H>0kZ4Vh{yztBK;t98rK^K6(ej`Pk_6ys|nH7lh|x*JdP|8yvE<98T3 zc<}w)^{e52254&Q-qOeDWYPI3xX#EaRx>uOA^-8;)Lm)ThEX@tg60AX1?6tIU%1s^ zjcSVyzp~Z<-<=BlwQNTr91hJyJObxKbVth3(b1TaB@kY=!#qKWDY<;f)AdL*{&7t) zLTN#3h5Itd_UXOq6xDsLF@e5hcd#W2L&KI%r=#lu^ySe;d8k@vJU7h<;A*FeqdiOH znR;VY$~92C3d26JG8V5v1>7_F5|iGJr0*P6@K0u%bd}I{P%~(QgSEY{zBf^y?xgy= zK6docgox*#vq;T}O8yi-7M6{=r+~oOzyOK{PKDb6UoW7P->n^{ae5pe9W44qfd{g_ zp(rVlJEJA8eUHVuYGuw=RYDdCGD?tj0F7Lu$0ja-jtJP;*b^m5$ZE73HB_M_p>}Pl zReT=}@`3#N+ohtNDMsxrd9_jZbuF1hY}Q(3-F#ZZE6`F{0cLA#PNg?xWQzW^4Ea7A zG)Aazu`;omY~E#o`P4+;Ml6DA^?-70T;{!U;vS z5D#z@N(172AbS$YUu<(qFn7exxoW~F)DU<9&ttdb8+N3vaDV3`vUznL`scUBqg9{ofY*!@AuyJS4%)XWU>M@9bv?ww<5{F&Mu2v;o_UVOCL6{R8-K zUhJyOE{#!N`(55UC6wLQnzWmpBO0!9xM1wRgWW|zLEXui;}{*4X`0fa9_fVHi}x>9 z9MO9ktmGC%yED6qTjx^wk09@Yw3k)=Sf*a+qvj9^{1}bCySaRI{CnH_n^X9QK=?%z z0X?0&h~64<$Y`C@p-23a&7ak~*t z?T&*meq45sn4m__E7y1U7Jk{(oHivsNmv;q0L>)ea)3l6;PDb|V!bc(xur6NaL{lzPV^@t{Z6SCz&eGH;+^V@imH7!CN7UeNw-dv$u{=DzTxaEdeEljk&9pxeho|6h{?6B+t&pv*f{dfIAFirVQ z&y1Rup+3~%qsdGk!_#rp`0IcXs8ZkIajxN{<-~xa!iPT;u(Bs^WDHr4t^6O4%sV(} zt3Xd>WV$_g7>$YAqD3S9a-pA#Ezy0T^>>Fv4sd51@SkFLdZ!IW#+qsmA06?ljTV$Q z94%bT90P5DO56Nn{_bbr)s7){xDdGWi?TyIm1IrZ4+LIcVah~0QUg6jCj=_b5@i9w zo58YaQETzoTl}(EaOc~Bd-IQLlwya9JT|pe@3*3axYywTWhV;fsKOYy;W1Dhf-PXS zJOmdSE7;XS-;8$y!N~Anyvi>g;r!8g>8bjj?HTO`(hGRPX*l)U4+KVv(n 
z-#MbEeHr942K6AJBpWLcDpbVDL?LpIs3eBV7{TOw0(7cmNHTxwHm{@83urRIo6R%Z zJpp%2@G=SOzC$R9){$D~i%&4&z*PYT&Eu=_xlMc^gGb+#w|Y-7tGsJCykR@qHfh<^ zLH1m2UQ%*CR|pm6GkwsDZJ>Hu+0GmLjWyyUp6@xh&6-oZRxmhfNJIH@(1=pl#dH1I zc85&ZrwRQL@H@Y%5l4G=H^kUzPA+ar`q-~I9-HB7LAQK`A^7y?{2hkuzvaf{NtO?- z4h9RjWjquPtTG+Kj|M`N?4>TT#|6yhG$_8btK$1-S^wNS7NkF?Jsv?TI-X7CBhv;r zd|~aOjP#I>46hL-$VcbyhAk>2o9$!0`xXwHxNA3Ci*&ZsT_@!8#J z#qq0*#=H++u=(I0qG-Q8i1p|i7&mNgyt?hi2h;SvJFl?mEpST4jJ|ZxdFfMTivMI5 zk-~rxK<+!(lpxRNuh?PP4p&x$wDj9i6z}Xyj61u!hRgy@b22{=J%5^9AXdByH-h&< z6aip#sbt%EW+=##%fYITiEP%y#y3mkSFPU?nr^C`*HODr)hiQVL8B*`-o!B z8~9;J!a%5FK~0|Bd1VY$PMV>0jaV1p`UM3a5*xkuV`IG5TWj+S%b|<4$SZ$T$2~PE zQ9NAoa@+HMD`A0bRIBn2z91E)_dF%6o>`#Iod5I)b3>2P6aou*n@e9Q7pYcIi6?rF zq_-o6>F3KO;uM6Ly7V8alKGPX?izotg}J$omhM|753YisRB;i8HOrDnX27F>5_@Pb z+NYN_589Zf5spU<1n+ zsy{}ji$T#Hf@mm0&Z7ULplpr~8*V-~iYQgtV7Z3~(leuCPM))bOlsBH?x5uLHcsS* zhGe32BObr&MqQ|RLh9Y=u!;`6pGp6g@BN8SI=`C+dkc^5g>(`bl=8jSmHfY605TbW zqabEu;H3f57Z=Lm=%|N-tl%>!bMzgbFu{T;J^Gvt6c2ZI(Y_of2M738*L}RA7ETr5c5NcVpx4%Y^s3$f zc$>|}luTuT=|(ZnQ7>PnD=~Xe$S6lomWIGj<(h1DnlJ#oO7|*e{zhV@;I`KMNgTP{ z>sdeTJ}#9>qc8zc+okSWql0fflCiuwM>5pmyR<8KHw(ZYF4I!r0XwY7ZFRCL3 z4d%-y;z+n%B<|nz>T8U8{G?G!EsqX9laPjeNks#H&WS?qd{#CkC0blm!~!TF!^!vT z*_T)xV9zALhxppN1Rj=&ehBjHH-GO{ObX~pmoJSC>|iJT=#jI-Ks-M)@5f5@UxF#V zeg*$l-R>4Y0cCrAz)e_Xboo7yv_wjBi6{*9=tn-wGoYYB_hzI#xobV6hW0|%VOOi? zbv?l<9xt#srA~WV5N<9O+Nl+@O#5`TH=LGxqU1W(;Bca;C3_RRQqCeWmtQynd9fYG zjR{K^jwu&2TPgDl1c>;1Fm#j@5M7A6V2rXl3QCccn#KqWC58!c-oqRZJs{0B-m!Wj z>C#%3N(RVkqRz~f#BmR5RxOk1Bv{o_vI^7bWBJ0r68yjez6C{OMR?&k0^Z)UG;{FT zqx>Zu;k_aor1%uh->gOYhkj+M>VrRDGjj3n`~%vfL|;z@dXBN60Jq_3bsP;c{<&e? 
z1cTFX(wd~boZh46@Xs*RO zPm0do$!Ek$(_}7V#5K`*-gfwP(ImorkYW_x%0wmlDqXM*RME0GNr2Gy6-XASzNhd@ z?GJDbjSex4IcZWqg$1^MPzZfemHKJ4^%5>I%E^twpckfid}#>i8;7k&Y0<~G-8_xm zE@HR89b(yCj6kg(RK26~Y0eM9@12a&=&zW63MB)S$}cp}udC0KmiE~pWXo`|VQ;)8 z2B+NO?|ys`oLRBXzHGOM_tYOf{LEZuP8Bvf-#`^G_5l(qZ+>}~5U4NYbSpRM&@}rT zs^IS0&f?`~OS7ByU7HUS0A54@fwJ|X5md<(mL7HBz^Y<^{>WcAoJ5y4Qlya821g{m zYM7IG^`TRk@5b|Kb;G|$3#>7tgdmwe7S}K;{YbT9^fIjMPZ?;yFN>Vg8wGT6tF!$U8n3=$WqC+4rNT-Zu1KL5DcGxaMRm!)NH??pGx(5TMCS%)d^U9hJw(6j!#o zT^|%>q_pdB3vCrFBry-NEH6CxH6yPJ3B;_yfQQ!qNI^ zd*LX>jzDNitNB#K%PP-b@V7oYSdvs?P<;SYb+K4mO_XOMgKMgo&649T5gOvf;VdyV zNWE{ycdq5mERdxafnUh?mLzSJZEf}?*wg)#7dW!?wB5AwWbF%AFk_K5S} zIz{xt-ocb~hB?c_FbBV3Y*D1`JX(NhX_V17WbHs55+Pjpfx$ZcX@V~B_*2jxnLGjV z{0AP=S}(_(`-=t3iQna#HLD?OXcxA7E2GEaUh_QUIeO7ZrjJn1OB)LqkqE*7p!#cJ znO|%!$Sm^2IuY;q_7i6p)RL=B7bcw+BRBt9L$KZiasDf>Tyuq6vuja_QT?Mul+~-L zuy#uPyLJDV5C0aVJeUM%3#G-4Fi-a8&^;{lD*q>HExWu1i7`Qc?09(Rq%p*?x}UiL7sa^k!!WKbGli=e&2OdXtxZpYthXtGNF7Mu(2+ zz+MhI7khD!?Hx`=0V2vAb7lDKJOU^{8{eJMo4_r`>18=U3y$4=3lL}i-@STv^$Zwt z7M|4)!IUW&%zg1m;We)u{igaX8nTjVJufmsn*AV6%rg-hXISxe<~b-rm_)|Wgs^@s z@LzYMWSV_RoS(!|${0k;I7wzD271cvVt&gXb0Kux|3XD!zyKo(5uA7WA}8;S?Gp|9 zQ--3XI}@-SCBenU5?AqjUis_S;@Oyd)k|PoA7vy(HvHd(`&bp;`%4y*eVWb$JdGnF z!CLTuUvO%7ALe-kr>@P9DR^Kb!E60kj_G(>S>75;oM8-m9Vl45_1Uot{?R1r_^m*s z3xy-96d^pxEi&XuI`!fFI=Hw>#s~(!YFXrfi>mJ+7J(>L<+W1}29-J^I`DwikM3-) z0O6P;^z_kNuKIO6P8`?$VQKM|ot?uQrY_@De_|L>bXm?$H+ASfVLp+lSQUIw3smge zC$UDrCsvXnABx~sX{W~=ZIJP}qcs2TMmB(kfRM_ej0w0)&rw&QT%`@x8p}1Cm77dhr0I9BI9JoYLT@?!%y30L)7pUB%8gjj^2H9V7v9K z<|7VQ(+8g(m`8g*7VMk)cwZin>b3a>D`b(#R!_jduafwFX{>Ih&rFBsM0;!k8qz@D zk{>^04ZgM@EmT*#6cFlNw7~`epo8OowC4#Px|L_GMch3WB^u^H>(c;90b-%nb z0}S7kqz1wG#p=(l-`Bmian>x%p)a3R-19nPwp{A6hBnwOc)kbdTei*JOfCreikSph z2l~=j2&FMk1v_Evs_S9FST`YIA08TplM-Ox(t-h1$y?HQ#Jbbenqr)K-+C%2O; zVuHLpb*EV1q93f;f*KBL5vuAe#eY6W1R%g~g=r}2b}e-me6q$kVO?r;UXP)UMvr@P zl?2Ln%tj2Mg9IL1{T?^%n)yQo=4er^b4}ub$dtj4C>G&*5}s%Bc41chO=^+VMZ=E5 
zeA0sMz((oyHmd6eLN-DAI0icvK|4lg7)B0srdTx`QFt1VY9b~GXJ>)Y-0b=^BUj)8sm2! z{7W`GLS9m+SXf4rIaEkIU`r!|1B1EV{995nrZ0aL&si*0RT&8!9FYfXpFTW!mN!@z zu^mqg^uop1R*xH~Qa^!!tATqotaq`m?M&KfiuGCAiQhAqi0J`u4Z8V9BLLhDkjQB8(4TmsO?$so#oEzwj7xVb!^$BVi{rKj#;K3J zHtB8+dtU1RIJ-T7K88V|zUP0u93xa7dc#CR<0&j4pBD?2on#vKJqDNXBRCwlVgQi4t#PobQC|2U_=i> zBj8@L6aY^(W-gy!Y|7UMdWIug8 z4XlxQi01@+KeCY|g5+kXpdb}4n@dg$UPsWtZ75oke;f-8v?l#21SQI3ap#^JodQgT zU=~W&Uzp0?>}lBmEbh?&Zmt4KuF>I!x?vYxKASLzTPL=@G$GMi$Dr9Ln1GcOah&*9P zV_-^M9tn;>O;TYsZwNe&oZwk*8XqiD9?LC8fOL7Qj&Rbli{ z?zOB70L1@qVatRBcPdmTngB5#neme=#s3@*1WsTQWR>#wHD_^~oboZZ!yXfy?spE} z0xsBkZQhhzA8GoPOt~a%fv^4}0HA~PBMrjzZ&Kz4m6Gk8c1ehdx*=3{E)Oe+DiPNg zi<^ah_6wf5T&cz6h>~U7tcE+>Uc`l?zQsNg&$~ofPp}PiuE$OAh7=- zQjE<=><$|H0SkVnKM>6PwLzmIgY{GyF)Au5{}^D&8UyuOT+5Z${6Kv80W}VN=euqu z`2ASEn0iG14|DprqNH%Uy#jH^(k=Qw%qJ{`0UI>guy*ymz0HCz9yYx|OnNfJSSgVe zO_vR%t38|wZ*m!0FSl6)E9f$p2>4B%G;M8@jDZRat8T}c^IetlL3wMoq>Q@Q z$TEK5NNb3a1DcYf&KZH%<9aS36&SwpvF5{A1#`QXlU@A5#n(JbCi0mQ7|tdp&6erc zg@mn3Tu(1lSP!aT&N9dH9%l%Yr(yl*N~n5?;ClV+4QBr*TAK-3iR&`Wx~iApi8cFB zT%rM(7l2V{9kHl29oRVC{z2)H$j3;*%bWfo_+y6n4eKCi$QC%f&bYU1L?vO4?91e6 zqA;4+|6Yb(IuJE)EYuw--IMyT54`aIIo&COFQDHBd0sBH7;paIb`?_noJ-`VQ5U{% zLh15avGSY*v=d1&1%4MV7|9b7k&cF9tkikQ%PkomidI}d9{z)SlhbWLGwE`PeMWDd9 zZrZccC*}+bU8#s!%fU+cp{BdjgVFT1Ut(F|d*@!vqOe2oA9u)`31CRKgN=F#IwOPA zN4?~OHdz685iH%rEp$@%(c2LS6njXZE+FdvvOX#Pi>N5AXW#?1%)fP=fr5Xu3^*kR zt^ltMYk3rm7vb9^(M7APpR3S+>vGIzG2wP};-@Ga6{Uq&Wn5cRi{>d!1_B@*X?6UW zOcil{0r7aXySh)k^gw6^+qJvwB_DJ!nQ$H#y}p-8Y}Ur}cnj;rxC==XfeQNi&3~*5 z+eWm_v4%c0dDipearIsH{6e+zGUVN!pQkonBBs5cBdbUlkmdnu7VBrDjx=V+sUyS$ z%rzPz@*LZ?g>AW6XO=<*GYFZK@C49;VqUkUZKwA}8s8i`6u6x1Mkze{R36K~VQ$l- zUg~sPYd3)a4mO5XJ${D&+nh{NUTemaN5xXI*(U9OVub;8&!51K4*1bQT;tMuiH?Tv zIysxa&(bp?%ML0M*~(qBan)7+Qw5AQS(V%$7PQ%*qzUjw{R-Z#OHQI&5d2JK!vy93 zQ37Ur8^X0WG>`oI^rxE5ORnH$xAde){7-2K-)a+ZSO6ol=xrqO55~EXJv#|ygh4tB z516iv;b_dwTq=rsL^=ioFgC(qrn_ulbSYv2GuAw_gsu-*@(Rvbod4`vqagpS0&|H* 
zUP`yGku=OorWAIv9qRDuE$*+t2%RUPQEzO?(C|5Ov3lfZgYz$L&ke573MTMBbiD0qbC&EEP$gfE9{v4W{_bX zCFb;pyEqXud=Gi@UOvy4-(|0)yl8U5-0`&k;@ zf&v$#;Jfneq1{->L9g!)V}vc&*}Z*X9Bdi=Qp-uMo>c>+hrsN#0t39T* z%9b7bS6zHLK6B5p(%MxMvUk8lkN{h8?WKF#1dG}~WoX&28VGOP@^BT8p`eLkK7*D& zhg6%y;J3#7zoxkqZD)7)MoV9yu1Nt~e*GDiq5dBV@t*^o9qHd!U|s``v#qxW2KwlS zj`|pWuD|<6L1c%L0iA`!!h)fv$c0`@40%Y^>ttU_chXn88VU zYYM-{0?AJ;mLuX)Eovq2oAnGDa1@{)p)v(oJ zK-ZSn3Upxo;O34Z%=g-cY5>?kC-bV|fVT_iU9n^iykOkN^DwWt#&XGf z*h>6P$EImF@(b?&+;qvlyB-vbpZ5w3V}1-axA#ybyaO zaGb_%q23-7qIJ*(vlsW+zS}?Qo+rQO0)K$z54C-VK$(;kit`fN8@*}9aIyvv!>E~T zI8D`nk8ZoQSwpI_{m(Zk24VBohm>PCwe!tji5W_$So15g1n$Q22AY5bpYhRY>#*>3 zDkk~yt%>UMu_|?6AIfKf$GITb1)k7=H=JMV^|!-}C09XMVM=oKez+&~)%7A!+1!sW zQw|m`y>h!N%t<5U2i1(3=M$zK1;Yw{a%RehefwgmZkWEzJKUXRW!!i zrwaMGdS_4a`q?;j+)DJ7$$ECD3wQy>x9jx~`oA}Q9vU!QJoa)2$qb1vL$P|`$@hG{ z2pyqkz|YjngjBX8g$-Xuwj}Pv6k`xGZDYf#dE^z=cN!<3fM7P4Q?=9R-q<* z>P3-bR(ZnDLi^+s$fYYtIN)&DHmo*+&Y$VgM*fMc&1RX}>GD0%H<`M4FIbv6Mh^9f zk-lm&INPx*I7btiyPzgO34+{J)NjM^J#NXL4R#JZV)W94|4!)GV+ZpLKHI&O{T^Xg z8Gh>FE+EUJdaw6sR0-pjf=J`+EqWUwWSlG|B^hK2cj z+|mHmdSdeENd1=-r~=e}_{f^bpBXzh%q86wMJ^7PKH}Dm7RLn7&-@km;(HP(8uvtmq1TSLe}O`$ouNhuYK_0LceXQ$tjyO$}vUdYhNcLlR%$@NQH z^G(o?-Hwfj$Rb|J18!`YrZXy?3mQdM#6EK;HA;x$*7wCXJpqofj$oT+q_XD}T}iMtfxOQ^vh( zH0xrz*q|6>53z$w;2jL$M{2+7CS02W>KuRJjce%terp40W*E^LFOU_82H%c801S}M zwW{^_XmX6M8%i$Rn%kx_WM-Ey8wXmg&}NjpWsrdzt=V-12_nH6aS%mR`l5&h0-(Te z5JzGf7AToW61%-j7Ak-^+je6~xxv8k%9)gSdFtnrm8;@kfA!7pOdq_|g#iPtI*T!+ zDW!!6@&a@TmM7OklvEmSh54@%7TJ8}@*%eien$|?Xwgp~)R`hsfC~yN?Ys{dE3?IB zQ*Hvp_$e@5RVW~A9BNrzE^PH?g-tA}P$tXJn1j?}h{;f!4iA&bR7fs!keR9MC~zSb zUNhETc{E1n*meMgLz>G)2$`Wt3(D(Q}lGePNQ-{I3d+c+NWZOQ>G*C%|)!o_DB3yxyS_ z3VNiOX|&1(vJuXfaj#WAn!IxgC*8gPCgnc8Ti9;Xf4auLUSW(#&8Da2`}bYgjV_zG z70aFoVbr=U$2gVx?IN9l$1yZw!uj&4IF`zk;yVeUG3;_^`BI5}XUn2Ab)j#9R|`z7 zMq>8x2xjn>x_L!qsV}(P_qYL#4jb01H3!k?|G2TlTsZLPIJ73Ln#5k~MSC{uvkdD;6h7)SCo3ADxh zsm31HJ1zI5YqoY_yOGa@dl7(ZS^fTl!~Nup4NR#Gl8Dw{S^1^W=yTYlb(}syCJuN0 
zIb@0C&g=N3yreT*=e(uZ8h5D(;nM`2|3pbH=0)?rUH~*&N1I7LNp`2zjqIr3H1^NM zcwz}dd~&4Y$#0_f4#WM`cbKW;6eD4*9V2TL9YpVLASMwk-$;1!Zrcd-p zwWt&ZAD@Jdg4s>}U=YgcC847#29U-E=9?~+Dc4S?Ed+Y!T zn`>D{o5c5`;!SJaX%o`%|rjPjpt0d2w;{HYSs@E6NY zB$x;wyu&Kt6+1bf10pX#O;{s&Jsn`U$SA8p2~9Q8Y*EJp#|KEswT`PPVY9T>t z>&nNC!*utTvLe<073Nv(a6(`#4@kcr{<$uiM|ew54Llbcg$(xpANrRMF67%nF5>yU zhuVuXV}{5e&x;zBy&{Z&_Nb4)Zm3&WLo=u<#=iI(I$NnB9N@+YQA54HIvVPLY>n3B z583wtdUU3F_YK7mLif6L^qY&zmK?Ux!c@~NTM6)KfstS4I#*)7rYXGZsOz&XR^@a$ zl=Qu%)a!E#46n1pdY$Cf&;#Vqb9TDJ*J-!TkF5l|eb$;XNOgtPkk_tkg{HyjTqK(a z01`)rfiDJbe&+M97tP38qzeL`5@Ef`RI5k-<4At)hgIhm!U)&->+ zl^<&q88({W{!!Pu-E;Jl5mK1l>?0nAg+$W4?5joocLtxJ`%8|-1bKQZrue(Seet?P z4JwtzLR$y7|A*o7RyBd&*qAC~VG4mrf*`A+%AhA*Eb*~|2p0-F(Qkc3v+5PP`F*KM z%vsT6G-skV4kB#?><&U8_i8#VnOoO2f73CkHh2sMIAlxzYKmj`BT&O1!pFi|l_u#1 z6C@hnT?LrKSHmh7NRpYvi+T?6FvP#yxMHJ<06saa8i@=*G1;T}gZIDWW64c9 zhhpeUs`mS$j42OqNg5@fs`WDZbCmX>A-1Xb?*Pq+QOPYgu6{13%fq%O`=WP%c2SYM_zEF>6Tihzq#tZ8=8F!6eybXTfIRI%=&t-aRT`*-Fm|JlGAjDpZxT66~gAm;W-%*BBu6f8yGBA3-7F9ritzPHrmIC`CKq`o5iJ2j*>MZ_@skEOsxC9)6sx8^MtYkqq@u}2M&LmwRHl@|`I-CSJ&W!m zK|ra^bfo2Vs#q2t@#AP@KC^}PE&kH&xlSA6(V;e)&dKAnK_@vlKDXNY>lIJnTtg(; zn61$J*&pK!hx+9?EWrd_1rthn*KdpndmVi|+L^PcqT=5E-HkBl-X8o%qn1axOWsXs za-ukwvVNM(!cwRRVcWm+B(Q;R*!*Fi;b%XEtMtNWWj0X6w@p`h>?tR1LqI>8eTT@k z>QnP>`ThHmbQ!D?5{=})*L=ZPk_PPqK}5(N6{r56MfHpoEfT00sk480cfJ2(m6s8C z4X90(S(~wzaDHKbA~r-?`#>e-ulN+}=?*Po9+&chPB7Kq*=8+qmWlx=L=E7e65nNL zB{x){U^7aZZZQJaXD?_4_m|J6c1$w}4Xe_Vg(ZpR;!Ty#oWG0KkaE@HqLZ6{@p-aP z_aRu8OcaFI`n-dJsMKooW$NeNPT2KGE*Y;kCW8VgKqX_TW93!NAmUq&B=e4>b2iY_ zM>L%KfnlYtq9B6|m7RjQ_nd|Y->VXU3v~q2a6b_&FWiW4?}HU4p;L`|A4$g{ZjW4z z&<`au0ku!RIiT~(Fxbg;p7adQ zYZ|o7z^7ciSNR1rkYBi<9{9S+_Uy11dB0JA5U8?P$wWSWV!H^DH^*Wk~OM`<&?3Uo)Y zgr?$u-SI4~3=%Asyow*+V>tZSOzg`R?;GIn&|gCR3p~X%!=;h{;3Af2TSjArJXq5e zzW<@w-=|EN|9d$mH7}ThZjXnCW%O75PZR}cUoU!C>Xg}5#7wQDTZcecr4`RDMU4{; zkMJdgM<~BF*PiagQ?GXdF-G40?n~-ra0PJR+d_A+o#jl9mQ(XrMtn039HhC0rIB*q z3uuS6KeyOekQVxOxF*sBf=LBj8N7U=Z+|8%?mJ|I)iK0V^<|oGNRo3K(|P>x#ALZE 
zgV@tS?a5NO9+x*oF{e{xc6*gI^U@4%pk%R&n%4pKGuQLw?3)=xk6u=eqW(4={hOVT}%ClSI$BzR`S$nT^}^dPWsEh z*O*Pg);z9W(b4^`u`jud^N(*hD!#j?oV;58&z6|M0w{iC5jM!8y&t2DTKtrk1PuN> zhJY0?B>S>v$cRc`usln=W6DO{JNAr_Ks)7`H>tBW%Y%eh{fZVXF_C+5#R9K*;EXq_ zJBkvu9|`G>j*bnR+A3c)(TO1PY&cmESJ#)@Ix!i7dirxcOgv4F;dH-o6}r;3o9WzV~R! z17okBw3&R>N7p4v$>eT(9Ua}Cxmo-TsG@NRLxOtw%1Vr5Tz|~iNoa?^1!EV3LL5kF zEOaL;abG4%d}J?!YgJ;8oLYWT8`2M zOEK79HeN8xcna~yeEXRa8-v^Cb`F}GoGGUiJDvC_I~PE@Mwu2b-rlD5zgq6N#EKmq zIwPgn_=hi677yO+phaa&?B&7&RB5p)TT0X26 zeq%9zh_ce!WWH^pupwbMSu>b!TxcjG1t&qQBs#2Ybs`+1s~WQO7(4vZo!8{jL6kcL zY`aSXYijb^urFNj!PX*Pfvi7UGd{rxb>jlc+-14*$g9D0-4(cff~_QjnJu!WGnW@` z8%3kKpEx~9S}$l+XiC#msozi;3qdRZ(ljS1Tt0CgJ~#lIA_hB~Vi*(uZ16R1*N9?Q zjpZbCqGPw5<-n(5(Gcj)o>qBM!tW|WOxJU(^mXgCAe>w+BajRxoc#4ozL^B$jGryK z*JJY*P=_Ds{vLU}fVF-T7qU#^YEJ^;$$@7%>JmFC_*37=La2Mt;t~WWN@xKqBrZp+|EV+?xf-sRmj)G1n4`xF~vHwK+@g zMGCY3k}->>Juv`sd~zknv4W)W2h@8VG3Ycsa}cDeu){9f2CxwF)dY((duP9h`t9O!g?@YDOK-&%Fs2E zLM2FW%Op!TEddVh-~bhiP+Em(_#mhXaLa zc{9=4A%C(QW)U2cgoTgE2B@!VNxX~1hyF7Ysk~2Z5eVQYK|(<`jHBFzyhOGP{zq`t zQ?&ZcDf%LobyoAIGqe$iz zMCD`I_zsIwB`ffNB*9655PVMg;$0U2M0e}01S~-))5jCsrgM$5|CQA>qJc)jVj;Np zkFPWaFbynq`KcQU<(`5!@+UO?0oeEPlIUchl74z3CP*VsOk|7jX#r()d!^G-(Bte5 z4n-5!xhLuyR;q$D3KMrxd&Rh&-Ksb%Wzt!o-COZ9&TokY2Z33k2(QQ z?u`bNv121eXY|=t55~rOziAXi`bB81<*Mq24#{3o(4P(Q=7||8u)4ZBwK|UjiyYVn z?G>d4QRx`JU*a&U$+Bawll@LBlHCp*T@b|ZA&o_U|5Dog_{Wc-A3x^*2hFdh2A=48 zxjaZsnRZpL#MzC_b-i4( zk5{x!pDJfm!ysKP?vI`P46>B)c#dx{$0oT>O`Mp0rF4lVk$+nNn%VgY5nf1PNZGqB zO@uDHz;YIu>40|!i45E@a8z_XYd4-P=IwjcnN8Uv!;luumtAMV@5A@c{(PL`dIQIj=bCoB11TDnvlyKFuMvM*dX^N6&4df%`&?td|#9<4T{e}6hGk4v&jS8082 z%d<3C!Q;d%mjz^+WIwf&D){9Ej#{u{+UCFIFz+wR|f644Sgcl#jw*>KC3)X$YG zO05_5<4%(Iu!HU{zVnEd8kZdQxIwTSKKthKTfW((bL~J)Ct~b*`qYhg^HSaF6s_CH z>W(R4a**wfrhkwC<;Nzie<)@dE(7MH5fLA+CAeySDETpNw5(rULl*o1mni@I(a}wx z1eo;DQj6`{8HJ}UI;~{nfi1$OMb}=exv*Z6?Ar|qQBtx87~j0VWyZF`z7c#Qz?2qy zsY?9q8#XbKl?40-(IWy3F=cZCRXd8xO&bCXd$AkFrKdvGUUDH>9Gqp0O!*vposNP? 
z)Ok>i{d~}qk3pnOo{JUuTBQB%i(TA97aanjEnjU6Gn>Z8WX7YB3McggzEC;@ygRxk z_SNWj^N3j45TOKOPMPY|3%#4zzQS=wLz~&LcTNxaUQ|sR;8*LKM7bP zCk0%?@S^oo?pL4Q!gRdzfKH3rY$cqfkm?%REzxV}xPE^67-_rP4^vb01%mBWsO1a-71N-o@mCYWf$!V}XXg1u{nVlNo27P#ZxAJ|s)r+&cD36Z_o{eO zytagD&+1HB;0>rQg0h`9(#}_x-fVF#!I|(9}iMKKq61tCq|7LkF|KrJ{{5 zTxyZa3-pM{pv66f1bS;njld;`-ppiUC}FQkm6-G`Hw8+@<kihk*GI1YDh+=eW73t zuJZKL4YcfcQ9CxLRc3V6x~`$?^>(}ld2)RM5=C`2(d(6E_-p6$Wvpp);=BlyNKsT# zcBrfTwOko2ibvXWS}!u~dFiQQwEa_GMZMx>C(WM7ENeJ>`&{z9W)|X|jBUp@+;oD+ z*LkhAY`qWuIU(j!@a#j*K<}#Ky5;ojG6d0?yBvRBi^Fv|y83fw@q{U2n>pO|+3;<>HA)EzvFqLV+Ui2$mxWXghhE~jHt)YSFic& z(7bll!+!o4)g?*5@vAEedw|zgmpPNV+wc;pI?~;m9HfNnC(3in2$Dnp_=in`nvj5q zQk2?sH|8fDlHoOq3M1jiBi)&?KZyPw-!_py`}|VZioGLqDNKuEGRJP4|GbEF_GLa2Pe%%%`EDdTZyM&0g=fIoX)5c|t{-NG1k zU3wy1duo}o*5&JY0x4h0FR6BvyC|Vg$%I2p-i_y>(9qdW3De``96v*eaNDD z5*_N9^L^LVh@&J}_B2IvLk1HPy<>JAO#uBRU-6i!vNnD4X+=*6!&YuiyDbd_{nmjB zt#p&>Ey1&wGs>OO>k0Sc5*X49xa*a?z0X8UW#>1m?G}-B2+K@JwlANW|GHDh z7Ki=DYFm{Yo3Bn*|Aqe0VZz7V&4<@Rt<1S$>55QF3h%m}D%gqZ#(mFj1P&t}0kf8O zciiP;l6I%L+|nA;?mqTi_Su}HoYJ%%vU;Ux+gXvU7Dxl*V9M^1^?{c8M2WxE2qhxc zmo6-D`s}T|$DDO{)+_pCrozL+ri6mvgDv-;2O8Wxs=_ml>N;*YNe((plSYOZp1Dsm zl(uk1a{`swTwSPBKw+B%6_5?i^qR9#ri6QzELHCw8%>eeRlx&~7}p$~04tBBT#xZ{ ziC{{4nqEe=d!N!p2j2R%K|dcd$xBpBBC^o6Ddtt5cSot*w?24BNq#;2&dH(GtcHo; zVE>?6gTINPU_(1BZd2iH)H1f4yh+b7S1vYo0&9Zz`}LoL@5*?Mj?9*sV-)evS1R+3 z_j*aRcoO+gulP`9?aY@A{khi0M?a~p17D^>?_5NAv!JXVv&F^JotHy>E?CrbmGZws ze2)WEy^mo9I+apR*xDPDT!+4{Y}0x?vo-$1T)OAAGsZsC10AXDUFPl8F2>`?HkVUH z??LZzvvq~AnW4N-T4WrP*%8;sn~?H`+POSuCX@YXp8Dk_S-ZJU4-ZMhUCBNx88e@V zHiusm7kcM$Cayvc4$awR-%06OYKo_cnh>ao?Rav(&+G~>nw8MA#9=74N`iB?j}+oC z$|5Z%uN2N=l@pz_kDKodAHzzsvtv5!4icP)eys?QF%8r5Z+CGL$qW|3>$EV!BF%dq zPcEx%^PE=pFBZsARFdgSzUWurS9h^6VNovUD0%%$eP})@NNUg1w=cGAG%k_2a?^Xz z=!^D_UiU-S86C=h-P{jb#*1M%rnTcyS;9?4Y``bRyGy zNS=UpP3wzKNc;s2SvZL$T*ok@99pNGz%@HGQh?kns&`J1XNKDHZ4KijvsfR_Ou9DB zUH3VgZwQO$Fcu=z0sno?>soW+d4wa z24#r+H&SYAdD9nSbtQ!gC9)>j6`F8_FQgP%SR8wFau2)OQE}I`HXnNuA`C1qI!^&E 
zpCSzwzDF?m#EvD~R$JHq%tWjD4wbtpLffx+Ixy6hT%API?hJ=_Q1mXi!;{O6rd#g7 z4@su$E`q5S)<`cb;Wir|qZf|0#ti#?&K>#I@z(va4;i=+_Dkm0i&Ne8OIBGjx~IF@h)RhpbQuEK1fT2_)Tf5yzId5LOUWPi?~OiG zXlFOiey9g2Sl$1|8Nv@{Eu&yZJTQIy8MV2Y7{LZ+eu@>v3)e6+b39DCw&$PW&6RTS zYdSx4GudlP;>+chdevsOpj}*rwKd_*0$Z~dGPZ6Q{_?|P$9`*}g_R3H;~JYmEk8Re86C@oR<)HoRx5_u*|k4*ybBn&Whspzf$y zU8L3~qEIqsSMRGoTK)uH__N`z0=Lf4lQ`+wj*(H>!4vmW&&KE z@1m6J;;{R}d6K&eCr_JuS{^06eZRwgf6e%U^(=VV?WP^z5P$6hAz@5@_Xn%N;jE{I*hKY33bgfCo;hM?2-{zN8FC6NWQONZ44e8su0A_8&Mml@9dzxra_{h$ z96H&CTOacoGdg$mOqROwesOYD()MkeMJ0sRSyma75;V*_>u6+z&c73~`ifpcdTFdy zsC-?08O5*Mw=yVIWnJ+lP%p#QVnMfjvF}u`oGU{Nf?bs0MY3QydaNJkDKoA#K8UnE zc4abr3-cuB!b)gAA9f!8nF}vy^68J*`7YPI)j(oVDxG|HeU!sCr>G!og0XS?hlPcO zf%P%vrx({!$xpv2p&L_-I(PL)YsQ8!8hwu)!x<7kcdeFg-i*Rt#ho^E!yw`OucR4X zm9#59Cs^jYq?(hxJUTePL_-!+p66@%?FA{`vofQ%Ui+&Ya$Jg?#G9Xe$~R2;&K5kw z4D5nj?yV2#T=5R|?t{f~kuF552lU|IEC6^tW&LOItkGmuu0?rS_Rkd<%~0lVTZd+s zZa>Gj_J$d~f8^DZ+~z3oE5~~Q!JBM0a`8C@rKu*wNw5Y{^QKSsoVMgjQ4e#ms;g|+ zUO6r@eEnQnG5#jGd-qV7w>DSwrJ`^@6ECMpJ44%PM~$9?-Dz5A7_x1f9_q~$uaC0o z8a;4gCg{_1I{Y@YJW@3TU}5I2lJ+g5Fuz!3HhI^oTL)!TFk)hK+*`zG%kdx06+3`# z1twgZ7174sY~Oj9+o;w({{xxV{O{s@6mO%cceFfyH>)x|ro|N_w*UEqXsBPb@)lq{ zv@j*U=BP2-{PE5e&t+oMf{|-uBQncfCW$2WddD8 z>*6NsI>DnHXG(c^gTT5&_cgcx(wik=ad?gMHo5R&k9e)cgkx2sWtA%S#lm+KXdXI! 
z0?)fEnMf+uJXTVMd@glgrF5@xuNy7qahEn9scG9hhG$S+3_Fh>RG)mn5#}H?<2WM32 z)v#@rYgN}NWgZcMcB_UK7m})%0z?XyU4ACEcOEvb*qig<6_*m7(SW8*U<#A~!=V3Z zFX+((QuWSeM#?G$IztKCPgdiy4Y{prBxXJL*D}O~6b7<`PL3!)VWX4KPt?!9=uXw% znv_WKUGPe8`P>=$taW!|>Fgqj$&mf-uU>(-^x_~Xwlbd=`<)bt|^52FWp$&ph<2r{m%3>nqJ#@p3U#u@(e-b!tf zaS!`3`rWSe*fN(HQ%$WU)~27ze5d<|w+@HW zUYLs9q8uqNs!@OSD%!p`n8LD$`0BaJ-!?j969T8Br#HKis9)gmKGOQDeD8H*V z%9|&aw%pYhyCey}JeF2O6R~z%lnm_5T)vz!6H^p+P3!SRF?ClBlQUIftDK*}wah(R zh|>!)7|9tYjI|l2jkS+|M%!AoD=dA&G=VY*Nb^h{;p{|44=BR#Hg4F}{w%mBvy7S~ z>dx%$oVVihz8HHwnIqO{oOlDPDT;hqzM|6J3k5UoQJ|N-9v~Yc6%uDcO^}(p%`{qk zAS}?yD!?5aBvy@Zag1ybJU$rbE^`>w(yXu{ZtYuj-&9=p+%h6SOlD>dTMi) z<{h!cA}7r$x-tWxWg8xAJB3Cg1M`m*1dPofB8X)7TB`>8>(bgMr^3H_ZjM=IqxE#3 zxIE}hN!Y61heH>OPW9tKpByHy>*jLvs#Q;~;(F^B{~NgX97H603B_SKg>Ej(A>i&_ z{GYpf)!S*QC4^ud%xaIS?Kge(xpnZi&fymd&LBJ=^I$l9z*qm|1#6v&Y!@!Fu`=+H zqlBFL`Q{ud%k@AJB0O5GAy4ZXC z+cL}2^uQU}5O0~98p{%fP;AUz_SjLUzr?#dtJBV&Y_sTNG^gj)8J-F2ET~8pTqJua zc;?<++J5y{a%fyiT(=xMW0gjo^<$Q9tNN(BDCaeviIID+k-4wz+>m0oVA8&ISXOpG zJaWVT$*luA9mwNYEGC|>D*Smm3Dt~*`N5vI4s@(&62z~er&4~hPfbx_eDWkmP6{hE zWsT2Y)ge|VX*`B)5=)ph5q25qx`=GQZp!Y!Biuhfv?xC7)mEz`os;kImonq3P# zrDmL!79==0vd)mbma3PxuoBPcA1G3k z?rE z*SyjPWUBZ!E&|RCBVOm}`F!Svy4Q8Fviof=rcfcrmv#jhZldGtX|WSg-aFL=pu3eP zcUxd#@IBF`k7vHr!56yN+oc}L%gBPPRFEW(5&>m6I>#Bbb&~UmHjTbYZjcM-ekM|< zK{wPee!q)hS5Dy;a@o|28NZ8B0wv7|ZHsb_NOsqPi$u3(83YRBc3}LjUL2*xCAZOT zVMP0}_4negWmfZnKiVT^|FRZJICzpTU$VhHpRUa}0SfN0=8j|hAOI7CJaM4C?9TWp zg)epcenbD*&0*$vUIhEAHihY@&o!ryvIznn-SB1dIZ@z$#K}|GoIPWtx_z{AG+VY8 z9>!zFbCG(d*1Shvo6rSvuz~Jt;dIudc)xeZ92HhGZ(Qq|<}AuOi<)!9iOXkOO&Ww5 zTIiq#%rlK2YID94;cCwI7p_Kp$06+A@g6JIQ%8J&w9O#YilZlsz3F(vQdWsS`#5U( zn%4!ulp*cTchW3LvCS?mC+y2!l{Yb3*Co3eoSxD*G5nB%8ClLPQAx`bN8`LOB|Z_S zq(y&u>k-{`e6Wv#kgzW=KY!imx~+zSLhnG#&mo2pa2)SF+#MYpKuyvleCIK1*K+6I z{sDJ8(Jx5Zc6YbYr+)F+tvh8Sy261{W?D!(OpFLa&wCzg)%llEH+k_TDk_7Ws-9|p z>oVzLarZQgK=o5D9xMKZgKRFj2Mw`qj)F8Y^eSg-4b zo(J)s*|!!u%r>_hUDWH>ZjmBiml5hJJv2=$Q3Xkgjp4P^!+QOUnfqfh?AeO^>ROxU 
z@n%}{QJ`<5R2$tI?ph3Y99{^Y5ja=yfI9eX45xiD^vuRoGpKMANnQ`7Kj~CdfCb&; zJC0t13edJb0_Axy_>)cJTzXqqRHg{+rV7C7w9&hqJD>yY+y~xqPQ5<+6n>|d^2_Z)uuE$+T}~G48k4-#%k?ORAwbBgZ8im+j`_{ z%e6GitdA!A6(@(*gYp*w$AZ$X6&z5t6}dxmN|^vtI)q_AYfSzJVbh@z2x)TrbJrBs zF51cE2Mxo*Q|dR=Nx3@^&F}D~dkgq1R_v>U1#T$g3+UUa-CWf_ah)`9T=&ahpzOsv z-tr7tEYDlqog_kZ5{)8S^LW9RN)`Dl*rI*~gjWz~Htx-i53Q$lY7(qt*ViQ`d3$&c zGh9vr)vFC9DO$&|HToyL473r&>M`9GIZ6&e=)RBNAke9YxfysW+9cGt5a}YBNfwTu^jVZIL=AI0-rRXZT*YkGiUyFXE`|9pI#urWeN-e6l97 zremxd(#XFU&)A7GXU{>nf6oBg*CwZxb=9mXeXVUzZrP8++3?Lp2}180iSTu681f_; z!hBg%5VG@0uIXI!-o}M(_u=Bar<{HLf~{!iO?$JPw?gNq#``cwm7)?}lcU=XLxZOt zi+C=Sp9sg^LNQDD@77JdZQa6Jr$&8xZ#e>jIxP# zr@2xD)$;FTixpuSE;H_=+9I28pY~!^M{6qSnsF&%dnMU_`%&euK>hr77?;K#6s{Co zST)^JWjQ&IwJ>$Rv-QJHc)yY`n|WMMHq=+2wf^v-_9;aVOr>i2`)uGTT@wTEGgaOY zoV4%XuCOym8oACmP;j2#bRxC{gRpQe`!EihUiOb6eGy)1Np=guRo0ECKAAeM>s;=A zQv0heGxG3!Tql6zp^Rgr)wyD*KKj{tjr;v0@h6)j)#E8(vJ3Rwk+*wir}BoILZ>XK zdx`VJ8UyY<`NVU&$sATSJ5qyG4^i)!4wkL8-@hvzY1}?HV>C1)6lOh)i>`P7qxjFK z51C%2>iPKiEa&AhB60BaSSVe}qyTV_4M@BE?4%g)J@I-enQat7HmCf3e}(!LmNoqd z^=ur&+Qfev&MN-LU}Ypsx;U;A6>objwA|vv^A=VvzFl69`uQQQ;XDbn?&Oafad-vd zf?i5W5B1fksJGlE%V6;OgJ0>U9Y9oN-puV#Y;{qCs=Xi1b5eu0t(yqD`!A3N6LH(8 zwry}>8liN3zNMCG30Je)7hxZr!z>auF8?d)DW zNo%u0d%0z{FI&UenkoCYl$ndFGW#oA0tjVOTb^uZEtnWdwF4 zPe7X)sY@9%?yr5ZmZS$UJK3IR{~t{hZGCIOz>pl{v@-N^ai{@e&ojO2CC|9#9{wPJ z9Pj9&9pWGky(vg=JK06tveteevTT)(^r5E*5c(DXd9St-ab0UoL*uGfx=x_0*vM-x zrBIDVjD3{s-kHvDLvZa>W9pQ$b}sFQQMm#s|Lzg2N>NeZlIltki?gl_o|Ig+`n2jz%64T`tW6JFL}_ZeMxWH!DQ?a%+E;PsP#0QbgXWZY4y7zkXVF11o`N^ zK-bmXthK!95<4V(^Wky&B&{=>d909#Fv0g&@a>HXYnWWu4`CB>}m*)HBWcda7QBxc92Vv<|iDx^6nz7Qn?fzqKhnaiR=hyp-l+?e?li zrI10WAxCRXu@<)Sd;Ns3BZ}KAsA;t#Eo#q8?~KI~lJ+y?!d~c%$4cpq-G1#TdbQD- zpEn(gcD*m0ogWuBT+~kcARlS7o0rMx0o;7(HTkP;FPjQ@ZIpL*s(`I0lfT3bJU!i$ zyI@|Aj)xG4jsBu|9d*jcC38qJx=f#GcHzaFka~aR{^c8Qp8ab4S2A_q3V z5Q%)cxODTlPCBmd{w@i=C#wEPy-0^z{cbWNThG}9q{(f5>~w?XN6{*s^`UdTt_ss# z@|iu6I5GA@JstoDGTwDZ2@F*Pi{WcS;#*|yW%FLDIc_cH_$ssbB(JT8+~mwBp6XYK 
zCz>cLEOXwT8mlh;W=_y7bnyMO4Na3G-KORW*kVmGD;0BmnqwMgDU2C z?LFKPpUnny6Ou_bT-8d?fCn8{)!%S*-`!X*a}jFEEm#|qhCB^rxJ=Ov$lSj(5R7Kf5dA+7OaYcDzszvJjfh^PLqB>s^GEx-Ho zQ^-g;e=**((1p1ENg2zpKZDnymA!c8%V%G*D`T11)@T6GaV_#$NY=~}Db7;n*d5R& z8=?tBE8Xw~)*Z82Da(%MeCx$fL8lrW9gkUh>||RgwBBEV$xsWXl;2I}Bq}C5Z=T&M z=Zoj9!)~g#=8M2;2t@D0%c=cWK+Ro28=zW|g3LN+DW(#Ttgy-Hm zm^0yieIR+@%HcbxUV2`5iO^@&D=@d0BdJE1**UB1!E16HRz*Cz3P_=(cKSedkXh;O#I;m{h#6aFkT=cdC zBqyGpquw;6p-)j9mXJWAzcn}+_Kh0cM>k7CrwEFR92>JmPl9o!nz9xL##ACr&k``+ zCFu9@_TN|i=?1W&@5Q2>^PBh!J{K2VNHiAd7L&e!6SiIjG7^+`YivB_E-tZ#xm;cX z3b@-cBH5P4MJ6$C&wG#G0k%`IJ8kXSS9b6#pjLe<_{My){-5qjPMhR<{i&Mg4)RZk zf!I3Y2R~Olpi2@YRxxb0jpvz#Kc9a2R!XfHHD8ZBU#3WQUQMss5ez$SI5zN=BRmfj zWa)3%_RmBHoKUb5rObSDu~9S-**xoczT9zfx@FYzWH_wxe2d83wS|lUC;ikD0ZWh_ zrqjN{k|+h<$rodV`LX(Hg>m}6x=bR)IxHfWWm>A&vO9%*HEX4Oz%C!`DD}13I*=DI zUN2M$;dT|=VugQRV(QnSk*v2_f7UPd0~$e#UU@Q26FB-Ad{Ps7VJUypK8V z12*Zi6N7Ql%X?phPw@vA5&^#!m>Pl=k8(SQ{g!NB60R}WdS8#<*y3knvvqNHc3!qB zxNUf)Zgdc$>5~&r2ssh6!}zrwai(iP{|(d)8$NrW&ja!Rxa`OKqB+t(pBiWjXJ_XV zoxIBSeF6*tULZfDl@2RS>nngC{L^%=c-`6=Fqtq#ZAzH*DvN=>n#U5_2L9Bbu}v}xKT9BQx= zFt0>~o8MGh6UZCg6h1!FHXoqWKb*A%HcD@kmfJUCNe{BIex)vIaU->IGMoR)J|;q$t(TxvT4V!Hhx}fa_D?? zwZ&8CX6iZ|UMd9K;ndQgeeIEhO9e- zg+PRFqw95OkTM(b`U5()`)lsy382;b^^xH3wf@O_C8|HbGu-9rL6Wt*9o*iSvCr5M z@<=>)x$5knV2S8hP16g{-j?2^u145-F@K0W0+9yLSWmI`gay=fgF51<({Sm1Va-gH zF**+16b`TL6Y*~+){sw?h&PuGkC(B_B#7kYX-rW<`x9Qr%PhlI@CNR44s!r-?dDUhmfUf?hfDaG?t_QnE=eS2cyM6P z#wND)IGqVBM(y2;%xA4?*m5Na76}y9!bscZkw?ScQu`c(FAl4y!bt{L{|VeFqLLex zC+%U~V9bbS>#x+thprX@{qf*gg~y7NB&@?4)hvI6n&Lk;SyfUwAH_+FJ*=?!tVU|t zFhf{o7+sj5Ib3M?U9k6KM|>g%*XiQAEu`C%f;I9C(|yPTnY`eqCJ++LG~>faS-;mf zZnWX6=M3lGJS2HaD!lSy(rtdfOg3xtqY|dx=i^cxyU}Qdu&i(AXx;){Z*C%4-N|Py z%rm?1yEZ(++BF7WwfY$6(5^t=B}H0zxlKvh_|$~!HsuIdMAe%Ny(qZ9mf4SswRJ1T zTXj5l4Hm2kkh#6k$5J4be5Y!EVu@Ml$xf|zVe5OMu)FFuJ@N)Xr+OWZh&YN_O1FT=G4)F0252uJE;tq{Ch|&Zk~p%NnuEcBn3m z`n_a})x(rt(r%#gOe%I)h$qwKc$v&iUWL4j?nIaBhGFYF{%}m&>pw)bg`B^V#r6D? 
z(_@~otu@7Gs6V|s`Q{W`(nWv7iw#GZU!UA-p9Q`O{o|R1;hwLjs6PT+<)q(m0i+Et zHO+@RN+hXgT(gt)O|*5DP#0;F>Q|IlBkw(WwTbA3+Jiev5)u+pvmTk> z1Q`a(yBLCALq=gg=rmZ~)IHfTe6FF}_Mn<6W_Jg6Zs*Vb$FjewSSpmb@4JcLc`#Vr ztpkA^KIxwU5lxZstpt_?G>&1!WOjAD9({Eu_-x^q6x-=WwNg?coA7CNpQ6D}BrThh z>0yr$-pJ5$kG*6z^KLqka6C-4#Yu*+)i0S$JT%q z(%G_m9BvKYYzmpSb9SA~mgH=&a=})$n%sLEm}%fsS(xnJug@sJ?FgOVW?5dji*`Tg zJN&7x857HOaUmbf-%aadKpjUWe0M7fS$avxWcwPA6dYd-xy7wqD_Mn*I(WHB9YO=o5t=W=u~d``Qb-- z+E!J~!F~#KHE|*AoY-(?RC^F}*c{DBcXxN9uCDG3pWGI^RXDzAPW(L9196o6JCDrt zu1xp*Io!V4dIqYIy>QyNzc;}T`j5Ni*u*~e0mDbLR_k!}86&lsO-)vP5@e{a<8YMs zOuY;0GhSxh8FbohJx(@nw%&ji(Fq6uoi(q`NT(`4w0F;9{5OuW>X8vs8!33 zNb@l9zNuDxY=|QWob#8Miint-eY^6+Yg~lDvXKCUFej&-4IR`>(WGnQFu%2v@5QeE$KW0aX5!)kqOv<(Bb zhd7s=b*q=s^Ud#a!aAE&ydLJc*%(l9r2UGoq_=3u0!d~J)O6d6TZU8yc*3;Q=C*?OS~cO__)b=BuCvLka)p zyD^U46GO6Ih2)D=9gdaUj7>dew%9mN9{Y;>wqL)C5#VN>))or#n(Afl*B;%6zbRy_ z=7TJ}`UbbWgtf3orUhqvBykRoj#PRWwBM`mOsVH4B+G8pu-u$N7_##bh&7(H5$$$0 z$f_SITvngEiWEfEBYkx*_O?ljHnNA0jF&MRwc z>!FI<)LHO&%6t5;i_d`$!~JsO-|*!%{~r90b;`vxx~VY{$Yl3{(5sjipqysng;7Hbko@mXzHefQ zhgIF8!YbeNnB6A^sAYC0Fr&m|sq4FVzqBS8j;!R(&se^JL%->=xu~02ff5U+8$Z`$ zN>*y1Cq(7~8aeh;W23z{d-V7O$DM!ij6T!uL*3ihBYDdR<2DJrL?$fJKFxhIg$p> zkr0Px9o*4HT!>t>-cZK!KhjNfrIQ#;&K5L2)#;ef{cfjtV42N+@2~J~r_{U^nW%+z zer}z7mvkDo?Lq2*fiq*9-?0xk>6gsD+@O02+tUS|ym~Pecg$!mW)EL;1BLI}ef)nK zqN`hjg;B8eG@+U`ZYXk5bNdPBa}YSct6tzCC;I*kBd*-yaGoycOjY>F$~+loE;rKC z^1l6m>i2&L4x9BVZL+FurIV_PhGYy|*j_fI<-fc<2~*uZ08eSZFz4 z+{Z`71D_*q=yQV6E>K;fuPWM7)9I)O&sB1zw!l_euKr)^V7Gr+$TxRm#LAMosqJDJ z^3WN@BxTJx(^06*HVD0P-(3;<%vuart=Iza*)vRbB7E;Wb?)U#O&`rBK(C(kcx66f z#yMHqspcS>ubW|=pKM$#lvqRPs}+`dTHuWmP$7IjR-NxyXS&ciWPr2B7!|}Ld~`(0 zXZv#=;0wiV#qx8qvD8{Yr}2lAE&)5#z-CWb{ElxKvSRzz{5fS8K)f4P2b=Fa_+yKk!YH)<-WO-8)EP_k1nh_=6OhB_b)Qk(_Q*mvz;NR z(V-<&u2)^8;}C8`Wg1%%9mac+UZ>3u-90`UHs1Y4FSnTADiVCB*@ex58E###*)x8Q zn$b+)w%1n}=*tG7{>5Roxb7h!ZxjjGQrFYxX2ehE=iiBoiL<8M9_G0k14v;JXeW64 z$Btgeix>|>HZ0Qc6~Slq5($heT{e-W7H(jfM>OW~aTJ2m7IvAq+*`%7dmhPv?D*;- 
z4oMF`AklT|`<6EPh!CwqXdbos5mk<{w58*wK|TkMDqsM(SQGrgek|` zlrYe7KR!a!5C0%3DVc0xX=zHEPWnN*(`So>K2`5`_^xU|DV`YXZyO`p{K4- zUW;b^`>vX$0qz4DT{A=6bpwsy>+dMK??C{U*(Ww`{qrY@wzE&^q=wfFv>;aX*44No zlJcz65nitdzQRzy@Z2|h8IKR^W6K=N3p1}2EhsD@lSX62!U;4wj8HXJRP5$evs3dj z`&lr)w=-ivz-k>Cd{6jzW!Sh#dax_nG-*Og_Vm$0t%lRmw32a0FPlkbT}q$j+{EzJ zKvA8E_|02xl9Y_%LW{clL0njCNN(w_+tGdH~MmuUUFW06#d^JP@~g&)62AaS-!j}E(AQ7 zicxWM%v8N6@!ZSw&Rdn2B>eNK0!7shAoj6`r$tLTck%rT*?$B=l;?L-8sB@~J^|*e z_|OUkT`w1^7s4-lVq=-cKf5|js=U|Ky=V+PJO4VfP_uQ~v6PspPSCfwt!aC4l^NUL zZO<$_@ub%0Cq&ZGlfj^mMl&U)Gz-B%z;6V&>_~j5d)Xp`dS)VWrR}r(Kc&DZlWb;< z^dv@Ff%GsaxZq{uK0t50b?`t+OiWCd+b?gl1;QUieT>NEI*Erg4)2*!;o$b4+daMQF`(A); zUnFSXKEON_P{rI||9o*xOgJYu86kOJa#NJJB~=y|75CnM7er& zHC;tZUlm7nFG+n7!YNjb@wgxLY>PR zngZ~ucQR9dvWY@K4y`BUc_!Vc=Nn&IqhIFX8F1`|n4O;rdRtVv50nNj~zSZ1$}K>Ly!NS+m04G^P$0g8Kn z!c)MKk-{4w^;-+V)OxH%!C^{t;PS^|SiSs@?JMIybw4V&_V(;kAFCO$zRo+3)Y&EZ ze?yr5et0FGKjWTdQbHjLahRzow`r$nfDRqu9f=Lzv(jLeWoXUops zD`ZnfE}7YxA)GCe%#ytuHre|kqsX4wd(XsCM8DTnr@r5BzyA6781H+%p7ZgX@4hU1 z)V?He4kM;mCnn77ne=&3PZ~^%VK1&w-Q=AFKAsVSy39xUn7{|0q5VVa&-zXNV9CC=%B|Kq(kE|>jbA*?&J(eaO&2=H5JvfQAZ|IiIZeZ~EwF~+1t zA2;E?f-;hMiTPnEnr|{Q-&cyckQrG=>jMwGFock6p+?KVWp8(6g|6>boW?T9NZ=!B z2|Z7qehNZMQWRPnv~WZvD-kJ(Z*hW2{kpO9NvnfVT|?oTk$OqpNl#}$k~kSQJ;v~` zv9S>j(#t8U-J%?nOg;r*&LYzIg%NwB@X3Q464A#lCkDp`4DQPMe)HtX(wiha5^v6Y z%eRre^grBtY3~;NZbBYmyYgqIg%@4dqKrM;1JnQ+T150q=Yjgyu@DD2+j!eBJG9Jt z1MF76R0IV*x{^R;s_PZjJic~F5ym|EHW*<@`74KXX{;AxanSJ28C&(TWZv1|;d_Tj zv`jm59Jly`_GyUm#S`IqrI`L|i}7Wx0!TJNGWC~%*~LF1U7E)RnZQa8aWD1A%gB?= z1+dC;Gg$mfgSMMI?;o8EndFcZd0wW74J1Xi#n+LC?#pDyjlO+B35=&LkXc9t=wyXd z7+2x{zMAwa+;~YZT}3F09;p*A{zt3v{2Nc(^IjCnl|}yUMo9^W^|C$#441z@QYvYk zIlb3mO;(P}eC=tNuqSX5V#~6ySxfx3xKOduC_Z zFh|T_G(PwYXSw_!gO)t)T6IgpdCX6+n5TEEIGt=v+?OrNwI5JPk*wmlKRlclm6b@7 z=IcO!E4;3xTc?$rlFE~<;ZIDQJhl6WfW*KMX4QtlK8()xB<8_zQFjOXWjHk;wJ9g6+w*CFubXLb+u8B;lxizkw6}dJbQNT!u)i` z#N6W7J0eLIW=SR(9Et^frP|QxL7rjwF@EL3eZVK#w+$kq=pOOko?T?s!@kNK<<59) 
zoM}0?T|L8Uc%q+JP+Msjo~%UP4x5vp5Alg&z}#?$neG#T6(n3&ZqM3)E=#w4u2ewz z1cvD((7)k$VbRyG&JbW%7}%uF&L7(LNKI*87V(PHQMsda}whHfrIgK)C1|}ftH2_;#G6A zHRC@_GYj+UrxNJE3Acz1e@bQk(ryFGa{HME-5rsE5IsvJ%e}1MfQU;bX|rDAT}IYdK5ML?b=mA)0KSlT8`1ky;Fg z4YXglO)v(gMB^s+97$JmP^sY`vV@?2Ov5Q`URl_BWctP?tNV|br<`bzf+4Y>e^J_v zjZ+DAlz{qs6iw(UQcbjFD05^R$h*U@p{&eO{iWt$gRXG2A5*de1cre*tvDVwU6lu^ z_+L(2MTb%{J<;SwddoYUJbRkl{|V0Ed$%wdbykzc_DznYw6v+}++3TAkV~U-(Lh41 z3Oy?KFFF*fchRoqchI5XE8MPPK1Z1)Ni1=~D)bF7th5$uEf!39u2mPlZ$ves z`}aT{iKFPtMue28Of}~ z9))a6*$_cpz~Ew`5*gqy;#dXSA30P@qK5#jo0$4u_YT}iBRoE_asI6&jtJ~ayFym# z+d$i%j)ANs292H~W)+`i*5_`bwWJ;|=y?EU5gIa>VWt1D3LL6sD~KMau0gbGUp5Y}o&%55Z!!vE@k0}ck! zpJv6m?j-nd-}GnZEK?!g5N&n>R2nLS&u9;*_(o@IYDndUvLK^=Hi5yo?)5E2;+#l| zv2w#|U1kDT0j>O5iHFbLwWttW9+6aYBiZ=lyBEO$Y*lDHKycR~^6Ydh$?8G_J@zo@ zJjF7sJ5?J2q^QN2Vc=!1;cD^F8X^Rq@Uk$LpGD_?zpi8|f zDF?qYjxIM(qxkeeYVJQM&J2y8w8Z{KS2l48=W2!OI-BjE58efTPIlFu`w>t%I|ZO} zuyiz{)3TR&%ed`4(ND74a{1BW3isVVLdW{^tSssvMH%UAecqS6g9;;bcj%2t`NIyW zcXPX$#|voRE&MjPwx$mSd%=zk?wWgPoe5*OsK=EfZ^A(T@9ovmWfAK>M<>C!q~gDn z^f=9L#akh&yICDrud*HIzcfZ3woaa=W;HfZeeCNC2LEB!VDLXM!p}igg6Wo~re<82 zaTK@G_i+;Nr$9>VZ^4*eM?b5PN0Cb13N6Nn!DBY8Je7F&&mFm<_m4U@Waca#x4$es z2PlHHY-NV~#H&EhVdd>umEbDKWLvsM$Y)a~a;Ei$087bg7^aW5lm|nO>_25C|1Vjo z1EGROpjo`(7TaH0-Ji5}QWm762PS3A0u2q&Nex3k3a-@ZQd1e%n^z6L%f4SNK1V5t zb=&q1ra2TAg`MAJEt*gz%$YAM@h~(_>eF*%#KAYPLztKP zSs9!8-MJO&I`h*{(D7{`HGghwN)2#Ii_Du6tb?co~MPD$jw@r%h7v z)<&ENCxCD=4gA}hQCqypv?9^z6QL=A=P>FBn?8pX9j^~!B4uaCN?EaN(9$q+Uk@Z* zA!rBeN6`V^m#yWLJJ9%A&5wfSq#+7QgT+oV+29lW5$rLzyHkwEcm`~=*80rjbX6q! 
zjQzbi;AGM1_EQi#IstgS7o ziL)Mc3~{mMCoepq;wcE6(KxQC?ybHFSRZ|7ZmY4RiiQBgGC(vKy{K7_Qe}I-CFtY6DcMDs!+emk* z3U-!oj$nh!aMd}puk@A}k=V&$8j}62_dLokpr%dG(}An2tH8NgQ^^Ad{i14hB#loN zIq(Tao{<`%po1Jp=qEdg9e(kkXDTg$u|+O>sU@QEPl&FV=BL!bN#O8KRgrFhFGCI_ zkYG00USzA}ge6Pce&AG~t^{bq)<_kFXY`A`3G?t^SKi5g$sAP$4tm%d%*PK>oX?C+X5 zxKyPdfwEy#5MDwOux$5HD!kw(nre!})$&*f;s8Vdz z+)H^ox=WQjOa<|dIw9Iz`8>DT144eqe80)w$2`4wBZ>K|&HelE(Yt5f`DtY$0ueHL zE$av11-rH1Uy6!?oQv+{>SW`JJ@eHB3j}S?pH3L*hI3f8ZW^c!IhpJ%=Li-@x2C5R zCnrAznItY!S){$3@yjr5cKTrzo5nc40N6;VKv`OIh8bLj1oHN1!IG>fB7?BWJmj(` z^VPdqgXY?^4=rm?V-dYP(!2T|V@zRq*kHB}m3KkE9AArqDeFTQYX5W_!;8=OCZOLF z7!P2h%#!?)uG>j;b3Q#%ujd@&I^=!(Zp0Nm@}+5%Y!W4n1|Y5G_;vkgmdLVaq$(vp z%SEgdXZ;D96odSf%AZvkMDrQ2Nq!yqz(~hjEFg6&@WfC0ftNI^KD+y9=FaMaiGn}` zV~dN*vprDi)`QN6X>vbV-PRVoVftR3oy5=^i&h79lTPh2M$Tcz3Jsfpqc30U)bgZ0 zEZ{PFM#AI~I3oG-&v1!ijGvN4ca=U~k0quTz)p?7_jUIv4+otw?O=TE`ULa4W(p%# zVrd}Swvj(_1CsrtgX+esq9R9K(9Do^!13biSB97)W7Ox7d7nqQh#caqKfXsX#jlNF zS#C{T=b6@ibA2jE<$ygQ1!o!l02**ZQl-EvCbeyZMJk|At~(HB2LK!ontnw(1(&hQ z4$ewl^(55+lGyqu!I2IHdajnY60}X1qC|Yv>1o?{s%*~4QVe^r!_e$OaOH8-T`}fU zoiG5!35@N{iInax0{s`1d*of6-^kCOA7I+y2;0X`(8J2R&H&Dc3eH6&=#l@0)aKC^ zjAf6Y>7iBh1)w=TS1}(X4fDiK?#j;G4;xAH-)9hO9#ed>B5|Jsm?SJ%6d%ZekUzpF zlNu7g3boSVd~^IdY~mpet-@ikO&Hd1qa>{dTz3N_9%{YI)I|LBn7OiY1re0>5$e;s zI*glX#et|}HC094$ji*yd4Tv1AS2}iNBcuyd@Db(T4m?P(faJJ1$$$}- z4<_V}YR6WEJaYPh`Y!6+;#A_Y5m7D9q~Yx>T1qOud)|#T95Xhl?Ry{8`&06Ab1n4M zMwP8}w6x~*JD8v2llm*2_%+*Hlec$2-?s)O!b~US+*OZi`e86C`1?L>xbe)iPxcAh zJ%;(In4JWCL{50s;0s}P5{fo6KK4uSYG&#eOJE6F9G;W+)vqULecxqGOoKzWaP9=8+fPg7!LnUR`^MAs~= zoOfUtT8w4g0Iice#I08{Qms#M5KQ-{Tr(x^*$nttmYaHTt>U%It)OEJN#sKOCa5^B z;szMh7Fpi7pR0M(x1Lx#tdINgiTjVThS<}VTQ>%R%}U`%l&`z5IA(J|X%(!mSEaGb zO6+$mKcxi`=gM~Ie#PP8)NUcL0+abTx)|~Dljemyz~!+7rCPksT^dU4S>}CmQ6f^~ zGbRGtASC{2Q2mRDc|vM~R#aClCX&0}G>?w=yy65_n*t)jG@zAz5#u+3qiSl*@nmp!*0Yg+&2pBO-a}5?_eRHw4*~NF=BThpN)6tt z9v((-3|g%{Gc9!jf<=&Fa5q9QE*1y7PY4^>v1r>M#gB<#i(4#sa_Q`g+*Ahoi@$vs zFiX#&!!1)~vP!%ehQp*yViGP951BG!`zT(LVWf{(F~m=6oHKWDS9R77Bg$FWsp{y% 
zLX=@kKxVOu{J!iBY#a$1!4A@V40Q%yJ_*TE3A~EosjPBCs14$V$bJ_Gbu_<~O{GBV zsf&G=3HvcZ-XH{FhD!`4R4HMe#3gK9Dh`y@bi`m&EKzo+XtFHH1N}ClWSR99)rYpLTQ1DI;6Na`DV`b&1Zkx>E zlD^PY(^zsK6&`p1^7my**Gxe&QjvmJx}O!SH7LFRDZGJ0TC3}Ep8Ah7@$cDka^s#Z zyko_%gM`_5KejD5F?IGyuBOJ4x(*Wh!#G|P^W(Q9Ak`MMA-;Mhzu#c($JoW6sn2kO zQ8c22iuxy$t&%9Z)W1du(q3LOyFDU5HY!20ME~LFV$W%erAF4m`?K9nl=sx8{zcU$j>;S`@)~7 z>*!20g9|KmCK~^l${C)pTyC;FLU%ggvDpr)BzNV75G(q$IbiTRz#^%BS^*tPhAH$H z9YSXo=M~ufI~wT@k?{ zd#SC){T7-@ltD%r&XGKJUGvl9mufQ=5-XpS2?+^?tutudZ|njI_a)it)}QqlULazseK zhk%nmx{R=fxOxG@9pcBNSXrgH@5lGVjs)Q~4?eJ(d~+^jUeX2Ww(*Oq7a>nsqMT(V z44xL5i+;Hqw^8{G()(jv_hV_^Zh&5QWXxRxj79}0G_1}ze=jcW$5DIUU6UD^fy;Gf z?Lcx=MKD}ji&+-m7(K<{q$0$yDyNNddA+l(Dv ziD(S}{4^J_Io$uJ(8EaTh*&ybZi8?VR9_mDd&Ky5Ug%FolJ|PU6`4$CHQ3~0N1A&Q zAH!`R9y9bHX|GX?#vlpPe4RI~P1%+0W=$os&WeOk@~cxCK1RJLdl=7-p@=$mGT>55 zQ)A*_e~A9~ZgxtY)nx*m&2nfiazmH6L$I5c{K0d-?pB2Z?N|C~PXd$g@(r6a&f z;9klf=IbtIDRP7xom6FrTM4kThLe~-tb5ppbm%$RTF|sQCxfAaZN z`6Dc6Q89Y1q0+qPS$h+YQKY4|S##9h7-jd9j=~)_c)RSOgwEIMsXyi0ylgd&O)a&Tuw#j`%wed^ZgZTRnK_eKEjJgDQP+a+S?0}W! 
zJ>z6-ig#N!&>bG={433FwQ2SX@kgke5B=Gt66rxw%l54L`RRo0w;fLMt(S%otM1l%K^+u`| zA=p2cvj9c%0SwxY=a>;}E-aJxkt1Ycg3)kw#Jz5&mH#i;=197-GO`yML>{XXHIdvO zpgcBTr%c0{ofkTVq1U`9V2(a&?3$_8b{j;; z?V|=O?_BIqGsOjU+p_y#Az09_G7YyfQgwPnN95WL5{|zx4E;yLq+kC+C|L$Dm;s92 zkY{73?vAVe?p(r^@FhGE8d;5WQ*ZwLFX9$)5wd=Q-tDG`W7ib0 ziFxHQwvApj{j*=v%~T}b#|o>F(;2DfWco{OyvfXP)I9Y2P5@REF^>|fOcBpF)94vK zMyUXrc#u(}(F=rD8$To91Fjo+`#~j9npz72R3^Yq-Jfkrzua{Xu_HrYY3L>SjgGZN z!fQhK+oe6<&ElaLs|1Hl?;U;j%o|nMQ;Q$lHv|ZwD1(X=t^Qa>K?vp%$vJ*Z7i17w z=DS-y{PK^C1bdv6AERhUPD<+jP!b-G@3ujwpQe6OaPssZt5hxOO}@&BF%wXLbfq)8 zDvsgSi%;gvRz@#1;m`+-RFRtx3=Dil1gADWUw(#8s+B;c0D?~h{8@ei;^OrN(Sqnscp`Xk^sfZi6Th%C2=GZ3*G0<$*-JR#FzUq_s2}!U@<0j2))(j zd@!|+4jw&OVs_YyOS6Bh#!F%NXAg~MjM4OLn{3xGocz?OnPIagn1Kxv#GF#mAQU8k zeg%t>eDUz`Rtdg85x9!$k}&#W!U$XShtt5d7O*Utrk8HWSx)XWMj|1KXo$=kwBJVSmeXv;|VqpIL!9ag0c6- zAzw$K{Xqp)+7LIp-DJ>*AnY7*7gOv2T^c-CLT!%{lkY5EVBi(;UEql*yTj+b6eA3E zDlLoZ$NZxIV`3iil`Dj~D+Tpe@;9{w67GPG1v4?LpUZh3+j3^2i%im9l&cBYq;qSs zi`^X?nw3O1lf`EOK7qep; z+49>Qs&!sL{CUPHG9K^G`A#WZ-e(?9dUmi<+lkC&r#Wlomb0z${(dUHsWBv79rs8(wlnOG|jX6c`am zfKBVZ%F@|Av$s}dn11iKe(}v1GTLTdbPQ{nQN0*q*U0!=6`#V46TJ;hG?lt-RjP6l z12wzDte@-C4TOGKJ(Su9;6gO|#&y*@q%7lft>&Bbbm{h3Lx&oIU(GnC{flKP{-*+v=hJ&JP}*OU#L*L%EUoUz zF4$XZ7%BgpkJ$rxrB5P@es~8VUxXeYLpvfca-({=B1hx#Zig;24&9j%iOwE{3UXOw zE0?z}v;EB*8kZOfV@JQd0*xxyujR*GbUxu2F&89LXV+zjjke!mx2pQ))hW?lxdHYS zrp>7HoBXB|(Jj192n#kQKTrF0uO#Th2_ z)qc$^Y;b73!alF2^!k%Jx64~SJ(Hx@A2*-~X;5P;GIXV?6#UC-@XAY{esmy$gB2Z~ zmj5KRBJ@CBm3fHGpg6TqC%7VrH{IxiYE`frMqOdkBK|H91Cx~s;uwQUnPerp-Q24P z=m+pWAr=q7xSIKcpqt)blb2&sj3YlH5)7pBPb5wPV*kwK{oFnBoqJ}xRU#GY41Ht7 z8o6yu&&|Jml|;9RtlOPe_|s(qXPjQ_xaii;H6t@{*Xdy`IkSOUAOb!n zA(Yd>zpm%ll-!xG>a61$?~;G;&oP^Q{P^d<7uuzMm4o?gzi1jUUoN}n8 z^Sc4L^r57~_ih@Htx32Wx^NCR{{4{qK=Q$+6XI)rWD-aN@R6?54XJovTE>*#r8mhO zcZNsQ$R$!lmf(jd4VGdiQ{?y$yBRtRVJ1lw1&%Xmhmq`05CWiI$?7{zq-J-mL>Sl` z3LD0T&<(I((&6gVCqJ9yB8}`0H11v@CezfeN|A@KMeux6rOiti&6P?Gu_L?uurCv| zT3A<)zM3)pbforPwWaXh^<{AXd2xT2yhSGU9;NgwJCc=e^-Oi5FP|wk{wTjIhY{mr 
z`e_}%sGaPQ!V)qgJsl<4Y|Hn89}7Sk)&q5vCMGN48$TxD6(J?)q?I9%fEMNBe5RKe zhcqpLB?OVUGy!gEr{QW#)t2Zy=MHeXGB; zMk+jwub?Zpc{~6m)(B>eVBQco$w?8;d{uJfnW~a`g@GtMAzwVy=p#JlY}a~N(IzDp z`?fx>Q&Lie&$n*j8HQ=b1836jBPh33YRSY7C}f0X`ZW)?lOVIsC-?Iy6R1x+Io8ac zEE*&mA|$ij7^Z)I;unY2m~FY?Qp)_mzX>!fMJMKh-QORNs*^l=1#5x3F^}^dP~c`| zy}(BQDXGJ=dDkY|qO2HZy#ILw#^-V~4|O?(Vd(B5#_bR@LhNx6iO|w(W3m9h+-6%$ zN>NjqwSUWy%qAlSm(5?2dn^TdlYgRH*kkOz@cNbHl*uO~ab#2=x4Cx=oH?c5B% zi~VU@N&|O{WKl54j4!`dZuy?5XRIOBKb@*it4_&n6WLB#sm=FkaxIYb@g8rbuHGY% zVLWj2tVuf_n;xY6A@rWzQ~d!i$|0e@s`_B*;4zCM(Fm|`Zg)arVD{Dy&nnI-B;JSX|~?Ob|+{_B+apvM9bn#JiwNJEdh^kXX?&swfy zH?U*KxnG^ zAzeuP`OEl6{i~%q<;oOK1v;OMtXrY8Ck^@#_@T;)7wlh&l4*Ocy*Iid z)esZ9s07eln6S*C#8bH$gYQ>k4F+q&IItDa+VNPC*CcK5A*r07Efou7`)tRyvd74I zPho?)&#a@?0?td_{hf?90%U=n)ur!O+-DVb7ZuhQb?0L{>$m!5t{VYO7RSz)3wSWw zVD>_~+0SF!S!C_$8m@RhPc0VK@+h@OH%k1{9s!7=pnkT=Jk;?0P^X;lF=o9(zQnNit`yfcuuvXZDj;mbT34YCJAZyq{6|MjXTkdf&KceIuO#`5B8(RN=Ix|cyD&_C#DWi z&R+?-Lz*KTvYY*avcr4rLpA@6x(`jFByZ#$?VBB{ciohEe6PBrv&Ri$kTy9#T}Z>h zY&}fkrpEb25AU~X1O)df)Epc}aR>)XnagkT!V<4%?P$&Xf_hdK|&hq%>tzAn|%%(95dNv zwB)-FFWE4a6y;>HU?fyxTn^5m*pp$0jj%Gh)@`i1GZvkIVpM z^N*yF1mjC5vF&$xShS_CP+JM{%26@%1Iw=FS}V{Q_XMpL-(_HVpifXnp4acu-LdyU zoLU+ zP_I{NYN5(9re7^4+a1RB2;i73u6375kK49O7{vNcvB8vNTf%1*<*NZV&^+_oT?i99 z)*R+=AFGJ}IN08DKAI;HH&^ESg>EBN^k1=A;JAYnP@boRhd8${mdKm zQK%1R>~q1}A@G-mI^<^zg_SHVbPsBG4|Z;^U)b+f>AhPb!VtKNU@<0wyo06xh^Rh# zTh!K&Vzm|*nRy|htGOkxw*kxkx!!9=r;Fl`2|vHcoDp5><)7}wq$B*xps*el{InaE zVorfN@)D3o3Rc#P^ib6B86Pr7GOcUbOGm39igIZ-i%gtLeZ@W;Iu?SzU-rTg%mZz) zJUK}Q$zFp|&x!|SEyaM2npO%<&%j6K#LUmEiw%?gbw@&Pjp9Qny-p;m64&f@GNn`+ z;rgdul)=Tp-D3)P=`N-hU{p+3=_=XQHM>oz;K6|bg3gVU$8KoNsbeB$%}7;GX?Rgd z4Wlerkadl|GasNq{FtmYP`EF|KC?MmeJ=6XBU3&;)2C;X2$D(9&+TsQs{7`x^0n>c zTXQBXV%Yh-40Jv1U5NSBY^Ryj1EIMe!?l;)_1go~ce_VJ5`M^I2~1de;%yVj=7b5q zUT3fpDpVFwQysD{d@Ayg)mYI(N^2=OVe$6jCH@dzg@llr{m~_L?`C?kgi>osH?v0G zo|`X>w_;4=t+V5mH(ZxSk|~Eq^2;Zs){cJod|K+iZYs64vveqanA~z$SHE9K3AlOhBwe%dvN$;uadx2k1xezS31Sfj0Qa<*) 
z^U25?ZapPH>HS8n5h3e|>Mo_g<-ZbzZIifu4Q@B}^{EqF=aP=b_pO~wNyFv-=XDQu z@EE7ybFMn|>5`LTwaew)AIDl!wo2djPkyLa4kC8=Ha&Dy5rC+UOTdQ`FBePC$so(i z%TXH}8%^+VI}VGORSl7M$m5RJPk&vI8QUUg=UH&ZUEj@`eI+#)m845#if^N1zrhSX zKwaW~9-mlG)_A{7X#3FWd-FNTsq1lmYSsL9bB^!Y6ax^0YASDRCR_NVXcy4*149wKW(u+Z!zIDWtCwwR*}hoq3`2_{$A!q%I7ccr1GRAsqMJng%k z1YhkoxMV&Q{*?KBoF!jo$noHW#n8E|nnQt!$r9>TwLJ-q zF>^bYCF}l7ciIB`7&XBkrf;Ip-Q^C=_9g5WBwVt)5bKsjs!7Q+r<)o6+s-fZ41za=(rE76! zuXSyw8q}o?HXps7KXw+Ea5#|hAmXpQH=(_K#LHPkiIZIRsn9iQC}}>()Ow+0y_h?f zU2-mN!THlu%H)<3w{fq#u__NfniWXp6y=k3@wsWrK(H^8wbnSKS`v8<9T$F{ewZbU zD`K=)_(ONhZOFXQqYdhn<^B0i*O{ZCVea^9i3wEpbu zYcXN#8r7e~R6hThNTQ`X;gEZe_;yHP?niFo`ndPqyNfqULjrkJ>7u~IA<UZm64My6{B&E6ruYkTm%$@B&VQ%QXijrkC0s2X}Kki>spd zdDE}_5{`=$*@xKACYn-EH(#vqewsSM<8HTg5AU#f+v+yN(e!xg-W)QfjPfCL!@O6O zn{O-ie%*f6q2XuCE$_Y#ywbN(+}+6Wim7_FIGq3>Y=ih6_;_8Ox-Z)48khq-+6ZDv zdO}@W>Yb%r_S^G}E?lyq^Z68CnYI^8B2Z8evX!DHj;Sqfe?pdbxEf=^L5UNzLf_>xcn`^Dyc32ma#k z%3e8pKx{nLy0zRk^e%F!!P=+F;afYZ4o|x9O*GzIyO&>PeJA%tS$Go;QSPOSksr>+ z*^IY5Bq1K4?$EBZN#ESf`)@Wm+CfCh*wmi+e z9&8EJf1W(#F4@vu{?sau4c*mC7m$(2T1C0XP~GN)z-gt&CZ6UKy;3uZ6LPum#&st< zmMNVCJUy_?(LnY&MM>Vn`$HjjYsc)aNv#$2oWX)D;5{eVV}BWO|N85SxA^nt5<9=> z?a9Q~dFxdm-*)G0lfZiFq(|S+)b;PlLIV(;8b+1~SoWucoK>CcJJ_h%hO8-@arVct z=PX9FKF@k1=b7N4>dZ=`A)W%Ro5&8&&-eEEJ3OhqMqN4)`d+I7IG5>F@;-I{G7BS+ z(g4ZYYZA3hVy;s2$zdW}O@-Iv`K6D0d1WpP-xg|lRoo8uU0 z0Dt@ihr*8L{m^2HGCyskzH4`#7nMNl6h+JojyL-5iULG2f=MEDj83 zCbc$eh2oPdZn-(Wp_;gVyYr%GvVDQ#sT*I5lE2-=2RgAz5$}f3y`y+4?5cW79v$0qr~dx2@Gi=*ri z$7yG8VZGMeAp9c`1|ff437050=U#p3!wQ=4VCf$?U%(LiQ}ckw#S!!-%_}3HFK-_i zC0)BYkV0v4Dj4s5aOFss~BCJHgUF-3F5O(biH8KN|!a!KTTdEgX4w zRuPQF|)L|DBV7BKAYXIN9Nu! 
z*^r4l_Ye~L#y5EE51x3?Rj=?``&YkOCw}OxQ&2zO-Q!Mg`(^1im9K(FRCO-Zz3a$r z!zWsaV^=8=GCwK(h+fl@mHTQqcWTwzBumiOi6IsP<4K>}B~!5jJPtvKyS~@aN9~1o`_~XnO~Qp9h-)S*yCaGr`Efg_6MD+-3bC zzp)n;K~uT$33~$~M@!8uZ@hceKYWDFt3~$UA~q7XY7n#XWUfm0f`1-?KqWmyGj*%H zy`WhB(5}{qUnjZB2iQ}zgG8`P-tO}1uC&C77~f-CZGM)RE^zJVLl7|H`t}X4 zEU|^48SCrE-;(vm)Cqx`6Arz5~@kHq%v zE_e)im%Vyp=r!^Po-`FaUxpMEKPACVP*l6*)~WBm^r`1z1wP{pvclUzZTuO}J>3uD^_K88 z?LsMt6EY5!us@|Kzsk*iex-i!$#U%@X*Xacx+rm4vwpnoJak*xj8l7#`VulPw(;TZ zm%79Hr3Y>miCZ4AM-{K)45LQAd#zw$K{n1?*=ag}{a4+d={pw_o)vhSM(Aaqu9#nfQklDG5Q4R0q}9r~mO?mtUalL|}N_x}hdx~rD z(0H)Kw3HUnNp>o1W64#oqPvjgr{s9xECzi}{8}sIFcrfRrDJ^HIkCH64WBZKo(jpI znzKO4z1Z9;fM0sfz@zQYdpf+gs6%sZQ{)Q!e7t#K>9I)MYcBJTWUbV??PFg`vR)EO zIHuM)n#Ov5A$`IaMcFRQ#Jz>rX((w|zu|uvBlL)!Tg1(C7V+gb(y^7EU1Dn7fySjp^(rs>p5m_Re9Zeo z0P#>CERBoBDr=H|c9ub+%xj7f59@*Z)iqI|8=~|)M^SEHr1yAPLS4*;Yrv;sG^XJc zHt?P)lJ{{(7 zPag8~i?W-rjg3@6e0>qSl&$CKhD%vRJEt9U!GC*BiLyqqU|$FpT5CST!tUZY>zZ)N z{~Z)Mea<5H+)o_%JG!NUSRdN;$GOX7GRXnTe1jAat+o`tDJ)d)V|D@t?ECszL85}N z#+kAbAN4<%`1>G7B#NH3S)Gl?9j-|V^x2;>M=n?xeK`ZFX+j;YrUMfIa6-b6^>d5V zON6Gn#5@!fXV$yw)~*1|i>G>a%Tz_ZK5s~W<1g~N=)>&lIjK@8%^rBBqEb)!ui}W3 z47AtuR+W^LY)u_aUXsy!a}e|r=O;$_Z);R0gC!M4)#Vo)5>=0v!om}zOExRCciCZ zr66|qsi)j-i-n6S{Go%i3QphNSZ3`8P6B6&+yuRJx&R~vx+-4)UheU=sM%ls{oRX2n`zlVMSSWjC(U@;D_9VmYemg9K#$0bK6yX`>iTLTJS57 z0K{va+*jEC;K@ouHp#hhI?5st>{xPEzVjCjXP-BiTQ;MxN9m#hH!?kU8kZ(08*0&B zX~Lv#9=mRSY2!kW;oqN`?h7$b&IDHeNS{(c&QJQPzq&+V)IvczIB;iYXL##2by_?P z!ONDJx2Cp*^#4udpO5m-#GwfKJo)8a!8AWI(=vN%10+Q}eQNn@IJj$*X(P2zHT<;) z%6`(i{vPid+>b5^O;r^ojyZh$Je!tvC#^j4Bgt}ZL%yQ{jS6Xz%6r#?om*R%XAQ2a z*CY|nFiEMZzXL10FZAsEhhg?##4f43z_-`+R^QqAl{o~vhN&kf-SCwf{+e1^E@eCZ zn(IB8RnC8^ObPUr-NpB!k}fFoom^40bXZtlrvk7A8KWL2Vs{e;!P)= z&J76-2yhCsHhDKeXg%{zZ}Q4RZ@5Hz_+7)tTsf|8@m$~b`!i}*Xp?>BU20c+g)ZeM z5f0*xa|*c?GXF&{X?^TI!?1+iosU8(V*5VLyCy1jRB=9lg6UWMyKqqS+K)Qlzoac8 z9=d1aVf{Pb@?So70smE9G2g3{Od;AxF%O42FC_sk23U4wSn9oY3O#*&{i(@G$>egw ztX1=vn-M@eC&-0o;=9GOcP<~lEN}#XHIs1lYtB7~@2x`}UrJXs3fyz0*Hz*qcOVWP 
zA9%Q7LV2n1jB1aK4Z=>Q1&b8jxGdYr?JEFp|3$|MQbo_JivV(sQbqOe)(E^osQ5{j zLQ=dh&^hRBt!k(a)tj6rRF&0ph`qD+PYH(=g*Cho@=OKBFC{l6vofscUMUp1YJTHp zrn*0Yr{%ueha7}!S6Br)GjzxQg~{7FzChg4YzjNynoYP&C5K&BOaf)BzKem5x6)c( z8rk1u0*-2Nqq^m&g7FpQ^?ECZE_~j@TFJ1mQ~suqlyj>sGZw>+;;Uvz_waty6VCIu zNdEp~%stc3LEw|?)}=+~_y&JSLRV3~5w(J{S*xyg?|1*rnoeA^5$8~(bE4sIalF#A zl5WVTfxi~6@G`vD%Eijr*_k1M-UMmSrHlMqFfgBKJPZjHghcme6ngbn&+H{RB%ej4 z$JL(GA6Sk9@v^=`cexI3eqkXk0Ub^2&3Y|V3G0<#=;^<#i2pGzbOq(09{%{0{~pvu z60DWoNWU)ON0t7akM}2=K|A;ae!i0Yz*u#qVefp3jd3i z(hvO0q727*>W*TGg2x=*I$69H#4punPcV|!ZeFO;Dgv96Ey1}I1M@l4Vy*M=Hxw0b4yyYdv^I*LmjHdlb{h{kf`I$^iTQ<`U}>azx})+ zG!?J=)Vq?OP6}Y<1`_{C9Mo)8iM?PK97p}r&f2fQ^S^}F{}DFYa^MS{o>M|tJM5nC zyJz*~%us&%WCL^E*``iJiQ#wWy^bSFlyUQ;f1Lna!rIe;Dll-D7lzJm`pc!qfmC7@ zQxl>X&tIbc-2 z0d=ch!fSc8J=WS#(Sb_h6Dc};n>>RW0{<#2D~aNG@*ti}uv~Oj^gsUg=h1htOKeFu zB~ZITbZ_UJjt5kM=DtN|NjI~4w&M&+6@_u1GN|9eUBop>h4 zL(ZSUOC<-5!!C!)O^E~$aPn$h8NTVC?`gFy*AJ{P&0gw#gv#|7`u}_eEDFj&-KP(S zk5pDySIhjw{ZDXrC9^5vCHDM_jOT~jaWjSgdEyrhCDehn=Yw`R@)Q6%x@hvrstp3N z&ETyUW5K1`peFD$`SSl89V=<$1c|C%tFo&X!wGR$1bCbhL5On;@({Y(Le)+>6zW^8 z${dYK=JR5Wy$rI$b3cyXvoN1+Vx?(?2haZVI?G6Darv!-JZ}8TDX+WmyX1ZBfGX91 zD)*zS(_0|$z20Y=`0M5F^seFmV4nXwJV1>xATC0B;-`UnNryll0s~m;k+j&Kdc6@L z6sXB+>-OUs9T305n>zo(2R+vQco0PIHN`jmc`puAC-X`|t%6-wvqpbA}?7i6!S3Hlt6nckJA;*QT>R^LjdmP@)mzM^w`y&;yROkLhJAymuR*KxT zj{JZbBhJ)cA-rA?Y-P#|cQm>2EXK{oJ*an{mTZ^M`y<^adNU;ak$sfKc>hvht2k*L zETph99r8gZD$0`~TWEe{Yk!z;^Q8n-#`w+DTI2G6S*M@kWtG+t;~*+D?z0mvFIo{D z!+JmC9{Tr;iyL4M5lZ=d$B>hW?rQcwi3Um7u)*qC- zpboI66(r~n{9dOE;=9|`0=lKr@4d^;qS!#qYzY{=!STDD|6-%Gu`tBRX!D~=SEqz= z*75?gGf0VG-^V8aX@S@GZFy-?rxYXBDAZarO81PsraS(Ru;-4ay8Hi0WJhLEMuo@< zSs7PCGE2y&?47;26)J_YH_2XQZz9>F>`nHy$F;e?bFK7zd4B(R-PiTG=e*B*&-0-y zos>1NQ=fC6Mma$xp@xzsQiqcoU*r^Qc3@QjYQJu>s2uRw=IZL|@TZY?F(n{;bm(%d zn-WPe2O08{C^SQmzYELr#8Y!txTimExS|G$Y1zgPRRQ(4Azl(%;5kMnR^un~=!d-k>(S~&4I82@A$T66INNd3 z;X~>S9}bs;`tnh|Luzg%)bQWFBI5fl39Cxs6x13D7`=FoEK3e)M$`)JkQBa0Qc3!jo?wYeg@7M~b 
z{Z^7NTWr0uLc&a=hKyCAsVHigB>f+j({3tNYv>e!nA1Fy5iHNP7zJV#11AEh;>II*SN)sI`mEWbWO?ivvEHmfOPOcb8A zpCq}#;hB1aDX-om#f4>) zuZQH-(%i)RFb@#*ZEen&81DcwPj@1cpNOYu+i(k?D*MQYpLZ##4AohawNN47pxnhJ+di-6L3Z?!h zTMmDVq!{+&3m+Np;4A5_KkWLtIp{XEC0)xOGqrWwB;S*O3y03H7I$QET@!v)nMyX@ zEn2-G(r`Ku^36G0@+DDE{rAuKCn4|(=!>uiZbf&~jlk|a!j|dvyDiS3PCy3&Ud;zy zrfjr$LEn@0_%$vNqU5QZAvAvad?80Xo!zX`E9ZMNbMSM+(ry@UtW|#Ktq8 z1gZ(qjghwEsps8hMk9>Vgy0cSiKzdA71*yYi9JV?Fx!5qub$OdD2T^(TNCWyU+KMmKAa)K^M>b7j6hr z_LQl8i))O5P#KGOTD^dX0}RjzCC8KQ8GOIm(Q<9J?(UJ~X~KEjD#GUx`HDgLHYc4Z zn2y!?sXC+FswD%8LkH3oDx?O5InBsAtQlhc9vUP8TFye#QN4bjWmenzj&8BJ-tG86 zu)4jvfQ=SK_`-ctM$TIUcT7vazR|6m&2!5CVB;rRUuxewsmZaIeSC?3zrucTOdIrY z++n$~Ox?l-6n|-B;-Wuw%L;`uL7&$%9>DuLIO121sD1o{oiAi*M$*ZfMnLY-nB^4| z2NEhIlr(^|zB^s%Hhy>e@{>!{Ls(ipI3P_do zz)iNW^n-j>&yAK<{s@M~o_vkRV|FRYMYa6t$v-};qex-3?YvX)dlu1d(;ImyY+^(y zGdJz>4t2{FXn#y`*K%QGuM!}!-hA2}BM}*3rtyJ}d7zAL(qqnOa6cI=4@rh7N_(hq z_{I$0kcb$?K`XS#VycvVEwC`D-@F3xTk*<@M$ifw0f7g4kMb*b%evJ$N|Kf1YUZf$~k$XCD@rp za&YbS8tF%bi&>>Bd9`x^$lTNjsIG22{s_P}A;ND<(NYM-nv-&0eBkUxYZ^bUbtiK% zbmAi`P(C%5LcJ84b*;8Kf1d)BVe9%nGdMcwm^uXX=ViiFY}ie01?LL3%xo*Itm#^p3GlB;V?vpmBNQ~ zu7s$>baLRwASLwLmTublnc0p=&^HlPHE(Fmx81h-<#E`{M`{@NbB2dAcDHMvUOzhX z{)cil6p=NLpzb`TLdwvw(!Nx`#ZNNi^VQcKtx+86Q#C5ExdNCLBlYB`hiOKSrYqtPV|HUV{{iBGMlaGStj-!%r1|RhWz0w%&a%&NG}(M*D;itgrtS^0Qp+=d`k9j1g)e$^E;Ii*bRg zgn{Fakv`%70K!Cs-()oWd6EG9lK~%a#Xs@^s2FW5zFgVC37$nVWet`lftj@ip-Oe* zOe0W_;-_`M*#Id%DUXcI8ZP1%<+eQ9N)_pzx=cf$I5a;%BHene_?0S6Yi}zFo3)x? 
zlq(e>w{1a#eu5T#>a_lLj}p_L29V!j;l3JrU9uei|3U@6NtB~Hw@eERE(j6Gkv(qK zxoJ$u1L#XbWHWdinE)8k%b@k+`gBDS?`x$_?oi`|x-SNPBcbg!+dkvDa0C6FH(X4~ zmzO16*5T9g9j@beCC_!?u@AXp>uj>x_%UOCebt+K$p+TEAsns0&M47qf7Z|o#z>$Z zCLaY+2@H|jcjX>m+o!3aXp92NEa`peUN3&@-|AZv5_reB=FmIAc}V54>9t<8bk8+ES3^Ocuvob_!l5@ZE3yBQKqwO9_5rV!-qAbMy#DhV1j_dV8npK zI#rBLq&U)t#Y262Ztksm=UFyks+I@P>wS4emhyZ~>=|jinxQV588`7lt!Jq-c{{v0 z_W*;0mdgr*Ef-d3@;~Ofj5MdL&qc+l=^TR^ut^jU{+)2e*ptgXNX{p^L6(2hLCszG z;ccXptU#h`sq{NHLeGPDGsz%Q2roXmAJ1E5VMH=c-zjJEZ}6c4AZr4DuJHa1(j@$~ z_klf1d>pMmls;)4a9(um#0L(!2cPOowp;YTuU8CzzB$G=$n`yF_t`OKgN)^r?GAoh zNDJDpaxS%MoD3Ae%=3Mrye*)4<<$N5&jUq9EF_?)cnbJb3H$tHVI%YWv}X>Hp#&sY z*ZkyA7Wqs+={`Bus@@&h^OHa9kbq!eZJX?rC={Pc7P=1>!^IaTn&e}rj=l$+FM?b0 zA}4Q!ih%>P1d@oAvbP`n_Yti?vNWT}GU4PRJlGQ=yf4~D`oW;fW8BWzZ64s zVlsP=->&bkb?xu+*8kFJ79h)LZF}-mWVPJgXNMeLNYF`Kd<V!p9~a;$+bX8-$z_xdNEn_?=bS;=YY-u^KZ6B-@O^oE@?S?0i2}*#BNxnZOJ`q!3Pgb0L*})ZV zzwzHkY*7OzoNJ1R{Hxr1d{11=J^|MSfeBkX;E&SCH4YU`=!f3X#iR~MjXKs(3#le2~3*olyRu!wPlfDiolKL0wc z>?aVcy$Mb})uZsB-j^M%M6&-!(G=bUM!LVCb+57eB+-bbJ}}Z8lAz@LpNE8q0@ivb z`}V{n@t!~t?637dy{EzbMrv({+9_-~+E%mAufnlpZ$iYwxM$w_}forsK? 
z|Lc$sKsQpEAV25~+M`A=j9VY30 z5{}J8{{DZ!lFb5A*fjPx%z=Ap7y7uoQ{f&G`89>!0f0^j1 zUdUsreXy`RqW_;*0RIN?*fQ=(o>+k0J8tE^Q`*n^`2GK*X!=3?JU!Pp#dDfEYVqCY zEHbiq-T(GNAXKq{wN7gp+Joy2SgSvh-|@T*sBHItJgn7Xoa(pi;~T<-RAD@34pOJN za2!A{byAha|2Kp!cY)F&k$!tR2mQYR%|185kbI${>i92qIAU_h1OS*<(EZ&1FyCKX z0(}G?docF+(mwKKbIP~Mtoj|=P5k8%t2UF88uetB^6=EuSM15jMM(~Md6l?QIsBh( z6cs8&6|1-2ij`z%InO5-xctFi4QP4{5|j9U^!N&B1Ij>H8oxpwz*i``Zb30XE3L3n z!tc$HlE)ab(-+m@CM(%(awp@fL&{=ZayPQ_%Gk~w2>OaD`1u5(vE6Cs-$z6Z7&;0f zltc#aV7m9+KgyScvNF%=)T90|Et={`mwFbrGxm@^qfQT_&!&~`&alOAgXYP5g4UT) zD0GoFj|9;bMiMUyxT(pPXC@D6cMsnesv&b-ibM2Sv42FY+k%d74i@0TswhJrC z?B9tysj%H1?sv@b^q~))2`D*jk`2IEwaIr=vhp?QTr{ zBz&T0bazmjjmP($;Wz#?;jfaa^&K*<9grX~(2zdX-D>;aX!K{HO0@mXA%IUnd5->~ zaUtT&2c3k(w=GWshQGW}uV*VbLBT@|P)smfa;b-u^n%$j$C)f$Z#Ah{5%%4jRTL88 z0i18~;{S={p^?C6bjM$v>9({BLBH!zDvINCBxA=vsNwH*vfISOZg$}zy|9MZhbLRo z!(T-+LWR)bfqWtvsXVfP5l-S{e>$o!iji@lB!GvMC{B9f*dDA*tXW?@chvUk%C--^`xj zR|fq#x_N`|vG$@W=Cbk?VT$LjCYwTPcy^;}Rw6}czbFhDveu+tCHCM;E)n#zDSjwC zL@$_?p!AS6k$|@ic(#}+R%Xpnwq^aux$6do1V1_|E|xyBh@zV=30z-uSh5|qelD250kFv7zXC?J5LtdR7VX=+D@^IY2lEE<+WJvRsPB$q23>L&|Ug@EfuQss;GTR{1PC*LQfrD!oH{g$(K z?Wvy_?rO`ws!GOsIo3UxrhBgK!C&UhnRN02BnSIjD<`@{e-UiU%)<3YP#<1Hm~+hy zO;@J_r$rnELrWclz0bc9bUQQnl@k!1`sIakdxwI;hQcIH8z9I;u?n8!Vn0!WLXHkH0L@kKfjaNTTOs+u%FfgX!S|snG|3K@|&RuBh&C+Wg zX7l@v;t-G_fqP8U|LAeenDR`mk}L%LCf-h67>nTR5XeKIe!z~?Z{6I~6?Gp7o+t~l zQEsG9JM_|MKxv)6Wh;6B1~YVO`3-b0@LOXtXXBR9j&DCBUOUY6k{aOZW<9jgk*RG zBqC=t{}&5HV01h+h|lg$$SYwKM>JZk=pYOjuGMdJ)){`s)yt8f>*VakWNHFr-dqV9 zuX9#g2Of*^DT1MQpe*@EnmTe(eRa*(!(Row3rxP%2#|3hdXv%jwu;_Csc@K$y2cOM zn=V01(c!e9b7Yb!c0&0?P};6hr7HSlDR?8PeRY%e%YOHZbH3RK8wK(gi5d|6uP&BM z1Q!e4d|O%+-4>6mCO{i`0 zW}4Q(OZ&yh`A?p!znfW#VZXL+UU^^f<0b|C5xEyh4{$p9u|Lb8+;wmcm3sEqsLr7r zLioW!;oL^<|3FPKSJ)EsB=679EH`87MIq`;8$S)4n55AhLb>&tw9+9<;MQwf;5uP+ zsxOunJSdvS#h%7>64JR?8m$h<%1bS4bDxDrgnqYRt6VVZao4AssT(5cZ>|*2#>HW;L;2J$C?a%1^DvMC{Gv97MJ6gdo4 z*wcC*9Hxi4r6_s;>BMeMM>-D-7cZbV*jM?Vn1cc8Gri<@Xn|I-9Uix-RTK7d)raqL 
z@OUm^vDh#3#6Av5@2hO2ER7tlC3c1;wS1sKt;3kA666|*`no8$0{uwi(K00wgt@_F zY4SqhdN4Pxx+o2)K;#=AnyP2QZ#Q035C?MtUl`rb5t`c(8+t5Fz@}59)tx((lSMU~Bl$g-OL@9?m2X>u616;5+m&;vMJ4t&^6oe5LsN3O18!5m z_pE0gst1XHfum+p4Q#6N@~Re&$avH3ed(o#UhMKRsg_$Ehf`zDx#OQ7olTHK-r^$T z28lxBrEK#`n5-(pd3t}o=Vp4y7VYSiIN#Q`WkvTyS!VOHq4C}-f0;z^&BatmPm;r; zP_zh(vsVI)1`qqasF1ALmzxHQOyw zK|^)%C`EY}b?Agq`ILW>Tm9&;G$8EV$s^Cw%ytwYzA|wQxX;}6FNqh?f)|_bQl6+f@h9`jKka-;UGtA^0mCB3 z0&#=v#-Pg4M&D8^h_URkQaqK_L=(5yE^*|O9)8Fc9(v)|9M=6|YC#;tcqKLsI^>Sj ziJd=`ekO1r$1z`0_pM!R5TRr#mn7W3ZshQ%NA_ezMstAPQK{15Fek7{_v{`ZcMWq( zL3E_A`f9EG$(amck{XMr19#MSI?Q{e@|v^@w@2pBzi<#%7xJoRPSi62mn97pMo@V9 zs*~y|M4fDn)5}*Vx)0ltvq?l_fBpcY7Nyx1$cbL40JarFBG;8)UU<^q(QlAA8(&|w zpCTha)ZK@?w8yBwz$kcwidDu&MG7-6h#>vWM=S>i{UA(q!nlmgSqoW#dteM4Ryztg zQ#Mb`u0ajaK9;Kko5;#$`(ba zG%9=m5jPFeM_T+Ls0mYWYaTMv6A4jA`Xt*R7*%e(-*z`n(0Q#?kq;I(u&K10ZB;5N zRMI3f9DNRS43Sa34c>}>#!`N{&4TsGiK0P7P(#p0{$7+E*-nggd*~{73UqV8g-X`? zo%_H2oP@p?29RjBUqU$%5<{Y3+o$R?74C{^WWQ_empMrk6%tFPnnF^$3o5`+1G$)ah=m+G`!JMkftR?sL6e#%d zFZ*|>wbn2juz%B>2|zRZhab%Kj)*@NK=lTV(OFc5SHW;{dV{~qiFyV2U`o>Mv%mQh%n}$Rz(n5Uf6d<- zNsFpfx@IrNkQk&Wxe<>_f(8$d?`%ueKY4bvp9moTa6ac};{_P9nI@1>-&)pelcR*O z`m-oj!W2^;mFkI^L1-x!Bd*3(e@ZZQHDY-(R+eF3N&JuUNGRm+Uta#od$ykV^dSYG| zx{LieBAG6+6LDL190Ltr*Y;R30A;zu(0CcCAQm6aJBgJQ1z(5_^(GA^!SHpC#d~e=!ndqQRb0s3fzhUCa#WF8*Z3Zz*&1lZ%-kAg{+NEM*3fTH)uM6SF7KVKB02M%YH)1M3D#M`6Qpq$8*J8O=5%VFYdU$-Jo+FB~dE=b3$M1~9#!ORB%<9t&wdIm^aD z=S!2Acc5)!3)BsfJ?D)W;0lhlT^2Q4J2uSe4>U}Pde^>&RK=b=sA=_(s(xIHjajIK zxyc;vOwyJz@C17dpRP`%tA2}rf{vN$c}db z|DnZO?9V0?w70Qg#9VXB0WTb!Bl}(6OUY(vEE{nAdFq%702J#{4~te;0bwu*zEq+c zg-YPSsVZH4YgCb=SZtOKhK>6rL{CnmPnJS@N+yynP#~{HD-wKq?S>r4I!j7f%hR*6 zB!Ym4&K<&;lC(8aA6n>KXcYNtzY5Js9FPrJ3Vl!_YGszL zB|f~XKZm>M1O>tuvgtfVIwLPe?!M4_l`}Dedvoy1BZppw!`wJh6d;5s24RQ09=KE~k&-@qi0#N7^U_1{q9&DfQWJ>HDt|focIe`WT8Pm_^!}3Ar zB`eJjiS)O;MN2~copV9wyGn#2ed}E{@ZbBW$~)+Tp9p~)$@ug@0YL0g>d(P{IaqA> zU~0j;jfRt&`v@Z~e#uqJMHY}zEOv>-tBwEFV3;Vf=h^pq<9=6;kG6adn$Aq?&h-u- 
zW_F6spu*1;UB@}Ojv9y6rFSzxPG`?w$`UGB4{XkVGkK(MzZ65-g-Vq8-zCgC*Dp)s zG}{elRc2|dgfA>q^F7^wCjQm#gN90CLgfuEi~ymPGg`8>xDTM31786@d1$nHGi{ss zF~##Ix@ckBOVCh9rgBY>xCb+ZG4_LjE)ry91yZEp=BpbSsc7QY4$uiL#pVEcKaZ8V1Nl{CD< zI?=$om7^eAnqC6g7sIyVOw>F3^AYWVc=*#Tgp}p8Z*R?SOf4~}X2zEc?xqcA2Srh+ zua~f3;n`0L2c|au)%N>@(U+vnGC7hM*>%+%I{CJ|IEQS@b)$N)KR|1e?k`C=GFsV> zsUqX|GDzwk-M_?JL3qbNJZUW3eC}!7^xcE?%l|VTyF_nd%Sj>in89Ip3LKI2q-3qlIHqF)7sja&1Y-RbJG6f zIYBK@cNZylZ*5Oe_7QNJ74Y|2uJI33&hL0eyruIKU++s0AQjlDeZCIk*I-KcdO7Di z;cDr;)PzS4$V}uvIGFv~*GJlfz~5Z{(QF(jOiz=@_U= z*sRM`6saScw(hKa5#wXzTYXdQK{x08h}#stzQEM%=)k?WLFaDfw9b{A(CTn7|Feg8 zC;=)n_eOC@PM5~bGm6BZG>V))u#}6hJY>RgRFDz*NLp;Fi;mQ_9%V(-%K?!!YugAo z+Hh3ahWzjaA7^b#4CO<10qaKFqA0y^IK1$5{+t=a9rVOteJx0&h8_uP*xKC&588VN z{h$yQv{DOzs9*DT!tr1Er$w5(F=zOPld9dTU6%j)xTFGh- zAk8{Dom>zm{uAYBxUC&YM566M95D6^<4>n~4ei2|AmXuI{n`~f*^g2%IU%@H5V0@-hCVB2DR;b#ahYzwUnN{7RbNXn>$6`@{3|A^ z!?)Y9bTNLW@LI>VA6x+iUu&yhkQFNg>{9rFU>2BZqh?j`T9!S%*P;PsaT;G9PSDtl zk2`m9t=HQT_IwyqhTny*+;+x~l9B@9KXSK^+!AZ>Q8x2&oIZu3u~hLPFF{i1D~2H> zPzGp+;A7BfkPQFCzE7_5ax~L|-PmD8mSU&)3SdiZlbf_12?W*@qi})r<+-aMs_81L z&%1Jbm)l%&p;&$G>E5{mlX;a8QOGqPha7!5vV=oIC`vCK1~v^AuLBpg<8BZjVIhYK zxF2fIL9mjKeiuFP>8l{DOUyGzE|Mg2Gc{+y-!GCZa;c#`f-34S9Q9BEI6ahFKwA_T z^_YqtKo;MaL2^3x2@JT;d(cO(p>!e=2Jy;vatFhCn(}tnY&|>$H-`&w=5~5YhI0kC zwbv)DU5NxXx46d6&z7u4WZ*S-ubC9|JU;YU_MmS?0o5(3xKN8y_vgiZbp_4cw;M~1GOiEo z=4=su3*m4xY#DH&Os{nOf%~#Y!Q`({Oy>Y5mPM!^b~D{ZQPkxcrgF7fUXY3b#3$F&^l$bq0S!am#XerJ~7(>8XwgU;a%6 zxQss_XH{J`L&5Yt6VkiZi!u>^5_HI$@u;P{?HQg?Te~m!3_t}JBVDT=WE_0n~CvACc&is`H3Z;vfkn&lh}6s%TOcNFkEr> z9`}@cGea6()aG~Y?K-rwi@S5_ET20ragjfhT^ROpu`}!z+%2c{SW{g}FBprofeiqs zNbnNHQj6#3j<`MS9vHTMaNkZFHudHGz`^j^69sVZijq)gCrF7ZCgdXT&H`);rN-`C zw$(Uawre&(D3JTyGUa0F#QQ-?_3dQ#N@2>?76rJ3-Z67D=~IqIH(yM&ECeFr-G;B= za(G`gWE#jm%(E6B4W22J`|*%~I%qDd;ZZIv^j5U;?(K)Y0OAo7JD#prN_TXcHFDp? 
zrX%V|%LrU%6Vh=g5ieC9P4iESmEHON{)b4EW7#Tvft_Wib|J3W^|@97oRasZ6C%u& zHAm`rj%Bj6T%U+@k40-NK7lmb+C)+61rYSnBt`N9EhM2Ga+7(6JWp)r3o@fmZgf&FZRTD%s<{12O0g zmw%mN2<{@HYuUWRVj#3vj|{i#5s!G;skOhA8-x^XVEeb%gegTStfdSGu@Y61zoG>i zBRq`;c2$Oj=Qm2)3K6sDgJj;$OCuA3Mes^jl7Wp-1ba^8lcAiTGHO1(oi<4w;$sP! z=pvvJ`y+~I9hnn5KbPh%V=?v^E#GY@4n0z#fu!pjlh!|GllIh5_~5l=HH%3s% zGqge5sRD58-cn|R^%y;OH@MTYcuKKC%oTABn`Nts@(CW|x87$I%hHkTl^oc-~F%w#+;SX>inQ|j2T+x%yHn*Z+Ig1}LNiifP(U)l+- z4GvtO*vJ=~vl<%6=qE=?#;fMRjlAJyTNmc-#`UaWxx8I@1G=T16b%Pq?hPE4)Kr(; zoIj>T^CCcBWw0LT6{ZRUyK>US9kKRZ46hds8FrOx9`l=vL5kxm@G^vQ{*1=%b_3Tt z+V=v(^wsV!Dc1`Mob$)ZTsXF2TBt&iQE+IDSWjKzt5yUmL^w7V+|QU6VnuD^eW%hvOb>25tp;z!|dxla1nqBx5P zAhsJbu=KWDLPVIYaSu3o&lq|WJXui%O&q4qx>Z(yh#&&$K*Z$fFSFiNwQG+X{@yCh z(0UIU2<=mm9d&nj8iCL3FlRK;v(1v>Y;U(QU6%K#$Z>bBeweOoxx(FKyW)M2rvNH#4fTo@^{6XG7D8H}7Jb3YbLPI?37 z7G-!h80Pt0w=pS5&vgMi-*K z8G)GTN4N49=`3FiYx&UPfA#5ILew+h)pJ(9=id^c#bLZ|yHA&WJ-fw`e%KO z`FU={!S(yF{v1J3s`Gj8p`hO?sH|6F1;*y zKfA+Csh*>d>!t@##;0e7ezvJT_bAV@^1HQfbWTsNN{e3Lwr%q*Ii4*hyud#99o%P> z1Fkoo30!8Fkcn)4H_IBALX`5Tf|c~lAH-u&4eg>3!3q#$Y-dsN%~RcaNny{6-aOO5 z$fQ0bjxtJKx`;#>mWDn`K;pv(JyMgSR^=I+jR6{+vp;7Nru)<%O*U4<|5%(HzSf=f zG%uAbPjcFsAcResD={sVYwu@jc1h(Bqm3`sdi*_~vG3u;6`;I~Vy;Fu2vr$FSks zJ3GSnu0Jt?0Z4Y^sIDFIg{D;bq-e?00F9vYaz#AE^G#`Yer+`}7dqo8bE#U4)*7^O z&kdHal$mVHd(UxScimYmYZc7en(&>mx2&soKl@p-gv+E(jP-CmBr@}Z{&9Dh%TZ5f zRmOs=Y7t+2sRY1jcyQr}!MQawwOUj+bkg8w=_NXu=+eZT$Nhya_iTnWV20}kZ^im5 zEZK~_Eqp#Ai0RjgE^}!d>qe-e1LOIRMD%hOe-Ip!Y>&WmH-Y$UrC+%w_RpCK4B5nq z%Ziy-51*i*GpFMfbqJbI=E^Jcuq$-<-R3)C3v3fLSA|_K#}=*qN>SA-aNJ5y zic8)#FtN`NdnWf4^@tb2!Q33NE)v5wHa8XCynfw+_!Z?S9r5rQIF|yN>MnU&Ut)Eb z*P+75_bA`Fv-VjmL^|}PeVimUX82$={Y!bSDW*#t8ZLtt5YR^@#ZW8ujVXfd+(+87 z@}wQQv9K7g(TXTnehSIbk7%E_DqJ{;r>pwjeV8Ui*cZQ<3A!D-t^@bWzVwR*aUFD} z9|{vB6C;?ujqpPYc)zZ%_9d!_!f-%t=Dh1T_k|(TYVYwB%jzGNwGIm%GCBTzGU(J5 zy1K7wVyQL?UsScGI5UFe(3cv0MLay8o>%36Y%$&($K$Pqi_ZU5@fpX^ngiztn2F&f 
z_eGwgU6E{+1okm4G{+%7gBnf?Xx^$X{_YS1c7f%UTXr12Mi42San@)$8{N&$gK-~GZ@?f!`a&s~vgQdR>E*)DoVk@ydTrcneSE2g*~$^cf$4OHI6v*{?G9$A;?dhUD^} zxiX)>R(+el-Y@~n?rl!6322-8)zucIz7UWVq}}|s-@>Yzn#Xl1oaS+awo3G4p``w= zQ0=DhZ8EvAaCKu~z z^U(YwJ8R7}p~J3tz(VKp&Km-*S5In64#LrR@q?Z$=f z{;a9e+k(JzSmr<%gimHgMRYl;lB|R<{Y3O-8~uuP4P~5S=UKV4`8t(V$cy zVr_7F<%62DK>uX9hXy0p14DjT#3Wln!YF-YGxg2~)I&S8k2a*SM{HS{sg;cebGbXz z#z;r40<`J*keM7Y-UMbM3YYw(?yPw7(_CQ{S3e$TD+k@i@KE5AbnAmc47%0JemokI z>?J`7AK&-zi$~Unf3wfmKF)4X?F-SeEJ|HdL}yS{pOlZ4JsFyxr@VUgo3e*B?`?nJ z5#_Vq5*9vWP(Crpe0@sy@~H2(Y0)me?%qx_@zD?LOA}_UHI!R2HB&DWjO$`dm!|oO zJU=%F55(Iv;V5vWtMOV2v5nO*x+t-S&cVKwY-MuT#cUUv`k00Z44x-IbS?!qO}{P= zCeLHy+DbyChnF!bHme0rru_WKt7a&1OT~lH<@cJ?55!JTy9}E^lsKDi{dM|B@#go| zmX_D<=ol<0h8ECLMqUHV<|8-=3`FUr%o`7#U3qg~7C%gxVdr^Q`aOb2jO0?TwehD- zh4<}I?@Q-V$;Oj9;m%3zF_BuAkGUN^D58QRNr6PL#d9>k@$p&vriR+3XBiamH2i>< zhL}*a-oF^zuC-K1NLlr-u#qF9_lj3_7=H&a5jdJ{kyUDMUA9)!TK6_Vi=~`By^anw_aE@@VIv(@Jmg_v$D{Z@e znKpF~eiXMcIG$ek##}c+eAR0C zURdtPod>nH5`v2ex#_lvwDiJ)_TuWfR)IGiGv%wPeM}PP8o6*>fdXDpZVS*3T<*Wo?%?iFrwRLMFP+ep^)!g_QZXcp(ps~S zm%D$)^49McsUZs`iD8@BkxTCx?t4+EHO5~*e<)W-UDJ2JTtB0+vor6(y?gia$)(=J zAbpNE1V9EJI&mBcMzI7{;#1o@nC-SZ>n0wPvwLKyPG57@9Z>|E6gNP8R{NSM|9~4 z4R&WDJe@rZZnUvqzk0|P_$|b&J$&1UA2C2tP|IG$X91dZ{>2HJRW(M|>~6}#LUfMY zgTSLv;*%*>U*0C&8$L2>E@pMYvWcDRj*;OXuQhuuiO#Yu-JLV$ocR35c|e-+Qn#P# z6@7+g^h;F*1gdTjM=MBR|;ialuIkehpj@p)!r7%wT;_2;6A0SdRs_r-doXlzCOkD zADnLPN54Mjl_f5XmX5F$%sy$n^&W$g#en^@AGP&XJ>094fPb|osxvQ~WO`(4R4~N* z73D`IHCm%_#d*C257v6Aw{Ps9K6|DE8H*zjg3V6(0=P`}=Vb}H0<3YDaoUpg&%LB3ALhlJW4k>|5*h>|7AC!g*|M^1gfuJG z(=8pvW4N%%o5KE*@#%z&TiZMzaqkU8CZOGD8;LL{g}S-9%|3trJRs*s%K4UYu`RvoXhCTgjXUY*+~ z`zr%g&4cs)049Ob0?B@-PM`viabKhJ4y#Fyaqw5stp3)7=t#lm3KaqvUkjpM;W{v1 zqO#YJ*-nZ_Y+K96+lJpD-QB+VaO9^^s%h9X91f`snHLG$lFb{%P3@4#iYQDox5-N? 
zw3mktBs}B2pX+v+bB%IPv-?X`=5p9<=&Yw1IbBGmUhW-Nr%G`XQLjy!z@M?>ZChc` z^EW#$9I86&qsGKP>SdJ`<_fq!tJ~ z^STgW*9W|1D<8|W&(x}1Et5NMBE|hgUVk#I;ud;L;Y3(@;?V)Bv4FLFA_Fo2Zd=&- z+d-w&s$obXPcu_( z!9CP@mOtHXsv}3g0O=i+SxIH0k7mz2v!5pHxY3o#qCY#8W92qVzdJJ=ot$U_yp&9X zRbILIhfb!}cVZcK9hoi9rR>bA!TrYivsbezc0&Bq<(jnWS0=v+^JI>EnNi76X`=8U zu!baVTE>rWjc?6!XR~3#B@28TainiUpv!xtgZYHZXP7ja!cCgC z3qEVD{dko9==tKJc%M&}qrzMu&C78?cGa9)?ArAg#iMSONXb2Y) z`6*wZtK1+Q;WHKIx4BCMv!D6JcI!aXo6A8%B4}%WMtI&96WpAj=bV?}kC;(!;D6{T zvAaGd+KMikTyo2rJ34{rYkz^YST0l7b#%_%r3ROP9t{JE9%b+{nW=cQ4?C3|y%P@m z?&dgQZTq)aWZ*m^cqJ!AF72#X#?FopPV!F`t_k19SQ?}BiCGwzR%}t;T#diY$n~Yk zOO2{=(}8{+u>ulS8rP4KV8!y~iLyQ=oMz7K!CWWtM|_(Ra1XlAdEXl^_C_R+F@`YqSa*TcM7MbAeR~Sn$WHo=N?mpNw^y)TLPqYdC%U zP(&;4!FWl#^4;yF%ItQ|u?}pCY}5SF8o$n5)=S$Z=r2}R+Jv)uWAB|b`{D_hoNZXN7;W(;vdq09fz9==0wZ58bs+4yPWFiY}hAAJ47%Dbv zYz(?iFnbklu+CaeeP>CD(yj@8xSgqsUpB$6H1P5ri}nMnVciPrUw4SITr{sGb(hYK zm|N$Mc9B%x#vA9z+Jz~UfLvx2Ca%VItP|IgQ9scvtxX_7`ygxUd(#FGK-YSN7~w9z z8x*Xh-g@J4<-%c-1H&#F1jrQmiISLSPzh*!t;$TkSOycpx-rGrj~o(Al&nHYn;Ir_ z@5{=|EeelHAEFhRcDCMZ+p_CQSOVIvm*cTD@08-sxM6MxFcFE3-$Ekj$j9Z~Un7lU ze|?2ceKBs~;`>bwzhtJt+YKSCLz{6%Y{`rGa$I@s`YI}huId_(5+t4_`;^J-&{PhR zenXdT_14)}h=y`o=V!>oX zTMf^=EB@X6@u7wY(=Fa#R10VSim@iK|K#K_sxLrafwbSEK>ka&pA=!C!b95vOAkWs z&aqzkal`_4ugYJPX6YdCwm2p}nL2v$lF)auY%k}xND}`VtJ(TAS`*fqIdOrMln!aX zL6U_rs7aMIr!%%aMx)~$0e4y6XhT%Jj#KO|A$7eN;|CBk2vYcMhUDlxxl{X*J!Os? 
z&v?Dc;%swT8FVB@Czo9y&z396qq6Boe1Bv{&4@voNkuWz{+gRxAYg=I@;_P(yz}fFNT7Gj-MSL>7@8bC z7dsTl84;iKBVI9a4YcY#Dv@Fei~nN%jaG@rCPbIqdbvMXR~Xmqi=2o6Y$p2Ds9VGBiU-58Vy_GhRUNcYS{UweDS)b=S;0@B5x}_I~!WpS?G| zK-Rlb980AcY#86m?ZJ{1{pmreODzB|PBae#>ongxl@A@j67d?E?!R7Z&$lXX2NA#2 z>Nzp?xAo$Gwrsy#OJRofim{X5Hx3q250Xc9;#ip&RzR!HJm>RUK_~L4$+S-xIE@DN zgK<=-oU23R%Y_#D5AF6CKEh*hRr&r3sG=nV?z%8+p#Tei0Y?gN_kNJOy6y60LbMHG^T5RpjCf(zO$#8V1tx zVQW+VJE#(McsHw65{-rPhF4h*y(U&5V0 zS)R5_lk06H8h)O>@SD>bRc*9ON$|^M>%Vk$PIw1KZTCS!;gHFqV023>YuDLbBJoEV z&W&1|rE09qQAIn6i+ok$;dsQm+z&z9bqTW7Oc;1NIy!!JqS?fjDnoR(fJyY5v-p=j zWtTr}@x-Iaz)dKQNdY{|PwLF7747opfRSRBKh11GP?w3axL@NAwcTh`U!~|GXjG$n za3ev7MxEF99Ij>Zph3Ivi>aiv-*VSP%l&yolmx(21gj#0!1Ap^gUzJ6Lnwofi_E)+ zPcCB^UB3ok1pMg-`(4w?63kR>lmG_jE<67aH|#2fW5NGss3km)vy&WMkQ&(aa-R76 zg#Mp+xPL^@7G)&OGbzjQ$l$uUg=v4U+vVs{8vpvhuDP}Kpw(5Gj6`W^B>o6!%NhiL0%bo_9Tk)>0r z`#54BJQIj<4WC9SHs<{xPRZe(cr2<2$m?BjzyT;&mQc(fe}4rZMm{t?D!^wjec(l8 zEoOwIoL)&>8RrD*4XxoJyt>&|gV<1>#X^bm`Osy;*2()$fatGsUyb041kf?hNff`{ z<;)*tXeEba&}GM1p<1SjR-hgvZvrhjnl=ih@=}3|3S-&rAjR(9S%B+`ItZn(CNzCk zUJb}Tl{b?;6eVQfg!@3@W^b?SjxPLsxtaYi?NvBlD8HXn4~T+mvl2Cm7*_j*N9!kf;kA5;qDq!Cd$ey9h$<6 z8w)>Pq6A3*jO+cWvifDG2$~Z1H9V$D?3`_+c++s_SGLDmbY?rXORyRTj&k-Je#jpDx;IzzMaL}t^z}0xI=Sj3Y1l+S6)jcSk;IN5 zr*xTQSDt%159OcafG~LKTI(O=%!U}dWM1MJ832YY*uUnvJew%ycV4ld@g8n=Vslj3 zy?D=|&(lU)))B>X@FU{PQY#?hhP#nqQR0w=Hj_N6`RGyR>w1c`GTaa5(2VHJv1|AO zm`6Pkr-CabWjU>zdt?(?10#bsVh!eo4IP72XD^@u)G{;cxbf>iOX=K;@Ib2ypW{gX96cK9QkunAzQPpyQvBBCxtsDqUa4@N z{>EyEa;uNP`cN2Rqmj`fZJKf#Yy=E-&80c^PQ_wI?byn zVvV5Gk#c-vM#)%s!tGuhylp=ars-FF>PkP;k3JA}?8Z9`mv-4K@;uw~lX}CgV8;ND zC?j*$fs!xBYd9Imm1$RJ=w{!q=uNjTGqYpblg|w_#Y3dlSqs)}d^{!zakCbuzpS4QC7jSoX&jGCI z2UjX=MGlap3RZ0*M?pqLE^lpaHlgs%nr{$@<;|tmzs!@%;vMM0=|`CDEQ+PFHpWEK zfD9obwyzZDbE&cJMT(gJhZ6Zm?w!`BDppFYaDCk}ZGXsMV>@x{n#c-ee9qIzmFG1){7Q}6DB4`7w0VsToirPwqI@mO60>v1}A{E6Gw_o0mrQU?3FCea+>2?6f8k%o5KwBak>K?XSY( z#GCjM4@8!;}2O&6Br77Y8N{;orh}ZIHlB0mThf06_dQB zIgjP*IEcJ5g2m3dc2K8;qzhaLtSBp}a|&BehR#SaHfJ*UNVMc!hP6iWi<5s$IdAQ1 
z(wsZTF-T7eZPl&C32GR%3XxrHVH0E!yCSkqr@Z#E8n{j|xIhk2sy@N1;^}$L1sSOa z3KNu`^%;mz7uhAXDSi_vx`jEtMJnPnaJ zcI_iITca;nYNt(10f~mej9M)_5$=pB?x}_3_)Y7FfgC}H8FRd|%Y8l^WuI`@eoK13 z)hC+JY)*vN7;YvV#w7o<+(;ek>BySJrq9u<#bid~VMXs3-6$vh@cth=WQ(vvCpUDD z&?q-Pcu9J;>+S|Up~~1xL`#esJdsb)ovqI$h<6MUuNwY$$m6jzNLq8R{Lp%!Gp zo1gMEH>=(|Q|65_ZJD|&)OV`&xnSA&s8O%ob_X_zLPfpa?YaJ{NlQ(60SptvY)J=~ zY#tHib@>dxY{eJxn`Gm)I`g|x$|wS)U7*HB3i$6Y znLJ9}_YFr&W(KGBt6{Tr&=REnPGLIecJU3T{w0OXg>=l;!V44CkaDBEYm737)@=dL zP6?H(oNS~=fjJ^dEJse3TpdcJhPXd`M9xXID6H=N|+GEvZI?ahN~;Fc7-ud zwe3wB0wBdvCjXB`i+u=irK96L>etz6**Wu!zk5f2qNjvdA6bzM>Lz6sshDaj$Lv_? z(MM2Tp1*5I?g&Ku9D#^EZ;V&V1&0YZhyO9Vgv{06GC}DhNx^BttTSUPc&xUpv<7=> z9Is9SP>3B_Q?)%20w~kCC!`;ra_Xn{YG`2}OA_mvZ4P8_PPPZ@SQd_ooi>Onl(=ib z&KCYf>J>E*?Jb{Yc5{(suI4eGZW{+W8Io{&6Ra_W=i~1*o%G7Ykg2$PBv1@jNk48Z{j^ZK%LtxLqXs%)pcP zDUPzC1X&7q=cR}X^;+&8>c#9K32hd^L*$hSq6qKJNv2ZA_nbtM0> zGJmQqouct2`!eNRDDHNmU#k z9>+&0a_8{_AxW4FYh7P=n(Ei#LRWoxPPEaZrk0kD{Rj0C%t9^wA8EDeRSi=PeEksnK=Z6xHI|b^4>KcR8H;c zg?0dRD&yL4*~q`=$E_Y=K=Jv0TU1WI4rl|buA3`FN6LC#6Febi`o`etVA^qsxL9B0YI z5d+ir#hq~p>EE$EvK~BpEGz>1uox&7|DEL3FIE^-8nraU|DN(dpc?Cv>j)oY^LBb_q2J)b>s%POx9_-)wt>IF1UjWVY5HDn2hG=7dANTebUC3u} zjE`9Qt}?3|fek;%{S1s2+N)XM)}m#cyDyi7x88X=N+xRnHQ^_apm+MFPYp2SpmcAc@C!H2#B&QlPHU{9qEW{4rekRO855j)Wdtnsql1x|a zO_>@yK+4mlq~Tzvvp6sBICXrVaZC5Tyz;#|@4)kdCKR|CWy%#Z zTS>2EVU-aLT{2;_i#?4o?-gjJv+-%;PZwc~8AptO6?7Be@3SU;+{zf1hsj~mq6}2F zZhiaD8~HBMIcCHe$ai;5Xp}YT%;-7vLVg5UiZ*+SR#4%EqG; z$|u3q{)wm;5w3XE_}TvtNE^l|C2+5eCWxc=|-E_-B;pxZ0EhK?7eeX0)G7$-i$ z=85mJOwzvzf3NN^B~%7cGVUj|`=>(+r5PBJ=I&5|R>0O<(C7E_&Xz2MB8Benx~2Oi zTXte*ksI(#sO#3C$EUssQs{NF2h!c>pO(2smKy#ODzXPMz;h?@$~efSRi`l^_~sNF7~c&_f=fpo*zABjt! 
zR}qS$ zc!V}zuZLRX&%+!nnb03a4B-+<8FJ2*^{FE*9A$nel&c{KM7NT#K z1ClE#mZQ8y`%z>tn^!o13bwB{%`@|-S7$w5B;UG}297A=Hlw^h(>0`g>YT*xpS7TB@n9@0tG>gKkXGt z7HUAqMd#x_0$s#9LZHYy7_D}dr^mr(wtF}8;W{g|atst_d3))_5R+DzxALy6V(!=+ zf7Yq9d*f4L*oO?6niU{;(@VGjO;|HLy=zoHlP=xNGLLm-rr5YkcR6$JMP{jLs|T59 zM7yr05*C~E;8wfm9G1uTN&GYv-+vKDkHc5#T#r!tyg0Y(haL#VLPh*lADet4S170l zn#If!84Ld#d3{Xtia|jby8>heR|-=W!5{al4xcU{vyQYSZd_`+(*ixUz}l_?+iA5U_XbjwMGp=S3oe-mx746Pg)wiU0}@ zqj}aXJtzMRPSXHXw=th(iQl9@vbG&7vbCMG$vNz%KsKGV z96U-5uZ2~912p-2e&ug&4>84qMLyMx^o@E!yPaU#Rt4NDC8(J+Qr5dCgz^Siq~+GNJTMWm7sf-| zVb~g)R}88U2FOG!NJJ_40X@a$gkd7|qc#-C^V&lmzJEa`@C5J{B_`w9uVe_jn*{}X zCLiUrJQ?cNGwT1DDOaGdo&7V_6En8O)U0sAw8z5c5R4v`qHbl|M!~BkVgq z3kB}^-SniN+d)5Azt9gqAX?%Lg#PHUsvLFs&KHXHYkBgb5 zWsx~02Nltj|V{LWNaG15CrSstxx@$@E%)r38xZ_fmFBe6WKLgwWjv*q0HzUgkx zMF;>Z73ud`pUCcbf$RZ2XpuMx%>Lv2+nf0$G~C;e%)`SA`3UwxSd`bhIgHMQ{%Q$$ z>G?!9LVHW{O8WfH3)TvvJ^CTF{TE%40Vv{=hVSuN-5Sf`20Z|BV$TF3LlX;BUlN6C zOHMWPvDQ?X9=!2(XE+N(3}685p6?SGWTuR2!k_ zg9+ze#H`#Fx^R_xy498r5Q~Aji7}o*+su?7B^QRUyYPpg?3W#qe$uoex4Mf&SJv=B z(u%XKc%l`mB;bSZmIoZkegI@L)N~xPszjo#^XdQXna}OjLUJ7t=~j7 z@N97B$rpN)^YG`?LywuFdt@muurjDKa0Z*YOXD_x&k0PrlU6t4R*E@XBH2x zD)AH~b#DW5<7TEhP~$dWL<109#GC=_aApylU4MgEsjG~{7*Q^CLr!ZrcxV6%M%M&NAQc-Z1kJIP4?al%6&-deqxQ zOwO1cskHhm8zRmcEdMmA#40{^Q12bt`BLV*oS)S_rs!frmN+1Llw0kMai_1yWDI&Q zz(Oi>#mjkl1%WVP1Wl9+8SQPUmn4A9bwv-GTE!UZ2Y_?JjGGjL64AWjMPh$s7H9_TsCZF9i9LotN-bl-S*dwVZ zA&VV%j$&`3Y!F{@yO9!8pwfn z69eM`sV8}aB(goq=BV+0eWLn7iXfRO+_5#oE(c$2px*F|@4$~?k#)>;NWz9SDbJ4P zazo$RcmXy)pN(>ukqn5if>R*wfUa|EnP=Pe)jglmVlocX&XkHF1N)@e zb-^ndfiWeU#D!Jpq8o&dbSwhOlq+EVV$l0r5@x@Wht+|Y0bK(b7IVNM$ zD+~{_>-FILRd465ayV1j*Y>|ComEz6kr}=FIucTSosDxju zKYpDjY)c^-`CQTkmE{foPbWLd3pmVhes(1XevZ=X3L5*luxj~LA0{egCSL7zX*DTrKZ0b-2`!pg!_;dM^Q zv0}i(W#zSKa=S{jl%bVmW&RkkV|hV)6;5p+gUws3`z885GFsFf>-I~m^=WPt&l1WC zi9or+)g0(#=ZAtUt0kKpWj4E0y);f=ekn_i@^wveoZZ*ii!m(I>hqVAZ&o%+tUu#o znh~VftbF^}_)PQX!q~HzSP7$T0rCuo*BSKoq&2d~)V&9M*+S+o$BH-zqt!@r;Op<} z?!5$APATcRmN`k>Eq>S8Kvus@wmrDe^)G4w{y=(rzMN>i`RIIJj 
z38LTI7nYPo$WIhi2U+Juk`s(`ACz1I=w3oO5tr;&vg<%jls+dUWh@|i!!g?`rIAWG zQ2w7pfAq1Z&rTA0-*Cly54f}g5jfV2($kr#-3QcKzV^OFvZ#x&#q>SfQCWCy^!7!9 zASO}ujfzMN_R;W86tvqP?kkeA&|gH(w$Cy%R@Yxz?8bF~pQ2Q3F>R#wRQ>o=(KXMs zcR;NdQP7e)nBvf%U^bk~5ydt8 z_}0t9{H*m@w)GG7nl3*g^b>7Gt4iYzZcHLuJa-k5GQ-|ssSqMls>Pobd+G~awc1TD zZ}*#jGqILF5KTXL?B40o9?~p3k*Tc0V$iNTR2g_zdYSpz=X3j=PlcDEDHOVO*!&Tb zh1NO-fjkvAWl9uHrO)iv9``_;vdoXjr#FO>OLbQbv^grIYdb4~m0dk? ziDe+IYckbfUjPzN|KQ-=0%-iOom6`1y^OJf=-AaB^59VRFx57ocgI>t+B=1wO|Dof z&TW*xuinJR=?YOY$(?*bG|B9?8N3B+D`4NA1)7Iy&w(Lp&ApqMkUFGr*SixRGca%k z7spmM-MUVye%w`5px81gz+AoZuuh2h52hNj36xV0ZfF+ES#@_mvDplR9v7qdxO{da zY`Zj46EsO22#$JjuV1AC<`6Y1T|Dn1Aa7UX*r+=YvHmhepc^lIC^|?v8P?}=ok1%z zsL^&E@??GY&Fg;j98=bn{qAGK)4;NHj6J0vBcx@W;&bx@)EYuKH5pDv`-;VeYc8GG zqktZ9kINAEdHvJlwK0r4b8H#tPmIdeXKO#n!1A3ITPJ5%HXpSgy;hNk=1Bw6jK`+W zS*4HIf?HjIxPR$bCMjKOwRc$(#<)IWA$bL>t|8fhRgSj1-%*W_hmqtn0Gj@B0gnDv zG1vlQBnCun*hDk?k)?US?HJSt*`Jm5;^_-nw?W`Pq;C^~D7)lT z{_f5MX+Q+3`vph*c4INoHrd3=)SSf_ZU$>4UuCr7e+ru0aWtVA09z*g;OZ1mG)C9h}B!4q~Q!dmG zMPYmA5mo(?qf65)>OPQ@MjXZ+o_>uJbnaoEP#5}*L!vyWva~*#T#2wj0`T87+0KB9 z@@UXHu|TDb9AO(`<*h)OT(5>ojPsRomPJ2thcl6!O;A=i(rH!1vEiCL%<4bYKnagR z*8*9Ax!Dd-!Kz`|m;S?)PuGEKN|D!X8+s6be$PVo7}!=BqQwc7B0gyUSgw%&k4DU)v11Iim=guZ ziVnF4i5y$-0VG7?TfumUBC3q(URF56f#2<5+@4y`{kok3s1~Q#bVb0+1vQvMYr0d4 zu3O{uSgT~Zc{J9-8>@!7Oi4U-+O~=L-crS8A9UQC(!{#yZ-3}rr@DP*q?!>%YV#MX z14u%MHq2q{4vJc3krC03&4_gbxzv_j8fGs$6eXLXuAZ1VDW`VWBFvMbNuo z`P{e`hd^$94HSE7_+udSjJwxK5&oTg`2CDxmY~ynvmH=*U7#kNk3o3MNV;>5hMQo-=!GoNGC-082M&beSdTCo_PMWBM#CZPUw7QnB&^Jf>D8@)J;gW?4# zSX^UyPKbVV&=X~f5+DJl&n;c1k)Rp$aY)mlaPU14svgn(9Wqm-|}G3C#ObfoM9>RrxXg<-92i7#yVcl27k z@nZkSQ~<6)chfriwiFk$qC%QP_@amSmCBeh2~nA5g$9;oqrhwI4m_*-dD*AG*mK=_ zApqQ1%A2h_EsWMSGJ}flS}eCo-q@8LxA&`J7F{QhYxV&$U}&6MKhl%bUt-D-d_T1- z=OV$Nn=Z%1C?ia2lzlt2>3Iq-CxV~EzSADyCl2z8Oah7*v4A%sUm zAyGKU8=75hArv{Cgp*trd1;GzGSM?~^eBi|*sid2H=^Ld3A zNXovzx>=zX%Hr@6wu_MOfy~ul&;>GyT6CD$8vsaRUZa~&e@`adUY76`4OYUaN8Imf zz~JYPpAA((Qu|_k5)ZjRogm4WTcu^aXa)SV)Xfi38OJHsjGH44%1C%EJm@!fC5H@j 
z2ed!IpTh{wG}9LhoQD;1oG0AYEe?SO`xE^AU*!kCbH_^EV!ftXn)#N^<6Q{87OpYkE6}`RHk_s4<;2i- zz`2<-wecBI^U%=H!mO9EG)k`hU~b6O1^F|$aULuGJV3NBcV*4my?2idT7O!^tZF~ec~RZ9WWrORG6j?cb07^=Y`3H$ z?_+I50pd)4+$h=!`UM%$@uhu~${f0$3BpDwUvf8IKd&_XETGuI%A} zRFCPKg?=1=&KB+2J5OG0j-ZCv`r{7Y-9ypo< z*)-G7iS?6olJDz(lE+p{%fhl3R*ZW)V8sS~ZZ*{i9gC8gqM|Q!i3u($X(p>Mj`e0} z>W?3Jo8+aPoN(k`8XT;YwXXVLQ*%u8}tB&p*&N)QUj{9lEc(tSC+C<%heap!*g6*vHHSjUlr~A}LiLo5G+7@hv zJ<-abs3(NT);MfI$K@GFa48qp9n|tA9^a5yF!tE)o$oR65@JC+*&b3RImzG4o02G7 zipEmRki1&7hX{2+yP-|&Gf)b~Avx!89q4A4BjV664w(=O<_`VP2%VqqJSg)3%sdka zKSP16QOz<%@tog1(+~i^wDYl{Z%>Y0hR)8!5D{O!Y<#4k=W!!+)mauFgnf;+*OQ*e zLOt&r*q0=$#AY^fqZW|r(Z|R1c0BDg00IWtrkr2@^mV6DJB&ipx1!J$+;fR7PE{Zm zOzKkl2(x|=LX;StFV&pM4e~gr2ZJJcq6@Nd;f1)u_I4cq>PbrO+UYhjvDEJ~!1gXu z`^@ZXO*h?%_R2mm9sWpX1MDROVT|&Z8NqWg&`(*rWEG??o_KovMWVz3qm{57v;+e4 zaj^yq%w6cfE~tmMzGor|sssP^9B(VVs-xG>;$3amd)zzEC+b~wl%6Kg&q~H~gr~lY zo%cMCr71If@C6mg3cv5z=WFqSy*a9Lii(<*zFys>^ARdWM<`tFdkr)wBG;pY`6G4i z7m!o>xM=0q0fW}+p>iDWnHp8)ioxTe+uheYDJw= zPSsDdnGpj2(1*qrE}ckR1x*){gvE6Q)miMRp@zRlQ%?vMNzSl_X*Q`C=IL`91pFBAq1oSbvE_E zsavfB79BYl=Z;Lck&8K;X+l}=5IODG(O7JOe_6|6{ON3B^R|em2vgI@mfdDP(^cj; znVVtx!wi~{^)>%E9YK@boqX?#Yp|7(GS`*duhN=1&kT%E_h!y74&~YOU6cacF!Qq! 
z?1l9M{Jp6qPj!PfEeLyI~m^1zdM`#bc2nZS7d338FBq_n>bJ$~=6Pk3C_ zGa+3oxsYi!P`P9BG}YnTQPR^(gyiLAfee_Hy9T+_q4m!zEYe@_J_qY#oEh4G8&Puz z55W1yr3%=F4{Y35(a#s&iZ$_d&YDVoAA0za`Ab#0S9=35x<$yd~0;_9HIDcTQi(aAI7#t)h?XrQJf10J!v{Gasx6=uSYw;Pl&{!X;s-^KVdrbyds5<6zE;bGNp8?YF+ZaWeU6?&X6y3RnMZtljgc8L(`3w%j^C37 z#1JyP*n_ZDV!_L6O-Xq(isX52N|6(&R@lAY0CJFQt`bk%ngW0Z*J>OS1Z)BKN#?t> zWQDYS5leZe5-ft^!*Pf^CtZ#G8yzOXE(^M!u4idPWTC1sW+%6O`A;XZANX80`V<%< z3SJ=j%{r>!Iq<@|A9KDSWDtlv^sFcwIYx+<05aYh`ebLjSN{dnZd+7&F4WC-MJlow zbR+bX;k6I9Bthxq%IbI18sJA}Pufeigm%M07m1F95$1D5cNf#+O++0(?Q1^^^2gD3 zm6m#GcXR0#R&=sz<`W?Zm3!<{axCn>I)qsxvWK5bsPRFcfO^DJ*86r{@m+{_s`*OD z!rB)K&)^nDc<{FeT${VQ4%2GPY*&Yz^#&mEp%#j;x)uCUW<_iVRzh6lUQm(UDmk4E z3_umU;|f4*C?PqdL{^VRYz7?fJycHPy}hnpuAA zz{@GM)9sH#mSB_(Dw@84)HA!_xtmK8LEbAqI$z;?ms{W(@;L&j79fL#1V}aK4qW+j zK-j8jRM==>xu(z2PNPX;?O#|6NP=;Y&1myNr0qQV$V~Hac2l*7_F;PTfwwQu?gkf9 z-lU*n=7Npn>IOQnUe*eDJmTPpp*{<}iqDwoZV{2v>fGr5f{1cQ?mIU^>z#7SvK|&F zKcW#3U_2}OqJ+2P!CIo|9dqc9k<4DzA=w z@SVP+(~_8-((|qy1(0Z~6uZvMT?0TvN$h7|89-q7wl}0+5$(`OMif<>(A5Ln!Y+9OT zy}q}rVGE=oT4TGQ5;$Jv+`9Iw4uQbTk+@%+u5UB(GpX5uf3@A7 z71u>6gT&TKDoN-a?^*l<;C4>{%%8vJ;>j^~=6hxC*6Y5~)b)qx0bEenk1d0I6&qn8Yq*0=pwAWmVbF!HT8Jrf}s_CzsR3s)_{HEqVDJ?nKPmW9Rv}T z(_O?YS63%N2HjIG+A&qi_p(NePM3hin@**+4*_o(=N&cnqD!bU5e*h|ZoN-?~5g0?=bod}=H?z;LKyjJ;7CL5U|qM;>7(UWKM;T=GIz z{c#n5_z|R(1h8J(KV$o^ngoLXhxWEo2vmRp zE=Com<_P#le`OC?s=U=Iu_I^P9P3x!&DKrV&x`WbNV~{YB`2ERv$U z{U4xa`S6y_L200cfTmp+;xvQA$hfHt)|EzngBpB^&xAc16SUt_X-LD|E#;`zp;Ht` zUf7pd88`XQBcgGM=w66HjMhqf)57I}bPg&a@I7=`vS`Pj`QdqT=nk(YcqD^g1`Q0@aAc0&{u%Ctc+iuU2ZIZyFpDb zd)uqVv}Ki0?9ZP;M@>NH9x?HOC)kSDSGfEj;w6>}nVaWgW%cXi$JOL=_ujQIlWbfI z;ve;HziR!rc0Z}i2Jj6vxw>0{VZxAEn+#Pu#4?e6H0k*Qw)$m$1DIxPZ_wx}4I$e? 
zWd&I?1i~C5W=^^KM6H}qft*#x7w2Sb)&aCl;XCO-8L3S__0z`%fy$U>r!t0ov0pH` zYeWOnaDiaJo}sk1U(kylF$rB0N6o!U&vd74?q-SJ$IDN6EntEUFQS98I;UMPz%e-~ zlkLKygGvTcSj0a`+VyDJ?-YnFQS}1FE{Lsju|dSWfdsmJ>$W7DVnnu<%4p!yR2yUa z9}}#b;x*_I`|^2mLQvEPz=)3_%DDC<+x~uHp@wi2L?H;Mh8ZHm)A?whHt?F^@>jgX zBfnYXu>eOumJGzu6zhIDmqL>-G(Nob(CeGnqPnEL6tWCKb+>l~#10PAZ=xP)9zd@n^g&xB%mWQj>8YbylwC*~m|At8;;{Q5<2R)$c9<@H27A@*(Jv6GB6`ZErjRQ2WmZ4s54R4A6ae0|xWJoBfD({?A#*fK z_}31;hNMa`3P}arTI3TN6EHyv3{+=-e!Aqx=N=`7L z2ZK?CmdS(e$$+xK3Ho$e%Yr}0?O??0m#+b_Su2woCX@x+LGKgzXY;LT?qAEx%kNT*#jiG* z6QU1^*7EA&2oOlUk>BEeF0`(~4=ze2BlGwEvQ7mVfE3 zY^y(OU`^NmEu~ZTaG3%=0;a(7cWcAims9^hQ1O&DF%j)Y=WAnePF~S_ylT$TjAFQa z5dh5bpc?p!iir4B!YfG->AEQ_q{t`G?frdKPbyS1X&2yLdp0O(FTe5ooa;Q~*?b!wyaUdi%sktQAXH2_O;-te`aljsjnGfOR3&iJ8XLBB5%&3*!>- zf*mG%>%aj%U6@ zEMtyHv)p;|`foRT4e9z-CsI+HV%mZ$wZPO}WFyOd#Rq_{Vjt`z6nh2|RQVYC#WYDy z%tna)Z#g(~RVMTM1~jF0mHMe{_gYn$~;L^F(^y3V$#I2<~CXo)lbk`07DFEzXz#lCjwjwo3P%uhW0P0_D^VWY8q|XWm;^SajWy=PFG0ZO#~n*cu{%5Cg31mu%JFva{s8LK<+yF$M4esNQ_F!0 z_<~2S^AeMw;B{<`%i&;Osru%rqC4ld{%LM*b|&FfcnDgh{{qe6f1L!<2zmWI`r@)Z zyD&;M%C06eLuwX&I9(>^?OH=#bTZdRV8j}&4C-Iq<(c0Jfx*#CZhP9>q`{tMA50mZ zU0$zGG)Y>{s_0dmj<&YI8`ea9(2QQLTBzt5Q=A!p@F>+|H#>v$wI6;7mMIpHzW z4GBN%(K+$hR!lrvLU#Q2;ZtG+%J zOFo4OO#kyfZc$o`Cv}eP#H4d;6b#9J#2coi%v>06(@N@g;CKcZugMhn^Sy0?19;+@ z!}d-=UAfb?&Ivqh9yRee+4)EC3~|H{;MG$}Wppf-RVeEtRyGq%F6p0{N#0 z8~@J}{q=s6FuWOMw`)fVle7ur?^)u5c@QHp+(8YdWf*!ht$s%MP)U7@zh_1M#5m~` z1lswM$&;D*FpsJC_uib%jwEufpEn%$Gnyjij)VIEa|wEor=PS1^Yx5p+x&hfYmb@O1vdlk`CW}=3-NBKLpAhu6GPjCovJh3 zS%{2`#ASJ;g%0kMSc!5uY2sq@okw|Lero;Tb~Oh4a?AfdOWOpCM;!b_=nKvX_OP-f zZPll(!B6_;@5U~h={q&MTs$=5F-i&FE-z1-pPl7+4xMZZ)(+IJA5RR$oz3EM-a8!$ z21LvA4a%vFz`iXiV7~b(TFUBG{XTa3Xd>i*3q>s;S=cK6d|M4^^XtiRxlkAln zA(gj&)f~_1MazHz#`E_>zy#egb)Q%&iV73oLT%EAt-pWp|K5VG0J7F(I+4NPvcq6v zY;T6E^SY88)B0IvzR4&sFdBC0|MP2JLifC%4^4roxgYO7_Y#suR-3dgTo>N2yIGm^ zsz1ww>94o==K>$`H#i_%f+_o8?G|eTX~ytR=S_hQhz7S%B`wf@AF?X?f$bslZ3FXv>72 z9;5$bXMs1#gSq1ztKHAWh~pTMuzu|sM+ 
zFiQ{py8OLjTsV1k6mf#}_`HQHvTJi<2FL&QO)ALz&t1BKb2!`HQn`k6R|GsLflHl2%N2*f|iafX$9r%2m7KhZ~m~iRumOmnpwcfetb!>=2HjMan{^u7d zg{DFpl2MI3mPmChaoi@$xH9e_A(z~K1ZEH~zI!dMQt*4@lwvIV^kA%X&!3e4XRP4m zV;E8QZCki7V)ZPNNt7u+gD8d0lL#l7$uWJ{5q{PCzbBa^ET8qFg$wRI{zf=;lFdTDPxBVo2xZq}evD@EN9Z6Q&M zbEmebahd|cE+*aeetKPAKcDa}o8gqn3Pi1*nOLA3x#}Ww!qa!Fp=EMH2&B z-XIgAlf4yQn9!OyGq1Xjp+RzWM!4)f)@latSbp0$HnXz5{+HYM;{oL$4RuM{2j<-4 zLq$xfLu^46OUj;(!<0zK6*v6R;p8qy3$P3dzru;vMU@rnuV<1K8#MM#yueNBQXoG^ ziT90SwbySG8kS@4ona`Jym%)KmB@6KaWW(>fmyUhE0u7S2SP~%-m2Pehever*&Cuk=hcy%2SEeV)r#Ot;tXHne6@lS@X)rh&8A3w4qELdK;Ut{pp%_6*>VJ zYZk=4Q(jx62hne>FtI<7oQeOiZ)bty1co@H(BLoHJQ$r&SL_bw(TvE>Z;vt1`)lPw z(VN^<`Z;rljk^W(L#d)aHUy&JXc6-@P^NOG*cp7x!+>KaoA!J~rlqBo-RJAu$~B2; ztr^4^YNily5ilf~|KH}v8v|G$Hdaj{LF?cGGh4!)$va?9IgK3f6XvT8KFu4e@tz@m zUXzVtOnd@gp#n|i?&LWEVK5m+F?`~elp?y19X~5kl@9axvT!i{=kQC>_OVY5vAYes zd*BwuteF>XJw>8|=nK~ge)US4kSm^-eptBfG79!q;fOiKI{GHN&QVLGVmMf?e-ClP zEOLN|oq*_VRn_j2r-LK;MVg;W3WoVl;eH+_1n0v%LA_#AR(0ED$A?NKHK92Omsn{#_vxo6X`4Ns^@BN2kX<3eH^~V| z&)}@ux)yyAF~2AMV%&bTD!RzX`JU+KyFtT=RprDajyB;3d{9QDqm-BDMc(MbCgixEseoWUEM8fSL84Nf zB)lOMocWsgyCg(*_-2W_{^t^<|Jj5WrrQZ+^aJ&%FWOl5uI3*9#{-6;=7yCT4HfTq zE{3~i`2+bXIoh#JJJf|I=^ltu72lqCzp{PM;XlknHusooKc=1J-w2&NA#pg`p z$cDO#d7oH$1TZ11?zwa3hNJhdm+X^GlHAC%Sj;n|kQz0=kAsY1y=I^`n~ z`MuxMZxQpfaT{!~7=A2H*ENW>){B(W8xu-qo+c3dXAn%IQ=!z1be(ICbd@eSJ*gVf zeGe8}?vv1uXOO=4?nER`JH|s}y9sN@54nD|cZJ8@Ex!s-|3(+yHYnxZkXt)7r(Pvd zei2o-xMbGntNoKEG-OU_faIXgvV1+yb(&ZYQhb!Bj%-Kp2H)2V|J^d47v^-u zSEN#ctUB{jxy`A=l*u+ zI3&zEAvdRRq#U|iyvgY2V$CRualZH zl?QdF1iC%(lb~{GsQ9I`Vw;Ek;(Qt(_ERMueykN}9Bp^*YuCey*H_Cq3vXW|Ed8t? 
z^=)VWucySx=b*Cq2v{tx6L>sdIed8!>9fPY%DB&+!%@4Ob@Nl^O>BIoeUQrzIv=*8kExl@%q<`>Dq*4Q^ET z_Rgo}eb&4|yRt}E>z__@`+>7bu+Nznw?nHSR9NfNUK+WdQbX5hY=p>3@cG<1xT0L` zh#f{L&d%_UUhn?!Po>TVhu+6xNh?`)hPwoB()&A>QtF2^A1R&=arKqQYAJKyGOi6- z#2x!`*XQ1g=y`NmOIK0DmL`pu#{RyC5)QI7*;+fHFy^3Vgsp?3 zWUeVGS~BXdZP@WBEH*FTwN`0i!QI^@^W&`<^nrcLvlv%4FuT~a8l@UbTC3Un)~lo% zi#Rp%(#!AncXpXteNxTGJMyp5`Tz5~&89Wm+EyM{49OAP-1=4LJU&K#NL8~e zebqk=Ue*59+E<{59KjNT?F?Q|L1AV3G+||-j&9i zJnD_(O)+Y5p>pvfhKH`DSQn$Or14-tPkt|^yG!iZ_g#0*6zB4o6D_3c3NHG*|I2>) zXIE7ff7|YoI^E#Xv6eAiha(tSQ`@zb;v=)*h3AHJ+m_gx%l-q(Dbeok7Sx0BZT(sI z$x;W-_CLx@ZOLeCPx@7(EaZAf>sNV@`!=b3&HiS!zMdZ1Txjt4!AHuyC@J^loNSh>SiLXYNZkfRys za=N(pOG2j!H zduAl3kFdR$PDi`V>)3ipqu z7~1aDqXk&gN3x^?eQ3i8g^N1VHrBX?OmdhO3G$-Cb;gFrZt)XB7j(Pki0J##(+hU} z8VI{@%EMJLntFdL?FcH$Cb{HhoGLAR^R#v#;O7 zu(nNELi$s8{zKhjQbG}>JYr4cUXqr|z2p}};g~p>FTeV_l}&TulyXf90T_mGGhsh* zp-e-C-EZOJyNpUR+g|Lg6!qm*Fz7R567h~CSr53R93W|>cQKlm&E;7M=*Go`ZX2h^ z_$2dq7a0Gtz9bKM6GHl9nc&qp^T`h}$s}UCs|NYV8>POMDBT*5A_~9o61Yg-UCU|% zdxQaO%DTO`sZPp$hpl^QRgNb+3GPVd%9IO`#VgYa+cdTZ6+MR0OjNVzBU>?;+s6Gs z;LvZJUB-Kozx%ATVT2`J4F;D$^54r0CX4K(N3qMi&Hj!{TP{d4mb658`=#^vtn#+$ zArPY!;il1(19hhCe#MM~3(9gSnm^|8ZWGbc(V?C8^c2@Werel*EPl=FPy+AgwWx_y zmSxHM-XMD%>2z3q?R4h8m*m557E6fx%glWtGF<~(0;Yjc5-OVbN<(cr7+<|hj@jSU zX{`C7EF+tFXR;IXzSD&Bj7h)D5bMHfkY;K2+0_!n4qu`|BJrb(yVwHFZWs5PCk!ec z3sSW)sRean5k1cfug}X4(5Biu3H@D~`#^^L*FXr>dEyGS?V$`wX|^_SHgd)`7~)dd&g3isJ0Uz9=T;=Jmvub5V$MOqHCgIP zMv;-0f;4Iojwm=5{~Z9$!M^IA&uu0Hce=B%Vu(<_HB^)f#;h0)jplob!-9!6J84~9 z;v~Q-$$MIxQR3qKpTdBvQd5``j#}=i1rR(hRhYo+X4IH5;YGpYJAZL!kHP^>yfzr; zqi@nIX|+%ZvgSd?Aj+ZG9XL|FZdXTC)a<3{!)^I<~Az zpS^>|!T`Gt26XC5q9ILl2714ssmn~0PT@NEOe=Q$SpX)P7#tE>{#+q8<=XM-%9^YN zuO=2wJt>_v*1dZaSdQY64UI!zMqRL}27hB{PskW+8I`*2?7NTocV=A#?M|Iu*ZT$MY@B(M))ObppVS;IM+QDD}J)dBxew=*3t+d~L`a^AJ`9 zicUEeCwjt`S?XI*(NnP0$}?8u`=4)kDp=t*cTunBTvX!SAUpHFID4~GjTy5fJ1i!{ zy0YmpT@@KDW6R4!Uo-v%o_|H~iVwtD4LS5M&mj?1%p%nW!`c=4rYlItj3Bx6wUs+kZnXNUxrRQrZF$iw_9G95}nA#FEu&j1HOCQZtYE}>#Z><{@SRK}#C 
z%O&$n{Cnny_%k@CFcBFe_MK@$*4cp&+~!&J7E|C_HrB`TA&t9w(_bO?o#%M%9(k>| z7j-W=HL*PV50PdtlCMZgMdbrFgtO|7&Nko_27qTUcd>3CEYQ1HJZCl)?*-V3japMo zs{{%j(BF6gNhM5RDfkWqLP)+rhM@y{$!Z1gmWH2#QC@1E^EZ z{B5d8&ya*!QYCWE80IlCv8#O}j__<*0k7NqUWxNh1|9GFc)`oVhxp)GGZ7R@g2zf103d1i z)4lMgi`htULGHM(ukR?_p!rA7o?u_s#AQe@2rolfpn7>%v=a)%!GZyNTi|w+`Q*$P zTFmalaS|Zp@6ZoIEZG4EW%5ViaY+tLNVbOob^5hf2`O`>C9o-^<#=!eWoE^}4$H&Q z!0ruE$g?KgHX#5P@~_{CG&a1N1n;nZ<)~5+=>~A|WYkmGcne=KUs0Z+JhbFmdgZuM}6g{JF zj%tI2^KQ?XGhcDX7Cnn;VMAI3Q6hVk!@-;#FUu0}KWmm%+iP=9DWZuA;I9FTEI0+` zq`}z2!n|;TF8)$GaTffU?Qd~&7JULgJRczOamH|L(ah3Uxv*^o#3D*=8 x#^IWRYl;aAOnl%*1=kc47MS?({}L6Wi;gJ|-!0L(aBmL$*|F7h3q{{H_`h`_ and transform tensors originating from these +frameworks to CuTe tensors. The present page documents the conventions, the API available to the +user, and provide example code snippets for common usage patterns. + +Implicit Conversion +------------------- + +Tensors originating from frameworks supporting the DLPack protocol can be directly provided to a +JIT function as a regular parameter. |DSL|'s runtime implicitly converts the original tensor to a +CuTe tensor with a fully dynamic layout except for the stride element corresponding to the leading +dimension. The example below demonstrates this use case. + +.. code-block:: python + + import torch + import cutlass.cute as cute + + @cute.jit + def foo(src): + """ + The following lines print + + ptr o (?,?,?):(?,?,1) + + """ + print(src) + print(type(src)) + + a = torch.randn(30, 20, 32, device="cpu") + foo(a) + + +Explicit conversion using ``from_dlpack`` +------------------------------------------ + +|DSL|'s runtime provides an interface for converting DLPack-compatible tensors to CuTe tensors, + +.. code-block:: python + + b = cute.runtime.from_dlpack(a) + +where ``a`` is a tensor supporting the DLPack protocol with the ``__dlpack__`` +and ``__dlpack_device__`` methods. The resulting CuTe tensor ``b`` has a fully static layout. 
This +conversion is performed without copying any tensor data, enabling seamless integration with major +frameworks. Users can create tensors using NumPy, PyTorch, etc. and directly feed them into JIT +functions writtnen using |DSL|. + +The resulting CuTe tensor shares the same underlying memory buffer as the original tensor. This +zero-copy approach maximizes performance by eliminating unnecessary data duplication. However, it is +important to note that the CuTe tensor's validity is tied to the lifetime of the original tensor. If +the source tensor is destroyed or goes out of scope, the corresponding CuTe tensor becomes invalid +since it references the original memory location. + +The full signature of from_dlpack is as follows: + +.. code-block:: python + + def from_dlpack(tensor, assumed_align=None): + +The ``assumed_align`` integer parameter specifies the alignment of the tensor in unit of bytes. +The tensor's base address must be divisible by ``assumed_align``. When not provided explicitly, +the alignment is set to the natural alignment of the tensor's element type. Note that the alignment +information is part of the pointer type in the generated IR. Therefore, programs with different +alignments have a different IR and identical IRs are required for hitting the kernel caching +mechanism of |DSL|. + +Code Example +~~~~~~~~~~~~ + +The following code demonstrates how to convert a PyTorch tensor to a CuTe tensor using the +``from_dlpack`` function with default parameters. + +.. code-block:: python + + import torch + import cutlass + from cutlass.cute.runtime import from_dlpack + + x = torch.randn(30, 20, device="cpu") + y = from_dlpack(x) + +Once converted, we can access the tensor's information through various +attributes. 
The following list shows the attributes of the converted tensor: + +- ``tensor.shape``: the tensor's shape +- ``tensor.stride``: the tensor's stride +- ``tensor.memspace``: the tensor's memory space +- ``tensor.element_type``: the tensor's element data type + +.. code-block:: python + + import torch + import cutlass + from cutlass.cute.runtime import from_dlpack + + x = torch.randn(30, 20, device="cpu") + y = from_dlpack(x) + + print(y.shape) # (30, 20) + print(y.stride) # (20, 1) + print(y.memspace) # generic (if torch tensor in on device memory, memspace will be gmem) + print(y.element_type) # Float32 + print(y) # Tensor<0x000000000875f580@generic o (30, 20):(20, 1)> + +The string format of the resulting CuTe tensor is + +.. code-block:: + + Tensor<0x{tensor.data_ptr:016x}@{tensor.memspace} o {tensor.shape}:{tensor.stride}> + +As can be seen in the example above, ``from_dlpack`` first results in a tensor with a static layout. +To obtain dynamic or mixed static/dynamic layouts after calling ``from_dlpack``, the +``mark_layout_dynamic`` and ``mark_compact_shape_dynamic`` functions are used and described in +the following sections. + +When to Use Explicit Conversion? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The DLPack protocol is a widely used protocol for interoperability between different frameworks. +However, there is some associated overhead. Based on our benchmark, it usually takes between 2 to 3 +us per call to ``from_dlpack``. + +Explicit conversion allows for caching the converted CuTe tensors in order to avoid the overhead of +repeated calls to ``from_dlpack``. + +.. code-block:: python + + x = torch.randn(30, 20, device="cpu") + if key not in cached_tensors: + # Do the conversion only for cache misses + cached_tensors[key] = cute.runtime.from_dlpack(x) + foo(cached_tensors[key]) + +Another use case for explicit conversion is to gain fine-grain control over which modes of a tensor +are considered dynamic from the perspective of the generated program. 
+ +Mark the Tensor's Layout as Dynamic with ``mark_layout_dynamic`` +---------------------------------------------------------------- + +After calling this function, all shape modes become dynamic. The stride modes also become dynamic +with the following two exceptions: + +1. the leading dimension's stride remains fixed at 1; +2. stride elements equal to 0 (which indicates broadcasting) are retained. + +The full signature of ``mark_layout_dynamic`` is as follows: + +.. code-block:: python + + def mark_layout_dynamic(self, leading_dim: int|None = None): + +The ``leading_dim`` parameter specifies the leading dimension of the tensor. The leading dimension's +stride is set to 1 unless inconsistent with the layout of the DLPack tensor. For example, + +- For a tensor with layout ``(2,2,3,4):(2,1,4,12)``, if ``leading_dim`` is specified to be 1, + the layout will be marked as ``(?,?,?,?):(?,1,?,?)``. +- If ``leading_dim`` is specified to be 0, a deduction failure error is raised because the stride of + dimension 0 is 2 (not 1). + +The default value for ``leading_dim`` is ``None``. In such case, the system +automatically deduces it from the tensor's layout using the following logic: + +1. If a dimension's stride is 1, that dimension is marked as the leading dimension. +2. If multiple dimensions satisfy condition 1, an error is thrown indicating deduction failure. + Note that after converting a **PyTorch** tensor to the DLPack format, the stride for dimensions + with size 1 are canonicalized to 1. This canonicalization can increase the likelihood of + deduction failures. This behavior is specific to PyTorch and does not occur with NumPy for + example. +3. If no dimension satisfies condition 1, all strides are marked as dynamic. + +For example: + +- For a tensor with layout ``(2,2,3,4):(2,1,4,12)``, the leading dimension is 1. + The layout will be marked as ``(?,?,?,?):(?,1,?,?)``. 
+- For a tensor with layout ``(1,5,1):(1,1,1)``, if ``leading_dim`` is not specified, + a deduction failure error is raised. +- For a tensor with layout ``(2,2):(8,2)``, since no dimension has stride 1, + all dimensions are marked as dynamic: ``(?,?):(?,?)``. + +Code Example +~~~~~~~~~~~~ + +The following example demonstrates how to use ``mark_layout_dynamic`` to specify dynamic tensor layouts. + +* ``t0`` shows the usage of ``mark_layout_dynamic`` with unspecified ``leading_dim`` and the automatic deduction of leading dimension. +* ``t1`` & ``t2`` shows the usage of ``mark_layout_dynamic`` with specified ``leading_dim``. +* ``t3`` shows the usage of ``mark_layout_dynamic`` with no leading dimension. +* ``t4`` shows the usage of ``mark_layout_dynamic`` with broadcasted dimensions. +* ``t5`` demonstrates the deduction failure when the there're more than one dimensions with stride equals to 1. +* ``t6`` & ``t7`` demonstrates incorrect settings for ``leading_dim`` and expected errors. + +.. code-block:: python + + import torch + from cutlass.cute.runtime import from_dlpack + + # (8,4,16,2):(2,16,64,1) + a = torch.empty(16, 4, 8, 2).permute(2, 1, 0, 3) + # (1,4,1,32,1):(4,1,4,4,4) => torch tensor when dimension has shape 1, its stride is degenerated to 1, + # resulting in (1,4,1,32,1):(1,1,1,4,1) + b = torch.empty(32, 1, 1, 1, 4).permute(3, 4, 1, 0, 2) + # (2,2):(8,2) + c = torch.empty(3, 4)[::2, ::2] + # (3,1,1,5):(5,0,0,1) + d = torch.empty(3, 1, 1, 5).expand(3, 4, 2, 5) + + # auto deduce the leading dimension to be 3 + t0 = from_dlpack(a).mark_layout_dynamic() + print(t0) + # (?,?,?,?):(?,?,?,1) + + t1 = from_dlpack(b).mark_layout_dynamic(leading_dim=0) + print(t2) + # (?,?,?,?,?):(1,?,?,?,?) + + t2 = from_dlpack(b).mark_layout_dynamic(leading_dim=2) + print(t3) + # (?,?,?,?,?):(?,?,1,?,?) + + t3 = from_dlpack(c).mark_layout_dynamic() + print(t3) + # (?,?):(?,?) 
+ + t4 = from_dlpack(d).mark_layout_dynamic() + print(t4) + # (?,?,?,?):(?,0,0,1) + + t5 = from_dlpack(b).mark_layout_dynamic() + # Can't decude the leading dimension from layout, please specify the leading_dim explicitly. + + t6 = from_dlpack(a).mark_layout_dynamic(leading_dim=1) + # Expected strides[leading_dim] == 1, but got 16 + + t7 = from_dlpack(b).mark_layout_dynamic(leading_dim=3) + # Expected strides[leading_dim] == 1, but got 4 + +Mark the Tensor's Layout as Dynamic with ``mark_compact_shape_dynamic`` +----------------------------------------------------------------------- + +The ``mark_compact_shape_dynamic`` function provides fine-grain control over dynamic shapes for compact +layouts. The full signature of ``mark_compact_shape_dynamic`` is as follows: + +.. code-block:: python + + def mark_compact_shape_dynamic(self, mode: int, stride_order: tuple[int, ...]|None = None, divisibility: int = 1): + +The ``mode`` parameter determines which shape dimension becomes dynamic. After calling this function, +the specific shape dimension given by ``mode`` is marked as dynamic immediately. The stride will be +updated accordingly but this process is delayed until the C ABI of the tensor is constructed. +For modes that have a shape of size 1, their stride are canonicalized to 0. + +The ``stride_order`` parameter specifies the ordering of strides in the tensor. It is consistent +with ``torch.Tensor.dim_order()`` and defaults to ``None``. The parameter indicates the order of +modes (dimensions) if the current layout were to be converted to row-major order. It starts from the +outermost to the innermost dimension when reading it from left to right. This parameter must be +explicitly set when the stride order cannot be automatically deduced from the tensor's layout, such +as when multiple dimensions have a stride of 1. 
+ +For example: + +- Layout ``(4,2):(1,4)`` has a ``stride_order`` of ``(1,0)`` indicates the innermost dimension is + 0 (``4:1``), the outermost dimension is 1 (``2:4``). +- Layout ``(5,3,2,4):(3,1,15,30)`` has a ``stride_order`` of ``(3,2,0,1)`` indicates the innermost + dimension is 1 (``3:1``), the outermost dimension is 3 (``4:30``). + +If ``stride_order`` is not specified, the system automatically deduces it from the tensor's layout +using the following logic: + +1. Sort the strides in descending order. +2. If multiple dimensions have a stride of 1, a deduction failure error is raised. + +For example: + +- For a tensor with layout ``(2,2,3,4):(2,1,4,12)``, the deduced ``stride_order`` is ``[3,2,0,1]``. +- For a tensor with layout ``(1,5,1):(1,1,1)``, ``stride_order``'s deduction fails because + all dimensions have an identical stride of 1, making it impossible to determine the correct ordering. + +If ``stride_order`` is specified, the system validates that the order is consistent with the +tensor's layout. + +The ``divisibility`` parameter specifies the divisibility of the dynamic shape. It could be used to +represent the assumption alignment of the input. Defaults to 1. + +Note that this API is only available for compact tensors. For non-compact tensors, we can use +``cute.assume`` to attach divisibility information to a specific shape mode in a host JIT function, +as demonstrated in the following example: + +.. code-block:: python + + @cute.jit + def foo(a: cute.Tensor): + new_shape = a.shape + # use cute.assume to set shape of mode=0 with divisibility=16 + new_shape[0] = cute.assume(new_shape[0], 16) + new_layout = cute.make_layout(new_shape, stride=a.stride) + new_a = cute.make_tensor(a.iterator, new_layout) + + +Code Example +~~~~~~~~~~~~ + +The following example demonstrates how to use ``mark_compact_shape_dynamic`` to specify dynamic tensor layouts. 
+ +* ``t0`` & ``t1`` show the usage of ``mark_compact_shape_dynamic`` with unspecified ``stride_order`` and different ``mode`` and ``divisibility``. +* ``t2`` shows the usage of consecutive ``mark_compact_shape_dynamic`` with unspecified ``stride_order`` and different ``mode`` and ``divisibility``. +* ``t3`` & ``t4`` show the usage of ``mark_compact_shape_dynamic`` with different specified ``stride_order``. +* ``t5``, ``t6``, ``t7``, ``t8``, ``t9``, ``t10``, ``t11``, and ``t12`` demonstrate incorrect settings for parameters and expected errors. + +.. code-block:: python + + import torch + from cutlass.cute.runtime import from_dlpack + + @cute.jit + def kernel(t: cute.Tensor): + pass + + # (8,4,16,2):(2,16,64,1) + a = torch.empty(16, 4, 8, 2).permute(2, 1, 0, 3) + # (1,4,1,32,1):(4,1,4,4,4) => torch tensor when dimension has shape 1, its stride is degenerated to 1, + # resulting in (1,4,1,32,1):(1,1,1,4,1) + # b.dim_order() is (3,2,4,0,1) + b = torch.empty(32, 1, 1, 1, 4).permute(3, 4, 1, 0, 2) + + # auto deduce the stride order to be [2,1,0,3] + t0 = from_dlpack(a).mark_compact_shape_dynamic( + mode=0, divisibility=2 + ) + kernel(t0) + # (?{div=2},4,16,2):(2,?{div=4},?{div=16},1) + print(t0) + + t1 = from_dlpack(a).mark_compact_shape_dynamic( + mode=1, divisibility=2 + ) + kernel(t1) + # (8,?{div=2},16,2):(2,16,?{div=32},1) + print(t1) + + t2 = from_dlpack(a).mark_compact_shape_dynamic( + mode=1, divisibility=2 + ).mark_compact_shape_dynamic( + mode=3, divisibility=2 + ) + kernel(t2) + # (8,?{div=2},16,?{div=2}):(?{div=2},?{div=16},?{div=32},1) + print(t2) + + t3 = from_dlpack(b).mark_compact_shape_dynamic( + mode=2, divisibility=1, stride_order=(3, 0, 2, 4, 1) + ) + kernel(t3) + # (1,4,?,32,1):(0,1,4,?{div=4},0) + print(t3) + + t4 = from_dlpack(b).mark_compact_shape_dynamic( + mode=2, divisibility=1, stride_order=(2, 3, 4, 0, 1) + ) + kernel(t4) + # (1,4,?,32,1):(0,1,128,4,0) + print(t4) + + t5 = t2.mark_compact_shape_dynamic( + mode=3, divisibility=5, 
stride_order=(0, 1, 2, 3) + ) + # The stride_order is not consistent with the last stride_order + + t6 = from_dlpack(a).mark_compact_shape_dynamic( + mode=3, divisibility=5, stride_order=(0, 1, 2, 3) + ) + # The stride_order is not consistent with the deduced stride_order + + t7 = from_dlpack(b).mark_compact_shape_dynamic( + mode=0, divisibility=4 + ) + # The layout could not be deduced, please specify the stride_order explicitly + + t8 = from_dlpack(b).mark_compact_shape_dynamic( + mode=30, divisibility=5, stride_order=(3, 0, 2, 4, 1) + ) + # Expected mode value to be in range [0, 5), but got 30 + + t9 = from_dlpack(b).mark_compact_shape_dynamic( + mode=3, divisibility=5, stride_order=(2, 1, 2, 3, 4) + ) + # Expected stride_order to contain all the dimensions of the tensor, but it doesn't contain 0. + + t10 = from_dlpack(b).mark_compact_shape_dynamic( + mode=3, divisibility=5, stride_order=(0, 1, 2, 3, 4, 5) + ) + # Expected stride_order to have 5 elements, but got 6. + + t11 = from_dlpack(b).mark_compact_shape_dynamic( + mode=0, divisibility=4, stride_order=b.dim_order() + ) + # The shape(1) of mode(0) is not divisible by the divisibility(4) + + t12 = from_dlpack(b).mark_compact_shape_dynamic( + mode=0, divisibility=1, stride_order=(2, 1, 3, 0, 4) + ) + # The stride_order is not consistent with the layout diff --git a/media/docs/pythonDSL/cute_dsl_general/notebooks.rst b/media/docs/pythonDSL/cute_dsl_general/notebooks.rst new file mode 100644 index 00000000..623c2ac3 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_general/notebooks.rst @@ -0,0 +1,16 @@ +.. _notebooks: + +Educational Notebooks +===================== + +A number of notebooks for educational purposes are provided in the `CUTLASS GitHub repository `__. 
A list of handy links is given below:
+ +**What should I learn, CUTLASS C++ or the Python DSLs?** + + We believe the Python DSLs will significantly improve the learning curve and recommend starting + with them for all newcomers, as they eliminate the inherent complexity of learning C++ + metaprogramming for GPU kernel programming. Since CuTe C++ and CuTe DSL share fully isomorphic + programming models and patterns, any knowledge gained can eventually be applied to C++. + +**Where will the code live? PIP wheel or GitHub repo? Do I have to build it myself?** + + This is a major change compared to CUTLASS C++ and Python DSLs. Going forward, + the GitHub code only exists as a way for users to file issues and pull requests against. + While it can be used with the pip wheel, we do not recommend most users do so unless they are + hacking on the DSL itself. For all other users, we recommend they + simply ``pip install nvidia-cutlas-dsl`` and use the pip wheel as the single source + of truth for the dialect compiler and DSL implementation. CUTLASS GitHub repository will + contain a ``requirements.txt`` file pinning the version of the wheel consistent with the state + of the OSS repository (please see :doc:`quick_start`). This means getting started with + CUTLASS is easier than ever: no more CMake command lines to learn and no more builds to kick + off. Simply install the pip wheel and start running the examples. + +Migration +--------------------- + +**Should I port my code from C++ templates to Python?** + + Almost certainly not, unless you need extremely fast JIT times for your kernel and C++ compile times + are a blocker for you. The 2.x and 3.x APIs will continue to be supported, and Nvidia's Hopper and + Blackwell architectures 3.x will continue to improve in terms of features + and performance. + +**Are portability promises different with Python?** + + For the initial release while the DSL is still in beta, we do not promise any portability + as we may make changes to the DSL itself. 
While we do not expect any changes to the CuTe operations, + the DSL utilities, decorators, helper classes like pipelines and schedulers may change as we refine them + with community feedback. We encourage users to file issues and discussions on GitHub during this + beta period with their feedback! + + In the long term, we plan to continue to treat the OSS community with care. + Just like the prior history of CUTLASS, we plan not to break users unless necessary, + but we reserve the right to make limited breaking changes in case we believe it is a + net benefit to the community and project. These will be announced ahead of time and/or + clearly highlighted in the CHANGELOG of each release. + +Technical +--------------------- +**What NVIDIA architectures will it support?** + + CuTe DSL will support all NVIDIA GPU architectures starting with NVIDIA Ampere Architecture (SM80). + +**Will it be compatible with DL frameworks (e.g., PyTorch, JAX)?** + + Yes, we will provide utilities to convert from DLPack-supported tensor formats + to ``cute.Tensor``. This should allow a user to never have to leave Python + when writing model code in their framework of choice. Our JAX interoperability story is not + as strong as PyTorch's today, however, we are actively working on improving it + and welcome contributions in this space. + +**Does it compile to PTX or SASS?** + + CuTe DSL compiles the program down to PTX. After that, we currently use the PTX compiler that + ships with the CUDA toolkit to compile the PTX down to SASS. We plan to remove + this limitation in the future and allow the use of the PTX JIT that is included in the + CUDA driver in case a user does not have a CUDA toolkit installed. + +**Do I need to use NVCC or NVRTC?** + + No, the ``nvidia-cutlass-dsl`` wheel packages is everything needed to generate GPU kernels. It + shares the driver requirements of the 12.9 toolkit which can be found + `here `__. 
+ +**How would one debug the code?** + + Since CuTe DSL is not native python and an embedded DSL instead, tools like `pdb` + cannot be used. However, if you have experience with GPU kernel programming, the debugging + techniques will be nearly identical. Typically, compile time and runtime printing + of types and values are the most expedient. Please see `documentation on printing `__ + to learn how to print types and values at both compile time and runtime. + You can also use ``cuda-gdb`` to set breakpoints in the program and step through the execution + or use tools such as ``compute-sanitizer`` to detect and triage bugs in your program. As the DSL + matures, our source location tracking from Python user programs will also improve to provide + more helpful source-level mapping when setting breakpoints and using other tools such as nsight. + +**How would one implement warp specialization in CuTe DSL?** + + Exactly the same way you would in C++ but in a Python-native syntax instead. + Consult our :doc:`cute_dsl_general/dsl_control_flow` and + `"Blackwell kernel example" `__ + for a detailed how-to guide. + +**Can I call functions from other functions or use OOP?** + + Yes. We frequently call functions from one another and set up class + hierarchies to organize and modularize our code for pipelines and schedulers. + Consult the :doc:`cute_dsl_general/dsl_introduction` documentation or our examples for more details. + +License +--------------------- +**Q:What is the license for CuTe DSL and the associated GitHub samples?** + CuTe DSL components available `on Github `__ and via the nvidia-cutlass-dsl Python pip wheel + are released under the `"NVIDIA Software End User License Agreement (EULA)" `__. + Because the pip package includes a compiler that shares several components with the CUDA Toolkit, + it is subject to usage terms and restrictions similar to those of the CUDA SDK. Please refer to the EULA for specific terms of use. 
CuTe DSL samples and Jupyter notebooks, released
It is important to understand that CuTe DSL +does NOT implement the complete Python language semantics in its JIT compilation process. + +This section documents the current limitations of the CuTe DSL. While some of these limitations +may be addressed in future releases, developers should be aware of them when building applications with +the DSL. + +Notable unsupported features +---------------------------- + +- GeForce RTX 50 Series support +- RS WGMMA (The input matrix A comes from register and the input matrix B comes from shared memory) +- Programmatic Dependent Launch (PDL) +- narrow-precision data type support, including related tensor core instructions +- convolutions +- full support for ahead of time compilation +- preferred clusters +- CLC-based tile schedulers +- EVT support +- Windows support + +Programming Model +--------------------- + +**Python Native Data Types** + CuTe DSL supports Python data structures when used for "meta-programming," + but these structures cannot be treated as dynamic values modifiable at runtime. + For instance, lists and dictionaries can be used to configure kernel parameters + during compilation or serve as containers for dynamic values, + but their structure and organization cannot be altered during kernel execution. 
+ + - **Static Values:** + - Evaluated during JIT compilation phase + - Immutable after compilation completes + - Most Python native types (lists, tuples, dictionaries) are processed as static values + - Primarily utilized for "meta-programming" and configuration purposes + - Example: Lists can contain dynamic values but their structure cannot + be modified during kernel execution + + - **Dynamic Values:** + - Evaluated during runtime execution + - Modifiable during execution of JIT-compiled functions + - Only a specific subset of Python types are supported as dynamic values + - Primitive types are automatically converted when passed as function arguments: + - ``int`` → ``Int32`` (may be updated to ``Int64`` in future releases) + - ``bool`` → ``Bool`` + - ``float`` → ``Float32`` (may be updated to ``Float64`` in future releases) + + The JIT compiler processes Python native types analogously to C++ template parameters. + The compiled code cannot manipulate dynamic values of composite types + such as lists, tuples, or dictionaries. + + For example, the following code doesn't work as a traditional Python program inside a JIT function. + + .. code:: python + + @cute.jit + def foo(a: Float32, b: Float32, i: Int32, res: cute.Tensor): + xs = [a, b] + # indexing list with dynamic index is not supported in CuTe DSL: + res[0] = xs[i] + + if i == 0: + # This will always append Float32(3.0) to the list regardless + # of the runtime value of `i` + xs.append(Float32(3.0)) + + for i in range_dynamic(10): + # This only appends one element to the list at compile-time + # as the loop doesn't unroll at compile-time + xs.append(Float32(1.0)) + +**Python Function** + The DSL currently does not implement support for return values from Python functions, + although this capability is planned for future releases. + + Example: + + .. 
code:: python + + @cute.jit + def foo(): + return 1 # Currently unsupported in CuTe DSL + +**Expression or Statement with Dependent Type** + CuTe DSL implements static typing and does not support dependent types. + The type of each expression must be determinable during compile time, + in contrast to standard Python which implements dynamic typing. + + Example illustrating functionality in Python that is not supported in the DSL: + + .. code:: python + + # Valid in standard Python, but unsupported in CuTe DSL + max(int(1), float(2.0)) # => 2.0 : float + max(int(3), float(2.0)) # => 3 : int + + In CuTe DSL, types are promoted. For example: + + .. code:: python + + @cute.jit + def foo(a: Int32, b: Float32, res: cute.Tensor): + res[0] = max(a, b) # Type is automatically promoted to Float32 + + The following code using an inlined if-else expression with dependent types + is not supported in CuTe DSL: + + .. code:: python + + @cute.jit + def foo(cond: Boolean, a: Int32, b: Float32, res: cute.Tensor): + res[0] = a if cond else b + + +**Control Flow** + The DSL transforms Python control flow statements (``if``, ``for``, ``while``) + during Abstract Syntax Tree (AST) processing into structured control flow in MLIR + which has the same constraints as dependent types. For instance, + changing the type of a variable in a loop body is not allowed. + + - Variables must be defined prior to the control flow statement + - Type consistency must be maintained throughout the control flow statement + - Early exit or return from if-else statements is not supported + + Example illustrating functionality in Python that is not supported in the DSL: + + .. code:: python + + @cute.jit + def foo(): + a = Int32(1) + for i in range_dynamic(10): + a = Float32(2) # Changing type inside loop-body is not allowed in the DSL + +**Built-in Operators** + The DSL transforms built-in operators like ``and``, ``or``, ``max``, ``min``, etc. + into MLIR operations. They also follow the same constraints as dependent types. 
+ For instance, ``a and b`` requires ``a`` and ``b`` to be of the same type. + + Comparison like ``==`` on Sequence of dynamic values is known to not produce + expected result at runtime. + +**Object Oriented Programming** + The DSL is implemented on top of Python and supports Python's object-oriented programming (OOP) features + for meta-programming at compile-time. + + However, similar to other composed data types, the DSL provides limited support for OOP when objects + contain dynamic values. It is strongly recommended to avoid passing dynamic values between member methods + through class state in your code. + + The following example illustrates functionality in Python that is not supported in the DSL + without implementing the ``DynamicExpression`` protocol: + + .. code:: python + + class Foo: + def __init__(self, a: Int32): + self.a = a + + def set_a(self, i: Int32): + self.a = i + + def get_a(self): + return self.a + + @cute.jit + def foo(a: Int32, res: cute.Tensor): + foo = Foo(a) + for i in cutlass.range_dynamic(10): + foo.set_a(i) + + # This fails to compile because `a` is assigned a local value defined within the for-loop body + # and is not visible outside of the loop body + res[0] = foo.get_a() + + The example above fails to compile because ``Foo.a`` is assigned a local value defined within the for-loop body, + which is not visible outside the loop body. + + The CuTe DSL implements an internal mechanism that provides limited support for OOP patterns via protocol. + As the DSL continues to evolve to support additional features, this mechanism is subject to change + and is not recommended for direct use in users' code for better portability. + + +**CuTe Layout algebra in native Python** + Entirety of CuTe Layout algebra operations and APIs require JIT compilation. These + functionalities are exclusively available within JIT-compiled functions and cannot be + accessed in standard Python execution environments. 
+ + Additionally, there exists a restricted set of data types that can be passed as arguments + to JIT-compiled functions, which further constrains their usage in native Python contexts. + Only the following CuTe algebra types are supported as JIT function arguments: ``Tensor``, ``Pointer``, + ``Shape``, ``Stride``, ``Coord`` and ``IntTuple``. For ``Stride``, we don't support ``ScaledBasis`` + from native Python Context. Unfortunately, in the first release, we don't support + passing ``Layout`` under native Python Context. + + +Suggestions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For reliable and predictable results: + +- Avoid dependent types in your code +- Implement explicit type conversion for dynamic values +- Clearly distinguish between static (compile-time) and dynamic (runtime) values +- Use type annotations as much as possible to help JIT compiler + to identify type to avoid ambiguity + + +.. code:: python + + # Example demonstrating explicit typing + alpha = 1.0 # Explicitly defined as float using `1.0` instead of `1` + # or `float(1)` + beta = 2.0 # Explicitly defined as float + result = max(alpha, beta) # Will correctly perform float comparison + +**Debugging Capabilities** + Debugging tools and facilities for the Python DSL are currently more limited in comparison to the C++ + API. For instance, we don't support single-stepping through the JIT-compiled code. And lack of exception + handling in JIT-compiled code makes it hard to debug in some cases. + +**Integration with Frameworks** + Integration with certain deep learning frameworks is in early development stages and may have + limitations. For instance, converting a framework tensor to cute.Tensor is known to have overhead + of 2us~3us per tensor as we convert from the general DLPack protocol which offers compatibility with + all frameworks. 
+ +**Hashing DSL APIs and Objects** + DSL APIs and Objects are sensitive to MLIR context, region or other contextual information which has no meaning across + different contexts. Any stateful design relying on ``__hash__`` is likely to misbehave with unexpected results. An example is + ``functools.lru_cache``, which, combined with ``@cute.jit``, may cache an MLIR object from one context and use it in another one. + + +Future Improvements +--------------------- + +The CuTe DSL development team is actively addressing these limitations. +Upcoming releases will aim to: + +- Implement support for return values from JIT compiled functions +- Improve support for built-in operators to handle more cases without dependent types +- Enhance debugging capabilities and tools +- Improve error messages with precise diagnostic information +- Extend support for additional numeric data types +- Improve performance of converting framework tensor to ``cute.Tensor`` with native support + for different frameworks +- Offer more user friendly benchmarking methodology + +Design Limitations Likely to Remain +-------------------------------------------- + +The primary objective of CuTe DSL is to provide a domain-specific language for expressing +complex CUDA kernels with optimal GPU performance, not to execute arbitrary Python code on GPU hardware. + +The following limitations will likely remain by design: + +- **Complex Data Structures as Dynamic Values**: Lists, tuples, and dictionaries will continue to function + as static containers. While they can store dynamic values, their structure (adding/removing elements) + cannot be modified during execution of JIT-compiled functions. + +- **Dependent Types**: Supporting dependent types would introduce substantial complexity and + adversely affect the performance characteristics of generated code. + +- **CuTe Layout Algebra**: We don't have plans to extend the support of CuTe Layout Algebra + under native Python Context. 
We are planning to extend support for data types and allow + JIT function to interoperate with native Python code. diff --git a/media/docs/pythonDSL/overview.rst b/media/docs/pythonDSL/overview.rst new file mode 100644 index 00000000..07abfb09 --- /dev/null +++ b/media/docs/pythonDSL/overview.rst @@ -0,0 +1,108 @@ +.. _overview: + +Overview +=========================== + +CUTLASS 4.x bridges the gap between productivity and performance for CUDA kernel development. +By providing Python-based DSLs to the powerful CUTLASS C++ template library, it enables +faster iteration, easier prototyping, and a gentler learning curve for high-performance linear +algebra on NVIDIA GPUs. + +Overall we envision CUTLASS DSLs as a family of domain-specific languages (DSLs). +With the release of 4.0, we are releasing the first of these in CuTe DSL. +This is a low level programming model that is fully consistent with CuTe C++ abstractions — exposing +core concepts such as layouts, tensors, hardware atoms, and full control over the hardware thread and data hierarchy. + +Why CUTLASS DSLs? +============================ + +While CUTLASS offers exceptional performance through its C++ template abstractions, the complexity +can present challenges for many developers. CUTLASS 4.x addresses this by: + +- **Simplifying metaprogramming**: Metaprogramming in Python is a lot more intuitive than with C++ +- **Accelerating Iteration**: Rapid prototyping with familiar Python syntax and blazing fast compile times +- **Lowering Barriers**: Reduced learning curve for GPU programming concepts and consistency between CuTe C++ and DSL +- **Maintaining Performance**: Generated code leverages optimized CUTLASS primitives + +Students can learn GPU programming concepts without the complexity of C++ templates. +Researchers and performance engineers can rapidly explore algorithms, prototype, and tune +kernels before moving to production implementations. 
+ +Key Concepts and Approach +================================ + +CUTLASS DSLs translate Python code into a custom intermediate representation (IR), +which is then Just-In-Time (JIT) compiled into optimized CUDA kernels using MLIR and `ptxas`. + +Core CuTe DSL Abstractions +----------------------------------- + +- **Layouts** – Describe how data is organized in memory and across threads. +- **Tensors** – Combine data pointers or iterators with layout metadata. +- **Atoms** – Represent fundamental hardware operations like matrix multiply-accumulate (MMA) or memory copy. +- **Tiled Operations** – Define how atoms are applied across thread blocks and warps (e.g., ``TiledMma``, ``TiledCopy``). + +For more on CuTe abstractions, refer to the `CuTe C++ library documentation `__. + +**Pythonic Kernel Expression** + +Developers express kernel logic, data movement, and computation using familiar Python syntax and control flow. + +The DSLs simplify expressing loop tiling, threading strategies, and data transformations using concise Python code. + +**JIT Compilation** + +Python kernels are compiled at runtime into CUDA device code using MLIR infrastructure and NVIDIA’s ``ptxas`` toolchain, +enabling rapid iteration and interactive debugging. + +Relationship to CUTLASS C++ +================================= + +CUTLASS DSLs are not a replacement for the CUTLASS C++ library or its 2.x and 3.x APIs. Instead, it aims to be a high-productivity kernel +authoring framework that shares all concepts with CUTLASS 3.x C++ API such as CuTe, pipelines, schedulers etc. + +- **Performance**: Generated kernels aim to match CUTLASS C++ kernels in performance; however, some performance gaps + may exist due to missing optimizations that have been added over the years to CUTLASS C++ and may be missing in the DSLs examples. +- **Library**: The CUTLASS DSLs do not currently ship with a full GEMM/Conv autotuning profiler or library interface + akin to CUTLASS C++. 
Instead, it focuses on generating and autotuning individual kernel instances (for example: via tile size exploration) and via native integration DL frameworks that support auto-tuning. + +Getting Started +================================ + +- :doc:`quick_start` – Initial setup and installation. +- :doc:`cute_dsl` – Overview of the typical development and workflow using CuTe DSL. +- :doc:`cute_dsl_api` – Refer to the full API documentation. +- :doc:`limitations` – Understand current CuTe DSL constraints and differences from C++. +- :doc:`faqs` – Common questions and known issues. + +Current Status & Roadmap +================================= + +CuTe DSL is in public beta and actively evolving. Interfaces and features are subject to +change as we improve the system. + +Upcoming Milestones +---------------------------------- + +- Public release targeted for **Summer 2025** +- Expanded support for additional data types and kernel types +- Usability improvements: better error messages, debugging tools, and streamlined APIs +- Broader integration of CUTLASS primitives and features + +For known issues and workarounds, please consult the :doc:`limitations` and :doc:`faqs`. + +Community & Feedback +================================== + +We welcome contributions and feedback from the developer community! + +You can: + +- Submit bug reports or feature requests via our `GitHub Issues page `__ +- Join the CUTLASS community on `Discord `__ to ask questions and share ideas +- Contribute examples, tutorials, or enhancements to the DSLs +- Report unclear or missing documentation +- Propose support for additional data types or kernel variants +- Help prioritize roadmap features by upvoting GitHub issues + +Thank you for helping shape the future of CUTLASS DSLs! 
\ No newline at end of file diff --git a/media/docs/pythonDSL/quick_start.rst b/media/docs/pythonDSL/quick_start.rst new file mode 100644 index 00000000..0c7fb505 --- /dev/null +++ b/media/docs/pythonDSL/quick_start.rst @@ -0,0 +1,31 @@ +.. _quick_start: + +Quick Start Guide +======================= + +The CUTLASS DSL 4.0 release currently supports **Linux** and **Python 3.12** only. To install CUTLASS DSLs (limited to CuTe DSL for now), use the following command + +Installation +----------------------- + +To install the CUTLASS DSL, run: + +.. code-block:: bash + + pip install nvidia-cutlass-dsl + +The ``nvidia-cutlass-dsl`` wheel includes everything needed to generate GPU kernels. It requires +the same NVIDIA driver version as the +`CUDA Toolkit 12.9 `_. + +To ensure compatibility with the examples and code on `GitHub `_, +use the ``requirements.txt`` file from the corresponding commit in the repository. + +Recommended Dependencies +--------------------------------- + +To run examples and begin development, we recommend installing: + +.. code-block:: bash + + pip install torch jupyter diff --git a/python/CuTeDSL/EULA.txt b/python/CuTeDSL/EULA.txt new file mode 100644 index 00000000..e7699599 --- /dev/null +++ b/python/CuTeDSL/EULA.txt @@ -0,0 +1,188 @@ +NVIDIA Software License Agreement + +IMPORTANT NOTICE – PLEASE READ AND AGREE BEFORE USING THE SOFTWARE +This software license agreement (“Agreement”) is a legal agreement between you, whether an individual or entity, (“you”) and NVIDIA Corporation (“NVIDIA”) and governs the use of the NVIDIA CUTLASS DSLs software and materials that NVIDIA delivers to you under this Agreement (“Software”). +NVIDIA and you are each a “party” and collectively the “parties.” +This Agreement can be accepted only by an adult of legal age of majority in the country in which the Software is used. 
+If you don’t have the required age or authority to accept this Agreement, or if you don’t accept all the terms and conditions of this Agreement, do not use the Software. + +1. License Grants + + 1.1. License Grant to You. The Software made available by NVIDIA to you is licensed, not sold. + Subject to the terms of this Agreement, NVIDIA grants you a limited, non-exclusive, revocable, non-transferable, and non-sublicensable (except as expressly granted in this Agreement), license to: + + a. install and use copies of the Software, + b. configure the Software using configuration files provided (if applicable), + c. modify and create derivative works of any sample or example source code NVIDIA delivers to you as part of the Software (“Derivatives”) (if applicable), and + d. distribute python files in the Software package in source format as incorporated into a software application subject to the following distribution requirements: + + i. Your application must have material additional functionality, beyond the included portions of the Software. + ii. The distributable portions of the Software shall only be accessed by your application. + iii. The following notice shall be included in modifications and derivative works of sample source code distributed: “This software contains source code provided by NVIDIA Corporation.” + iv. Unless a developer tool is identified in this Agreement as distributable, it is delivered for your internal use only. + v. The terms under which you distribute your application must be consistent with the terms of this Agreement, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. + vi. Additionally, you agree that you will protect the privacy, security and legal rights of your application users. + + The foregoing (a) through (d) are, collectively, the “Purpose”, and the developed applications are only for use in systems with NVIDIA GPUs. + + 1.2. 
License Grant to NVIDIA. Subject to the terms of this Agreement, you grant NVIDIA and its affiliates a non-exclusive, perpetual, irrevocable, sublicensable, worldwide, royalty-free, fully paid-up and transferable license, under your intellectual property rights, to publicly perform, publicly display, reproduce, use, make, have made, sell, offer for sale, distribute (through multiple tiers of distribution), import, create derivative works of and otherwise commercialize and exploit at NVIDIA’s discretion any Derivatives created by or for you. + You may, but are not required to, deliver any Derivatives to NVIDIA. + +2. License Restrictions + + Your license to use the Software and Derivatives is restricted as stated in this Section 2 (“License Restrictions”). + You will cooperate with NVIDIA and, upon NVIDIA’s written request, you will confirm in writing and provide reasonably requested information to verify your compliance with the terms of this Agreement. + You may not: + + 2.1. Use the Software or Derivatives for any purpose other than the Purpose; + + 2.2. Sell, rent, sublicense, transfer, distribute or otherwise make available to others (except authorized users as stated in Section 3 (“Authorized Users”)) any portion of the Software or Derivatives, except as expressly granted in Section 1.1 (“License Grant to You”); + + 2.3. Reverse engineer, decompile, or disassemble the Software components provided in binary form, nor attempt in any other manner to obtain source code of such Software; + + 2.4. Modify or create derivative works of the Software, except as expressly granted in Section 1.1 (“License Grant to You”); + + 2.5. Change or remove copyright or other proprietary notices in the Software; + + 2.6. Bypass, disable, or circumvent any technical limitation, encryption, security, digital rights management or authentication mechanism in the Software; + + 2.7. 
Use the Software or Derivatives in any manner that would cause them to become subject to an open source software license, subject to the terms in Section 6 (“Components Under Other Licenses”); + + 2.8. Use the Software or Derivatives in violation of any applicable law or regulation in relevant jurisdictions + + 2.9. Indicate that a product or service developed with the Software or Derivatives is sponsored or endorsed by NVIDIA; + + 2.10. Replace any NVIDIA software components in the Software that are governed by this Agreement with other software that implements NVIDIA APIs; + + 2.11. Reverse engineer, decompile or disassemble any portion of the output generated using Software elements for the purpose of translating such output artifacts to target a non-NVIDIA platform; or + +3. Authorized Users + + You may allow employees and contractors of your entity or of your subsidiary(ies), and for educational institutions also enrolled students, to internally access and use the Software as authorized by this Agreement from your secure network to perform the work authorized by this Agreement on your behalf. + You are responsible for the compliance with the terms of this Agreement by your authorized users. + Any act or omission that if committed by you would constitute a breach of this Agreement will be deemed to constitute a breach of this Agreement if committed by your authorized users. + +4. Pre-Release + + Software versions identified as alpha, beta, preview, early access or otherwise as pre-release (“Pre-Release”) may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability and reliability standards relative to NVIDIA commercial offerings. + You use Pre-Release Software at your own risk. NVIDIA did not design or test the Software for use in production or business-critical systems. + NVIDIA may choose not to make available a commercial version of Pre-Release Software. 
+ NVIDIA may also choose to abandon development and terminate the availability of Pre-Release Software at any time without liability. + +5. Updates + + NVIDIA may at any time and at its option, change, discontinue, or deprecate any part, or all, of the Software, or change or remove features or functionality, or make available patches, workarounds or other updates to the Software. + Unless the updates are provided with their separate governing terms, they are deemed part of the Software licensed to you under this Agreement, and your continued use of the Software is deemed acceptance of such changes. + +6. Components Under Other Licenses + + The Software may include or be distributed with components provided with separate legal notices or terms that accompany the components, such as open source software licenses and other license terms (“Other Licenses”). + The components are subject to the applicable Other Licenses, including any proprietary notices, disclaimers, requirements and extended use rights; + except that this Agreement will prevail regarding the use of third-party open source software, unless a third-party open source software license requires its license terms to prevail. + Open source software license means any software, data or documentation subject to any license identified as an open source license by the Open Source Initiative (http://opensource.org), Free Software Foundation (http://www.fsf.org) or other similar open source organization or listed by the Software Package Data Exchange (SPDX) Workgroup under the Linux Foundation (http://www.spdx.org). + +7. Ownership + + 7.1. NVIDIA Ownership. The Software, including all intellectual property rights, is and will remain the sole and exclusive property of NVIDIA or its licensors. + Except as expressly granted in this Agreement, (a) NVIDIA reserves all rights, interests and remedies in connection with the Software, and (b) no other license or right is granted to you by implication, estoppel or otherwise. 
+ + 7.2. Your Ownership. Subject to the rights of NVIDIA and its suppliers in the Software, which continue to be licensed as stated in this Agreement, even when incorporated in your products or services, and the extent permitted by applicable law, as between you and NVIDIA, you hold all rights, title and interest in and to your products, services and Derivatives you develop as permitted in this Agreement including their respective intellectual property rights. + +8. Feedback + + You may, but you are not obligated to, provide suggestions, requests, fixes, modifications, enhancements, or other feedback regarding the Software (collectively, “Feedback”). + Feedback, even if designated as confidential by you, will not create any confidentiality obligation for NVIDIA or its affiliates. + If you provide Feedback, you grant NVIDIA, its affiliates and its designees a non-exclusive, perpetual, irrevocable, sublicensable, worldwide, royalty-free, fully paid-up and transferable license, under your intellectual property rights, to publicly perform, publicly display, reproduce, use, make, have made, sell, offer for sale, distribute (through multiple tiers of distribution), import, create derivative works of and otherwise commercialize and exploit the Feedback at NVIDIA’s discretion. + +9. Termination + + 9.1. Termination. This Agreement will automatically terminate without notice from NVIDIA if you fail to comply with any of the terms in this Agreement or if you commence or participate in any legal proceeding against NVIDIA with respect to the Software. + Additionally, either party may terminate this Agreement at any time with thirty (30) days’ advance written notice to the other party. + + 9.2. Effect of Termination. 
Upon any expiration or termination of this Agreement, you will promptly (a) stop using and return, delete or destroy NVIDIA confidential information and all Software received under this Agreement, and (b) delete or destroy Derivatives created under this Agreement, unless an authorized NVIDIA representative provides prior written approval that you may keep a copy of the Derivatives solely for archival purposes. + Upon written request, you will certify in writing that you have complied with your obligations under this Section 9.2 (“Effect of Termination”). + + 9.3. Survival. Section 1.2 (“License Grant to NVIDIA”), Section 5 (“Updates”), Section 6 (“Components Under Other Licenses”), Section 7 (“Ownership”), Section 8 (“Feedback), Section 9.2 (“Effect of Termination”), Section 9.3 (“Survival”), Section 10 (“Disclaimer of Warranties”), Section 11 (“Limitation of Liability”), Section 12 (“Use in Mission Critical Applications”), Section 13 (“Governing Law and Jurisdiction”), Section 14 (“Indemnity”) and Section 15 (“General”) will survive any expiration or termination of this Agreement. + +10. Disclaimer of Warranties + + THE SOFTWARE IS PROVIDED BY NVIDIA AS-IS AND WITH ALL FAULTS. TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, NVIDIA DISCLAIMS ALL WARRANTIES AND REPRESENTATIONS OF ANY KIND, WHETHER + EXPRESS, IMPLIED OR STATUTORY, RELATING TO OR ARISING UNDER THIS AGREEMENT, INCLUDING, WITHOUT LIMITATION, THE WARRANTIES OF TITLE, NONINFRINGEMENT, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, USAGE OF TRADE AND COURSE OF DEALING. NVIDIA DOES NOT WARRANT OR ASSUME RESPONSIBILITY FOR THE ACCURACY OR COMPLETENESS OF ANY THIRD-PARTY INFORMATION, TEXT, GRAPHICS, LINKS CONTAINED IN THE SOFTWARE. + WITHOUT LIMITING THE FOREGOING, NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS, ANY DEFECTS OR ERRORS WILL BE CORRECTED, ANY CERTAIN CONTENT WILL BE AVAILABLE; OR THAT THE SOFTWARE IS FREE OF VIRUSES OR OTHER HARMFUL COMPONENTS. 
NO INFORMATION OR ADVICE GIVEN BY NVIDIA WILL IN ANY WAY INCREASE THE SCOPE OF ANY WARRANTY EXPRESSLY PROVIDED IN THIS AGREEMENT. + NVIDIA does not warrant or assume responsibility for the accuracy or completeness of any third-party information, text, graphics or links contained in the Software. + +11. Limitations of Liability + + 11.1. EXCLUSIONS. TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT WILL NVIDIA BE LIABLE FOR ANY (I) INDIRECT, PUNITIVE, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES, OR (ii) DAMAGES FOR (a) THE COST OF PROCURING SUBSTITUTE GOODS, OR (b) LOSS OF PROFITS, REVENUES, USE, DATA OR GOODWILL ARISING OUT OF OR RELATED TO THIS AGREEMENT, WHETHER BASED ON BREACH OF CONTRACT, TORT (INCLUDING NEGLIGENCE), STRICT LIABILITY, OR OTHERWISE, AND EVEN IF NVIDIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES AND EVEN IF A PARTY’S REMEDIES FAIL THEIR ESSENTIAL PURPOSE. + + 11.2. DAMAGES CAP. ADDITIONALLY, TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, NVIDIA’S TOTAL CUMULATIVE AGGREGATE LIABILITY FOR ANY AND ALL LIABILITIES, OBLIGATIONS OR CLAIMS ARISING OUT OF OR RELATED TO THIS AGREEMENT WILL NOT EXCEED FIVE U.S. DOLLARS (US$5). + +12. Use in Mission Critical Applications + + You acknowledge that the Software provided under this Agreement is not designed or tested by NVIDIA for use in any system or application where the use or failure of such system or application developed with NVIDIA’s Software could result in injury, death or catastrophic damage (each, a “Mission Critical Application”). + Examples of Mission Critical Applications include use in avionics, navigation, autonomous vehicle applications, AI solutions for automotive products, military, medical, life support or other mission-critical or life-critical applications. + NVIDIA will not be liable to you or any third party, in whole or in part, for any claims or damages arising from these uses. 
+ You are solely responsible for ensuring that systems and applications developed with the Software include sufficient safety and redundancy features and comply with all applicable legal and regulatory standards and requirements. + +13. Governing Law and Jurisdiction + + This Agreement will be governed in all respects by the laws of the United States and the laws of the State of Delaware, without regard to conflict of laws principles or the United Nations Convention on Contracts for the International Sale of Goods. + The state and federal courts residing in Santa Clara County, California will have exclusive jurisdiction over any dispute or claim arising out of or related to this Agreement, and the parties irrevocably consent to personal jurisdiction and venue in those courts; + except that either party may apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. + +14. Indemnity + + By using the Software you agree to defend, indemnify and hold harmless NVIDIA and its affiliates and their respective officers, directors, employees and agents from and against any claims, disputes, demands, liabilities, damages, losses, costs and expenses arising out of or in any way connected with (i) products or services that have been developed or deployed with or use the Software, or claims that they violate laws, or infringe, violate, or misappropriate any third party right; + or (ii) use of the Software in breach of the terms of this Agreement. + +15. General + + 15.1. Independent Contractors. + The parties are independent contractors, and this Agreement does not create a joint venture, partnership, agency, or other form of business association between the parties. + Neither party will have the power to bind the other party or incur any obligation on its behalf without the other party’s prior written consent. + Nothing in this Agreement prevents either party from participating in similar arrangements with third parties. + + 15.2. 
No Assignment. + NVIDIA may assign, delegate or transfer its rights or obligations under this Agreement by any means or operation of law. + You may not, without NVIDIA’s prior written consent, assign, delegate or transfer any of your rights or obligations under this Agreement by any means or operation of law, and any attempt to do so is null and void. + + 15.3. No Waiver. + No failure or delay by a party to enforce any term or obligation of this Agreement will operate as a waiver by that party, or prevent the enforcement of such term or obligation later. + + 15.4. Trade Compliance. + You agree to comply with all applicable export, import, trade and economic sanctions laws and regulations, as amended, including without limitation U.S. Export Administration Regulations and Office of Foreign Assets Control regulations. + You confirm (a) your understanding that export or reexport of certain NVIDIA products or technologies may require a license or other approval from appropriate authorities and (b) that you will not export or reexport any products or technology, directly or indirectly, without first obtaining any required license or other approval from appropriate authorities, (i) to any countries that are subject to any U.S. or local export restrictions (currently including, but not necessarily limited to, Belarus, Cuba, Iran, North Korea, Russia, Syria, the Region of Crimea, Donetsk People’s Republic Region and Luhansk People’s Republic Region); + (ii) to any end-user who you know or have reason to know will utilize them in the design, development or production of nuclear, chemical or biological weapons, missiles, rocket systems, unmanned air vehicles capable of a maximum range of at least 300 kilometers, regardless of payload, or intended for military end-use, or any weapons of mass destruction; + (iii) to any end-user who has been prohibited from participating in the U.S. 
or local export transactions by any governing authority; + or (iv) to any known military or military-intelligence end-user or for any known military or military-intelligence end-use in accordance with U.S. trade compliance laws and regulations. + + 15.5. Government Rights. + The Software, documentation and technology (“Protected Items”) are “Commercial products” as this term is defined at 48 C.F.R. + 2.101, consisting of “commercial computer software” and “commercial computer software documentation” as such terms are used in, respectively, 48 C.F.R. + 12.212 and 48 C.F.R. 227.7202 & 252.227-7014(a)(1). Before any Protected Items are supplied to the U.S. Government, you will (i) inform the U.S. Government in writing that the Protected Items are and must be treated as commercial computer software and commercial computer software documentation developed at private expense; + (ii) inform the U.S. Government that the Protected Items are provided subject to the terms of the Agreement; + and (iii) mark the Protected Items as commercial computer software and commercial computer software documentation developed at private expense. + In no event will you permit the U.S. Government to acquire rights in Protected Items beyond those specified in 48 C.F.R. + 52.227-19(b)(1)-(2) or 252.227-7013(c) except as expressly approved by NVIDIA in writing. + + 15.6. Notices. + Please direct your legal notices or other correspondence to legalnotices@nvidia.com with a copy mailed to NVIDIA Corporation, 2788 San Tomas Expressway, Santa Clara, California 95051, United States of America, Attention: Legal Department. + If NVIDIA needs to contact you, you consent to receive the notices by email and agree that such notices will satisfy any legal communication requirements. + + 15.7. Severability. 
+ If a court of competent jurisdiction rules that a provision of this Agreement is unenforceable, that provision will be deemed modified to the extent necessary to make it enforceable and the remainder of this Agreement will continue in full force and effect. + + 15.8. Amendment. + Any amendment to this Agreement must be in writing and signed by authorized representatives of both parties. + + 15.9. Construction. + The headings in the Agreement are included solely for convenience and are not intended to affect the meaning or interpretation of the Agreement. + As required by the context of the Agreement, the singular of a term includes the plural and vice versa. + + 15.10. Force Majeure. + Neither party will be liable during any period where an event or circumstance prevents or delays that party from performing its obligations under this Agreement and that event or circumstance: (i) is not within the reasonable control of that party and is not the result of that party’s negligence, and (ii) cannot be overcome or avoided by that party using reasonably diligent efforts. + + 15.11. Entire Agreement. + Regarding the subject matter of this Agreement, the parties agree that (a) this Agreement constitutes the entire and exclusive agreement between the parties and supersedes all prior and contemporaneous communications and (b) any additional or different terms or conditions, whether contained in purchase orders, order acknowledgments, invoices or otherwise, will not be binding and are null and void. + +(v. May 8, 2025) diff --git a/python/CuTeDSL/base_dsl/__init__.py b/python/CuTeDSL/base_dsl/__init__.py new file mode 100644 index 00000000..cbb617dc --- /dev/null +++ b/python/CuTeDSL/base_dsl/__init__.py @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +# Local module imports +from .dsl import * +from .runtime import * +from ._mlir_helpers import lru_cache_ir +from .env_manager import get_str_env_var, detect_gpu_arch + diff --git a/python/CuTeDSL/base_dsl/_mlir_helpers/__init__.py b/python/CuTeDSL/base_dsl/_mlir_helpers/__init__.py new file mode 100644 index 00000000..607a24d0 --- /dev/null +++ b/python/CuTeDSL/base_dsl/_mlir_helpers/__init__.py @@ -0,0 +1,27 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides MLIR Dialect helper functions +""" + +from . import arith +from .lru_cache_ir import lru_cache_ir + + +__all__ = ["arith", "lru_cache_ir"] + +try: + from . import gpu + + __all__.extend(["gpu"]) +except ImportError: + pass diff --git a/python/CuTeDSL/base_dsl/_mlir_helpers/arith.py b/python/CuTeDSL/base_dsl/_mlir_helpers/arith.py new file mode 100644 index 00000000..d515113b --- /dev/null +++ b/python/CuTeDSL/base_dsl/_mlir_helpers/arith.py @@ -0,0 +1,691 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides MLIR Arith Dialect helper functions +""" + +import array +import numpy as np + +from ..common import * +from ..._mlir import ir # type: ignore +from ..._mlir.extras import types as T # type: ignore +from ..._mlir.dialects import arith, nvgpu, math, builtin # type: ignore + +from .lru_cache_ir import lru_cache_ir + +# ============================================================================= +# Arith Dialect Helper functions +# ============================================================================= + + +def recast_type(src_type, res_elem_type) -> ir.Type: + if isinstance(src_type, T.VectorType): + if src_type.scalable: + res_type = T.vector( + *src_type.shape, + res_elem_type, + scalable=src_type.scalable, + scalable_dims=src_type.scalable_dims, + ) + else: + res_type = T.vector(*src_type.shape, res_elem_type) + elif isinstance(src_type, T.RankedTensorType): + res_type = T.RankedTensorType.get( + element_type=res_elem_type, shape=src_type.shape, strides=src_type.strides + ) + elif isinstance(src_type, T.UnrankedTensorType): + res_type = T.UnrankedTensorType.get(element_type=res_elem_type) + elif isinstance(src_type, T.MemRefType): + res_type = T.MemRefType.get( + element_type=res_elem_type, shape=src_type.shape, strides=src_type.strides + ) + else: + res_type = res_elem_type + return res_type + + +def is_scalar(ty) -> bool: + return not isinstance( + ty, (T.VectorType, T.RankedTensorType, T.UnrankedTensorType, T.MemRefType) + ) + + +def element_type(ty) -> ir.Type: + if not is_scalar(ty): + return 
ty.element_type + else: + return ty + + +def is_narrow_precision(ty) -> bool: + narrow_types = { + T.f8E8M0FNU(), + T.f8E4M3FN(), + T.f8E4M3(), + T.f8E5M2(), + T.f8E4M3B11FNUZ(), + T.f4E2M1FN(), + T.f6E3M2FN(), + T.f6E2M3FN(), + } + return ty in narrow_types + + +def is_float_type(ty) -> bool: + return ( + arith._is_float_type(ty) + # TODO-upstream: prediction is not correct. Patch here and fix in upstream later + or is_narrow_precision(ty) + or ty in (T.bf16(), T.tf32()) + ) + + +def truncf_to_narrow(res_ty, src, loc, ip): + res_elem_ty = element_type(res_ty) + if res_elem_ty == T.f8E8M0FNU(): + rnd = nvgpu.RoundingMode.RP + else: + rnd = nvgpu.RoundingMode.RN + return nvgpu.cvt_fptrunc(res_ty, src, rnd=rnd, loc=loc, ip=ip) + + +def extf_from_narrow(res_ty, src, loc, ip): + src_elem_ty = element_type(src.type) + + # When source type is E8M0, temporary element type has to be bf16 + tmp_elem_ty = T.bf16() if src_elem_ty == T.f8E8M0FNU() else T.f16() + tmp_ty = recast_type(src.type, tmp_elem_ty) + + # narrow -> bf16/f16 -> target type + tmp = nvgpu.cvt_fpext(tmp_ty, src, loc=loc, ip=ip) + return arith.extf(res_ty, tmp, loc=loc, ip=ip) + + +def bitcast(src, res_elem_type, *, loc=None, ip=None): + res_type = recast_type(src.type, res_elem_type) + return arith.bitcast(res_type, src, loc=loc, ip=ip) + + +def cvtf(src, res_elem_type, *, loc=None, ip=None): + src_elem_type = element_type(src.type) + + if res_elem_type == src_elem_type: + return src + + res_type = recast_type(src.type, res_elem_type) + + # Treat TF32 as F32 and use i32 as intermediate data + # TODO-upstream: update arith to support tf32 <-> f32 conversion + if src_elem_type == T.tf32(): + # tf32 -> i32 + tmp_type = recast_type(src.type, T.i32()) + src = builtin.unrealized_conversion_cast([tmp_type], [src], loc=loc, ip=ip) + # i32 -> f32 + src = bitcast(src, T.f32(), loc=loc, ip=ip) + # f32 -> X with `cvtf` recursively + return cvtf(src, res_elem_type, loc=loc, ip=ip) + + if res_elem_type == T.tf32(): + # X 
-> f32 with `cvtf`` recursively + tmp = cvtf(src, T.f32(), loc=loc, ip=ip) + # f32 -> i32 + tmp = bitcast(tmp, T.i32(), loc=loc, ip=ip) + # i32 -> tf32 + return builtin.unrealized_conversion_cast([res_type], [tmp], loc=loc, ip=ip) + + if res_elem_type.width > src_elem_type.width: + if is_narrow_precision(src_elem_type): + return extf_from_narrow(res_type, src, loc, ip) + else: + return arith.extf(res_type, src, loc=loc, ip=ip) + else: + tmp_mlir_type = recast_type(src.type, T.f32()) + + # f16 -- extf -> f32 -- truncf -> bf16 + # TODO-upstream: update arith to support bf16 <-> f16 conversion? + if (src_elem_type == T.f16() and res_elem_type == T.bf16()) or ( + src_elem_type == T.bf16() and res_elem_type == T.f16() + ): + tmp = arith.extf(tmp_mlir_type, src, loc=loc, ip=ip) + return arith.truncf(res_type, tmp, loc=loc, ip=ip) + + # {f8, f6, f4} -> f16, f32, ... + elif is_narrow_precision(res_elem_type): + return truncf_to_narrow(res_type, src, loc, ip) + else: + return arith.truncf(res_type, src, loc=loc, ip=ip) + + +def fptoi(src, signed: Union[bool, None], res_elem_type, *, loc=None, ip=None): + res_type = recast_type(src.type, res_elem_type) + # TODO-upstream: update arith to support this kind of conversion + if element_type(src.type) in (T.tf32(), T.bf16()): + src = cvtf(src, T.f32(), loc=loc, ip=ip) + + if signed: + return arith.fptosi(res_type, src, loc=loc, ip=ip) + else: + return arith.fptoui(res_type, src, loc=loc, ip=ip) + + +def itofp(src, signed: Union[bool, None], res_elem_type, *, loc=None, ip=None): + res_type = recast_type(src.type, res_elem_type) + + orig_res_type = res_type + # TODO-upstream: update arith to support this kind of conversion + if res_elem_type in (T.tf32(), T.bf16()): + res_type = recast_type(src.type, T.f32()) + + if signed and element_type(src.type).width > 1: + res = arith.sitofp(res_type, src, loc=loc, ip=ip) + else: + res = arith.uitofp(res_type, src, loc=loc, ip=ip) + + if orig_res_type == res_type: + return res + + return 
cvtf(res, element_type(orig_res_type), loc=loc, ip=ip) + + +def int_to_int(a, dst_elem_type, *, loc=None, ip=None): + src_signed = a.signed + dst_signed = dst_elem_type.signed + src_width = element_type(a.type).width + dst_width = dst_elem_type.width + + dst_mlir_type = recast_type(a.type, dst_elem_type.mlir_type) + + if dst_width == src_width: + return a + elif src_signed and not dst_signed: + # Signed -> Unsigned + if dst_width > src_width: + return arith.extui(dst_mlir_type, a, loc=loc, ip=ip) + else: + return arith.trunci(dst_mlir_type, a, loc=loc, ip=ip) + elif src_signed == dst_signed: + # Same signedness + if dst_width > src_width: + if src_signed and src_width > 1: + return arith.extsi(dst_mlir_type, a, loc=loc, ip=ip) + else: + return arith.extui(dst_mlir_type, a, loc=loc, ip=ip) + else: + return arith.trunci(dst_mlir_type, a, loc=loc, ip=ip) + else: + # Unsigned -> Signed + if dst_width > src_width: + return arith.extui(dst_mlir_type, a, loc=loc, ip=ip) + else: + # For truncation from unsigned to signed, we need to handle overflow + # First truncate to the target width + trunc = arith.trunci(dst_mlir_type, a, loc=loc, ip=ip) + # Then reinterpret as signed + if dst_signed: + return arith.bitcast(dst_mlir_type, trunc, loc=loc, ip=ip) + return trunc + + +# ============================================================================= +# Arith Ops Emitter Helpers +# - assuming type of lhs and rhs match each other +# - op name matches python module operator +# ============================================================================= + + +def _cast(res_elem_ty, src, is_signed=None, *, loc=None, ip=None): + """ + This function provides simplified interface to upstream op builder + arith.truncf(T.vector(shape, new_type), src) + + is simplified as because it's element-wise op which can't change shape + arith.truncf(new_type, src) + """ + if isinstance(src, ir.Value): + src_ty = src.type + else: + src_ty = type(src).mlir_type + src = src.ir_value() + + 
src_elem_ty = element_type(src_ty) + + if src_elem_ty == res_elem_ty: + return src + elif is_float_type(src_elem_ty) and is_float_type(res_elem_ty): + # float-to-float + return cvtf(src, res_elem_ty, loc=loc, ip=ip) + elif arith._is_integer_like_type(src_elem_ty) and arith._is_integer_like_type( + res_elem_ty + ): + if src_elem_ty.width >= res_elem_ty.width: + cast_op = arith.trunci + else: + if is_signed: + cast_op = arith.extsi + else: + cast_op = arith.extui + + res_ty = recast_type(src_ty, res_elem_ty) + return cast_op(res_ty, src, loc=loc, ip=ip) + elif is_float_type(src_elem_ty) and arith._is_integer_like_type(res_elem_ty): + return fptoi(src, is_signed, res_elem_ty, loc=loc, ip=ip) + elif arith._is_integer_like_type(src_elem_ty) and is_float_type(res_elem_ty): + return itofp(src, is_signed, res_elem_ty, loc=loc, ip=ip) + else: + raise DSLRuntimeError( + f"cast from {src_elem_ty} to {res_elem_ty} is not supported" + ) + + +@lru_cache_ir() +def const(value, ty=None, *, loc=None, ip=None): + """ + Generates dynamic expression for constant values. 
+ """ + from ..typing import Numeric, NumericMeta + from ..dsl import is_dynamic_expression, _numpy_type_to_mlir_type + + if isinstance(value, Numeric): + value = value.value + + # Early return + if is_dynamic_expression(value) and ( + value.type.isinstance(value.type) or T.bool().isinstance(value.type) + ): + return value + + # Assume type + if ty is None: + if isinstance(value, float): + ty = T.f32() + elif isinstance(value, bool): + ty = T.bool() + elif isinstance(value, int): + ty = T.i32() + elif isinstance(value, np.ndarray): + ty = T.vector(*value.shape, _numpy_type_to_mlir_type(value.dtype)) + value = array.array(value.dtype.kind, value.flatten().tolist()) + else: + raise DSLNotImplemented(f"{type(value)} is not supported") + elif isinstance(ty, NumericMeta): + ty = ty.mlir_type + elif isinstance(ty, ir.Type): + if ir.RankedTensorType.isinstance(ty) or ir.VectorType.isinstance(ty): + elem_ty = ty.element_type + if isinstance(elem_ty, ir.IntegerType): + attr = ir.IntegerAttr.get(elem_ty, value) + else: + attr = ir.FloatAttr.get(elem_ty, value) + value = ir.DenseElementsAttr.get_splat(ty, attr) + elif arith._is_float_type(ty) and isinstance(value, (bool, int)): + value = float(value) + elif arith._is_integer_like_type(ty) and isinstance(value, float): + value = int(value) + else: + raise DSLNotImplemented(f"type {ty} is not supported") + + return arith.constant(ty, value, loc=loc, ip=ip) + + +def _dispatch_to_rhs_r_op(op): + """Decorator that dispatches to the right-hand-side's reverse operation. + + If the other operand is not an ArithValue or is a subclass (more specific) + of ArithValue, this allows proper method resolution for binary operations. 
+ """ + + def wrapper(self, other, **kwargs): + if not isinstance(other, ArithValue): + if not isinstance(other, (int, float, bool)): + # allows to call other.__rmul__ + return NotImplemented + + return op(self, other, **kwargs) + + return wrapper + + +def _binary_op(op): + """ + Decorator to check if the 'other' argument is an ArithValue. + If not, returns NotImplemented. + """ + + def wrapper(self, other, **kwargs): + # When reach this point, `self` must be cast to base `ArithValue` type + if isinstance(other, (int, float, bool)): + other = const(other, self.type).with_signedness(self.signed) + + # Call the original function + # If sub-class doesn't implement overloaded arithmetic, cast to base class + return op(self, other, **kwargs) + + return wrapper + + +# Operator overloading +@ir.register_value_caster(ir.Float4E2M1FNType.static_typeid) +@ir.register_value_caster(ir.Float6E2M3FNType.static_typeid) +@ir.register_value_caster(ir.Float6E3M2FNType.static_typeid) +@ir.register_value_caster(ir.Float8E4M3FNType.static_typeid) +@ir.register_value_caster(ir.Float8E4M3B11FNUZType.static_typeid) +@ir.register_value_caster(ir.Float8E5M2Type.static_typeid) +@ir.register_value_caster(ir.Float8E4M3Type.static_typeid) +@ir.register_value_caster(ir.Float8E8M0FNUType.static_typeid) +@ir.register_value_caster(ir.BF16Type.static_typeid) +@ir.register_value_caster(ir.F16Type.static_typeid) +@ir.register_value_caster(ir.FloatTF32Type.static_typeid) +@ir.register_value_caster(ir.F32Type.static_typeid) +@ir.register_value_caster(ir.F64Type.static_typeid) +@ir.register_value_caster(ir.IntegerType.static_typeid) +@ir.register_value_caster(ir.VectorType.static_typeid) +@ir.register_value_caster(ir.RankedTensorType.static_typeid) +class ArithValue(ir.Value): + """Overloads operators for MLIR's Arith dialects binary operations.""" + + def __init__(self, v, signed: Union[bool, None] = None): + if isinstance(v, int): + v = arith.constant(self.type, v) + super().__init__(v) + + elem_ty = 
element_type(self.type) + self.is_float = arith._is_float_type(elem_ty) + # arith dialect consider `1` in `i1` as `-1`, treat it as unsigned for DSL + self.signed = signed and elem_ty.width > 1 + + def with_signedness(self, signed: Union[bool, None]): + return type(self)(self, signed) + + def __neg__(self, *, loc=None, ip=None): + if self.type == T.bool(): + raise TypeError( + "Negation, the operator `-` is not supported for boolean type" + ) + + if self.is_float: + return arith.negf(self, loc=loc, ip=ip) + else: + c0 = arith.constant(self.type, 0, loc=loc, ip=ip) + return arith.subi(c0, self, loc=loc, ip=ip) + + @_binary_op + def __pow__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float and other.is_float: + return math.powf(self, other, loc=loc, ip=ip) + elif self.is_float and not other.is_float: + return math.fpowi(self, other, loc=loc, ip=ip) + elif not self.is_float and other.is_float: + lhs = itofp(self, self.signed, T.f32(), loc=loc, ip=ip) + rhs = cvtf(other, T.f32(), loc=loc, ip=ip) + return math.powf(lhs, rhs, loc=loc, ip=ip) + elif not self.is_float and not other.is_float: + return math.ipowi(self, other, loc=loc, ip=ip) + else: + raise DSLNotImplemented(f"Unsupported '{self} ** {other}'") + + @_binary_op + def __rpow__(self, other, *, loc=None, ip=None) -> "ArithValue": + return other.__pow__(self, loc=loc, ip=ip) + + # arith operators + + @_dispatch_to_rhs_r_op + @_binary_op + def __add__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + return arith.addf(self, other, loc=loc, ip=ip) + else: + return arith.addi(self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __sub__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + return arith.subf(self, other, loc=loc, ip=ip) + else: + return arith.subi(self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __mul__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + return 
arith.mulf(self, other, loc=loc, ip=ip) + else: + return arith.muli(self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __truediv__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + return arith.divf(self, other, loc=loc, ip=ip) + else: + lhs = itofp(self, self.signed, T.f32(), loc=loc, ip=ip) + rhs = itofp(other, other.signed, T.f32(), loc=loc, ip=ip) + return arith.divf(lhs, rhs, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __floordiv__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + q = arith.divf(self, other, loc=loc, ip=ip) + return math.floor(q, loc=loc, ip=ip) + elif self.signed: + return arith.floordivsi(self, other, loc=loc, ip=ip) + else: + return arith.divui(self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __mod__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + return arith.remf(self, other, loc=loc, ip=ip) + elif self.signed: + return arith.remsi(self, other, loc=loc, ip=ip) + else: + return arith.remui(self, other, loc=loc, ip=ip) + + @_binary_op + def __radd__(self, other, *, loc=None, ip=None) -> "ArithValue": + return other.__add__(self, loc=loc, ip=ip) + + @_binary_op + def __rsub__(self, other, *, loc=None, ip=None) -> "ArithValue": + return other.__sub__(self, loc=loc, ip=ip) + + @_binary_op + def __rmul__(self, other, *, loc=None, ip=None) -> "ArithValue": + return other.__mul__(self, loc=loc, ip=ip) + + @_binary_op + def __rtruediv__(self, other, *, loc=None, ip=None) -> "ArithValue": + return other.__truediv__(self, loc=loc, ip=ip) + + @_binary_op + def __rfloordiv__(self, other, *, loc=None, ip=None) -> "ArithValue": + return other.__floordiv__(self, loc=loc, ip=ip) + + @_binary_op + def __rmod__(self, other, *, loc=None, ip=None) -> "ArithValue": + return other.__mod__(self, loc=loc, ip=ip) + + # Comparison operators (comparison doesn't have right-hand-side variants) + @_dispatch_to_rhs_r_op + 
@_binary_op + def __lt__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + return arith.cmpf(arith.CmpFPredicate.OLT, self, other, loc=loc, ip=ip) + elif self.signed: + return arith.cmpi(arith.CmpIPredicate.slt, self, other, loc=loc, ip=ip) + else: + return arith.cmpi(arith.CmpIPredicate.ult, self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __le__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + return arith.cmpf(arith.CmpFPredicate.OLE, self, other, loc=loc, ip=ip) + elif self.signed: + return arith.cmpi(arith.CmpIPredicate.sle, self, other, loc=loc, ip=ip) + else: + return arith.cmpi(arith.CmpIPredicate.ule, self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __eq__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + return arith.cmpf(arith.CmpFPredicate.OEQ, self, other, loc=loc, ip=ip) + else: + return arith.cmpi(arith.CmpIPredicate.eq, self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __ne__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + # In Python, bool(float("nan")) is True, so use unordered comparison here + return arith.cmpf(arith.CmpFPredicate.UNE, self, other, loc=loc, ip=ip) + else: + return arith.cmpi(arith.CmpIPredicate.ne, self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __gt__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + return arith.cmpf(arith.CmpFPredicate.OGT, self, other, loc=loc, ip=ip) + elif self.signed: + return arith.cmpi(arith.CmpIPredicate.sgt, self, other, loc=loc, ip=ip) + else: + return arith.cmpi(arith.CmpIPredicate.ugt, self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __ge__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + return arith.cmpf(arith.CmpFPredicate.OGE, self, other, loc=loc, ip=ip) + elif self.signed: + return 
arith.cmpi(arith.CmpIPredicate.sge, self, other, loc=loc, ip=ip) + else: + return arith.cmpi(arith.CmpIPredicate.uge, self, other, loc=loc, ip=ip) + + # Unary operators + def __invert__(self, *, loc=None, ip=None) -> "ArithValue": + return arith.xori(self, arith.const(self.type, -1)) + + # Bitwise operations + @_dispatch_to_rhs_r_op + @_binary_op + def __and__(self, other, *, loc=None, ip=None) -> "ArithValue": + return arith.andi(self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __or__(self, other, *, loc=None, ip=None) -> "ArithValue": + return arith.ori(self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __xor__(self, other, *, loc=None, ip=None) -> "ArithValue": + return arith.xori(self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __rshift__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.signed: + return arith.shrsi(self, other, loc=loc, ip=ip) + else: + return arith.shrui(self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __lshift__(self, other, *, loc=None, ip=None) -> "ArithValue": + return arith.shli(self, other, loc=loc, ip=ip) + + @_binary_op + def __rand__(self, other, *, loc=None, ip=None) -> "ArithValue": + return arith.andi(other, self, loc=loc, ip=ip) + + @_binary_op + def __ror__(self, other, *, loc=None, ip=None) -> "ArithValue": + return arith.ori(other, self, loc=loc, ip=ip) + + @_binary_op + def __rxor__(self, other, *, loc=None, ip=None) -> "ArithValue": + return arith.xori(other, self, loc=loc, ip=ip) + + @_binary_op + def __rrshift__(self, other, *, loc=None, ip=None) -> "ArithValue": + return other.__rshift__(self, loc=loc, ip=ip) + + @_binary_op + def __rlshift__(self, other, *, loc=None, ip=None) -> "ArithValue": + return other.__lshift__(self, loc=loc, ip=ip) + + def __hash__(self): + return super().__hash__() + + def __str__(self): + return super().__str__().replace(ir.Value.__name__, ArithValue.__name__) + + def 
__repr__(self): + return self.__str__() + + +def _min(lhs, rhs, *, loc=None, ip=None): + """ + This function provides a unified interface for building arith min + + Assuming the operands have the same type + """ + from ..dsl import is_dynamic_expression + + if not is_dynamic_expression(lhs): + if not is_dynamic_expression(rhs): + return min(lhs, rhs) + else: + lhs = arith.constant(rhs.type, lhs, loc=loc, ip=ip) + else: + if not is_dynamic_expression(rhs): + rhs = arith.constant(lhs.type, rhs, loc=loc, ip=ip) + + if arith._is_integer_like_type(lhs.type): + if lhs.signed: + return arith.minsi(lhs, rhs, loc=loc, ip=ip) + else: + return arith.minui(lhs, rhs, loc=loc, ip=ip) + else: + return arith.minimumf(lhs, rhs, loc=loc, ip=ip) + + +def _max(lhs, rhs, *, loc=None, ip=None): + """ + This function provides a unified interface for building arith max + + Assuming the operands have the same type + """ + from ..dsl import is_dynamic_expression + + if not is_dynamic_expression(lhs): + if not is_dynamic_expression(rhs): + return max(lhs, rhs) + else: + lhs = arith.constant(rhs.type, lhs, loc=loc, ip=ip) + else: + if not is_dynamic_expression(rhs): + rhs = arith.constant(lhs.type, rhs, loc=loc, ip=ip) + + if arith._is_integer_like_type(lhs.type): + if lhs.signed: + return arith.maxsi(lhs, rhs, loc=loc, ip=ip) + else: + return arith.maxui(lhs, rhs, loc=loc, ip=ip) + else: + return arith.maximumf(lhs, rhs, loc=loc, ip=ip) diff --git a/python/CuTeDSL/base_dsl/_mlir_helpers/gpu.py b/python/CuTeDSL/base_dsl/_mlir_helpers/gpu.py new file mode 100644 index 00000000..a0b0d050 --- /dev/null +++ b/python/CuTeDSL/base_dsl/_mlir_helpers/gpu.py @@ -0,0 +1,64 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides MLIR GPU Dialect helper functions +""" + + +from ..._mlir import ir +from ..._mlir.dialects import gpu, arith, scf +from ..._mlir.extras import types as T + +from ..common import * + +# ============================================================================= +# GPU Dialect Helper functions +# ============================================================================= + + +def create_async_token(): + token_ty = gpu.AsyncTokenType.get() + token = gpu.wait(token_ty, []) + return token + + +def printf(fmt, *args, threadNumber=-1): + """Generate gpu.printf OP predicated on threadNumber""" + type_formats = [] + for arg in args: + ty_format = None + if ir.IndexType.isinstance(arg.type): + ty_format = "%llu" + if ir.IntegerType.isinstance(arg.type): + width = ir.IntegerType(arg.type).width + if width == 64: + ty_format = "%llu" + elif width == 32: + ty_format = "%d" + elif width == 1: + ty_format = "%i" + if ir.F32Type.isinstance(arg.type): + ty_format = "%f" + if ty_format is None: + raise DSLNotImplemented(arg.type) + type_formats.append(ty_format) + if threadNumber == -1: + gpu.printf(fmt.format(*type_formats) + "\n", args) + if threadNumber != -1: + tidx = gpu.thread_id(gpu.Dimension.x) + predicate = arith.cmpi( + arith.CmpIPredicate.eq, tidx, arith.constant(_T.index(), threadNumber) + ) + if_op = scf.IfOp(predicate) + with ir.InsertionPoint(if_op.then_block): + gpu.printf(fmt.format(*type_formats) + "\n", args) + scf.yield_([]) diff --git a/python/CuTeDSL/base_dsl/_mlir_helpers/lru_cache_ir.py 
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.

"""
This module provides @lru_cache_ir
It extends functools.lru_cache with IR Context awareness.

Example usage:

from lru_cache_ir import lru_cache_ir

@lru_cache_ir(maxsize=128, typed=False)
def make_layout(...):
    ...

(DOC FIX: the previous example passed `ir` as the first argument, but
`lru_cache_ir` takes no `ir` parameter — the IR context is derived
internally via `get_ir_context`.)
"""


from functools import lru_cache, wraps

from ..._mlir import ir  # type: ignore


def get_ir_context(func):
    """
    Return the IR context under which `func` is being invoked.
    Currently the context includes MLIRContext and InsertionPoint.

    Returns None when no context/insertion point is active.
    NOTE(review): `func` is currently unused; kept for interface stability.
    """
    try:
        if ir:
            return (ir.Context.current, ir.InsertionPoint.current)
        else:
            return None
    except ValueError:
        # `.current` raises ValueError when no context/insertion point is set.
        return None


def lru_cache_ir(maxsize=128, typed=True):
    """
    Applies an LRU cache to a given function, with awareness of IR context:
    the active (MLIRContext, InsertionPoint) pair is folded into the cache
    key so results from one IR context are never replayed in another.

    Usage mirrors functools.lru_cache.

    :param maxsize: Max cache size, same as functools.lru_cache
    :param typed: Whether params are type-sensitive, default to True as IR is type-sensitive
    """

    def decorator(func):
        # Use functools.lru_cache with a custom wrapper so the IR context
        # participates in key generation.
        @lru_cache(maxsize=maxsize, typed=typed)
        def cached_func(context, *args, **kwargs):
            return func(*args, **kwargs)

        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                # Call the cached function with the context
                return cached_func(get_ir_context(func), *args, **kwargs)
            except (RuntimeError, TypeError):
                # e.g. unhashable arguments — fall back to the uncached call.
                return func(*args, **kwargs)

        # Expose cache-related methods for introspection
        wrapper.cache_clear = cached_func.cache_clear
        wrapper.cache_info = cached_func.cache_info
        return wrapper

    return decorator

# --- file: python/CuTeDSL/base_dsl/_mlir_helpers/op.py ---
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.
+ +""" +This module provides MLIR's OP helper functions +""" + + +import inspect +from functools import wraps + +from ..._mlir import ir + + +def dsl_user_op(opFunc): + @wraps(opFunc) + def wrapper(*args, **kwargs): + loc = kwargs.pop("loc", None) + if loc is None: + frame = inspect.currentframe().f_back + file_loc = ir.Location.file(frame.f_code.co_filename, frame.f_lineno, 0) + loc = ir.Location.name(frame.f_code.co_name, childLoc=file_loc) + res_or_list = opFunc(*args, **kwargs, loc=loc) + return res_or_list + + return wrapper diff --git a/python/CuTeDSL/base_dsl/ast_helpers.py b/python/CuTeDSL/base_dsl/ast_helpers.py new file mode 100644 index 00000000..e8796cff --- /dev/null +++ b/python/CuTeDSL/base_dsl/ast_helpers.py @@ -0,0 +1,584 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides helper functions that are generated by the preprocessor. +The preprocessor read through python's ast and changes the input code. +""" + +from typing import Callable, Iterator, Optional, overload + +from .utils.logger import log +from .common import * + +from ._mlir_helpers.arith import ArithValue + +class Executor: + """ + The Executor class handles dynamic and compile-time (constexpr) execution + of "for" loops and "if-else-elif" statements. + + Methods: + set_functions: Assigns the functions for checking loop bounds and + conditional evaluation. 
+ + for_dynamic: Generates MLIR for OP + for_constexpr: Executes a for loop at JIT compile-time + for_execute: Decides whether to execute the loop at compile-time or generate MLIR for OP based on the provided bounds. + + if_dynamic: Generates MLIR if OP + if_constexpr: Executes a if at JIT compile-time by python interpreter + if_execute: Decides whether to execute the if statement at compile-time or generate MLIR if OP based on the predicate. + """ + + def __init__(self): + self._is_dynamic_expression = None + self._loop_execute_range_dynamic = None + self._if_dynamic = None + self._while_dynamic = None + + def set_functions( + self, + is_dynamic_expression: Callable, + loop_execute_range_dynamic: Callable, + if_dynamic: Callable, + while_dynamic: Callable, + ): + self._is_dynamic_expression = is_dynamic_expression + self._loop_execute_range_dynamic = loop_execute_range_dynamic + self._if_dynamic = if_dynamic + self._while_dynamic = while_dynamic + + @staticmethod + def convert_to_list(x): + """This function is used to convert x to a list. + If x is None, return an empty list. + If x is not a list, return a list containing x. + Otherwise, return x itself. + """ + if x is None: + return [] + if not isinstance(x, list): + return [x] + return x + + @staticmethod + def converge_ret_val(res): + """This function is used to converge res (the return value) of the function. + If res is None, return None. + If res is a list and has only one element, return the element. + Otherwise, return res itself. 
+ """ + if res is None: + return res + elif isinstance(res, list) and len(res) == 1: + return res[0] + return res + + def for_dynamic( + self, + func: Callable, + start, + stop, + step, + used_args: list, + iter_args: list, + iter_arg_names: list, + unroll=bool, + unroll_full=int, + ): + log().info("start [%s] stop [%s] step [%s]", start, stop, step) + return self._loop_execute_range_dynamic( + func, + start, + stop, + step, + used_args, + iter_args, + iter_arg_names, + unroll, + unroll_full, + ) + + @staticmethod + def for_constexpr( + func: Callable, + start: int, + stop: int, + step: int, + used_args: list, + iter_args: list, + ): + log().info("start [%s] stop [%s] step [%s]", start, stop, step) + loop_results = iter_args + log().debug("iter_args [%s]", iter_args) + for i in range(start, stop, step): + log().debug("i [%s] iter_args [%s]", i, iter_args) + loop_results = func(i, *used_args, *loop_results) + log().debug("loop_results [%s]", loop_results) + if loop_results is None: + loop_results = [] + if not isinstance(loop_results, list): + loop_results = [loop_results] + + log().debug("done loop_results [%s]", loop_results) + return Executor.converge_ret_val(loop_results) + + def for_execute( + self, + func, + start, + stop, + step, + used_args=[], + iter_args=[], + iter_arg_names=[], + unroll=-1, + unroll_full=False, + is_range_constexpr=None, + ): + assert ( + self._loop_execute_range_dynamic and self._is_dynamic_expression + ), "Functions must be set before execution." 
+ log().debug("start [%s] stop [%s] step [%s]", start, stop, step) + any_dynamic_expression = ( + self._is_dynamic_expression(start) + or self._is_dynamic_expression(stop) + or self._is_dynamic_expression(step) + ) + + if is_range_constexpr is None: + if not any_dynamic_expression: + return self.for_constexpr(func, start, stop, step, used_args, iter_args) + else: + return self.for_dynamic( + func, + start, + stop, + step, + used_args, + iter_args, + iter_arg_names, + unroll, + unroll_full, + ) + + # Ensure bounds are compile-time constants for constexpr execution + if is_range_constexpr: + if any_dynamic_expression: + raise DSLRuntimeError( + "Loop bounds must be constexpr (compile-time constants)" + ) + return self.for_constexpr(func, start, stop, step, used_args, iter_args) + + # MLIR generation + return self.for_dynamic( + func, + start, + stop, + step, + used_args, + iter_args, + iter_arg_names, + unroll, + unroll_full, + ) + + def if_dynamic( + self, + pred, + then_block: Callable, + else_block: Optional[Callable] = None, + used_args=[], + yield_args=[], + yield_arg_names=[], + ): + return self._if_dynamic( + pred, then_block, else_block, used_args, yield_args, yield_arg_names + ) + + @staticmethod + def if_constexpr( + pred, + then_block: Callable, + else_block: Optional[Callable] = None, + used_args=[], + yield_args=[], + ): + if pred: + log().debug(" running then block [%s]", yield_args) + res = then_block(*used_args, *yield_args) + log().debug("result [%s]", res) + return Executor.converge_ret_val(res) + elif else_block is not None: + log().debug("running else [%s]", yield_args) + res = else_block(*used_args, *yield_args) + log().debug("result [%s]", res) + return Executor.converge_ret_val(res) + + def if_execute( + self, + pred, + then_block: Callable, + else_block: Optional[Callable] = None, + used_args=[], + yield_args=[], + yield_arg_names=[], + if_constexpr=None, + ): + assert ( + self._if_dynamic and self._is_dynamic_expression + ), "Functions must 
be set before execution." + + is_if_constexpr = not self._is_dynamic_expression(pred) + if if_constexpr is None: + if is_if_constexpr: + return self.if_constexpr( + pred, then_block, else_block, used_args, yield_args + ) + else: + return self.if_dynamic( + pred, then_block, else_block, used_args, yield_args, yield_arg_names + ) + + # Ensure bounds are compile-time constants for constexpr execution + if if_constexpr: + if not is_if_constexpr: + raise DSLRuntimeError( + "If predicate must be constexpr (compile-time constants)" + ) + return self.if_constexpr( + pred, then_block, else_block, used_args, yield_args + ) + + # MLIR generation + return self.if_dynamic( + pred, then_block, else_block, used_args, yield_args, yield_arg_names + ) + + def while_dynamic( + self, + while_before_block: Callable, + while_after_block: Callable, + used_args=[], + yield_args=[], + yield_arg_names=[], + ): + return self._while_dynamic( + while_before_block, + while_after_block, + used_args, + yield_args, + yield_arg_names, + ) + + @staticmethod + def while_constexpr( + while_before_block, + while_after_block, + used_args=[], + yield_args=[], + ): + log().debug( + "while_constexpr begin %s", while_before_block.__qualname__ + ) + cond, loop_results = while_before_block(*used_args, *yield_args) + while cond: + loop_results = Executor.convert_to_list(loop_results) + log().debug( + "calling while_after [%s], [%s]", + used_args, + loop_results, + ) + loop_results = while_after_block(*used_args, *loop_results) + log().debug( + "while after [%s]", loop_results + ) + loop_results = Executor.convert_to_list(loop_results) + log().debug( + "calling while_before [%s], [%s]", + used_args, + loop_results, + ) + cond, loop_results = while_before_block(*used_args, *loop_results) + log().debug( + "while_before cond, results [%s], [%s]", + cond, + loop_results, + ) + + log().debug( + "while_constexpr results %s", loop_results + ) + return Executor.converge_ret_val(loop_results) + + def while_execute( + 
self, + pred, + while_before_block: Callable, + while_after_block: Callable, + used_args=[], + yield_args=[], + yield_arg_names=[], + while_constexpr=None, + ): + assert ( + self._while_dynamic and self._is_dynamic_expression + ), "Functions must be set before execution." + + is_while_constexpr = not self._is_dynamic_expression(pred) + + # Ensure bounds are compile-time constants for constexpr execution + if while_constexpr: + if not is_while_constexpr: + raise DSLRuntimeError( + "While predicate must be constexpr (compile-time constants)" + ) + return self.while_constexpr( + while_before_block, while_after_block, used_args, yield_args + ) + + # MLIR generation + return self.while_dynamic( + while_before_block, + while_after_block, + used_args, + yield_args, + yield_arg_names, + ) + + +# ============================================================================= +# Decorator +# ============================================================================= + +executor = Executor() + + +def loop_selector( + start, + stop, + step, + used_args=[], + iter_args=[], + iter_arg_names=[], + unroll=-1, + unroll_full=False, + constexpr=None, +): + log().info( + "start [%s] stop [%s] step [%s] used_args [%s] iter_args [%s] unroll [%s] unroll_full [%s] constexpr [%s]", + start, + stop, + step, + used_args, + iter_args, + unroll, + unroll_full, + constexpr, + ) + from .typing import Integer, Numeric + + def _maybe_upcast(value): + if isinstance(value, Integer): + value = value.ir_value() + + return value + + start = _maybe_upcast(start) + stop = _maybe_upcast(stop) + step = _maybe_upcast(step) + + def ir_loop(func): + return executor.for_execute( + func, + start, + stop, + step, + used_args, + iter_args, + iter_arg_names, + unroll, + unroll_full, + constexpr, + ) + + return ir_loop + + +def if_selector(pred, used_args=[], yield_args=[]): + log().info("pred [%s] used_args [%s] yield_args [%s]", pred, used_args, yield_args) + # Handle Numeric types here? 
+ + from .typing import Numeric + + if isinstance(pred, Numeric): + pred = pred.value + + def ir_loop(func): + return func(pred, *used_args, *yield_args) + + return ir_loop + + +def while_selector(pred, used_args=[], yield_args=[]): + def ir_while_loop(func): + return func(pred, *used_args, *yield_args) + + return ir_while_loop + + +def while_executor( + pred, + while_before_block: Callable, + while_after_block: Callable, + used_args=[], + yield_args=[], + yield_arg_names=[], + constexpr=None, +): + return executor.while_execute( + pred, + while_before_block, + while_after_block, + used_args, + yield_args, + yield_arg_names, + constexpr, + ) + + +def if_executor( + pred, + then_block: Callable, + else_block: Optional[Callable] = None, + used_args=[], + yield_args=[], + yield_arg_names=[], + constexpr=None, +): + return executor.if_execute( + pred, then_block, else_block, used_args, yield_args, yield_arg_names, constexpr + ) + + +# ============================================================================= +# Range +# ============================================================================= + + +class range_dynamic: + @overload + def __new__(cls, stop, unroll=0, unroll_full=False): + pass + + @overload + def __new__(cls, start, stop, step, unroll=0, unroll_full=False): + pass + + def __new__(cls, *args, **kwargs): + raise DSLRuntimeError("range_dynamic should be always preprocessed to IR") + + +class range_constexpr: + def __init__(self, *args): + if len(args) == 1: + self.start = 0 + self.stop = args[0] + self.step = 1 + elif len(args) == 2: + self.start, self.stop = args + self.step = 1 + elif len(args) == 3: + self.start, self.stop, self.step = args + else: + raise DSLRuntimeError( + "range_constexpr supports up to 3 arguments (start, stop, step)" + ) + # Ensure the arguments are compile-time constants (if required) + for arg_name, arg_value in [ + ("step", self.step), + ("start", self.start), + ("stop", self.stop), + ]: + if 
executor._is_dynamic_expression(arg_value): + raise DSLRuntimeError( + f"`range_constexpr` requires `constexpr` (non-IR Values) for all arguments, " + f"but `{arg_name}` is not. If the arguments are dynamic, use `range`; the DSL " + f"will handle them during runtime. ", + suggestion="Use `range` instead of `range_constexpr`.", + ) + + def __iter__(self) -> Iterator[int]: + current = self.start + while current < self.stop: + yield current + current += self.step + + +# ============================================================================= +# If expressions +# ============================================================================= + + +def const_expr(expression): + if executor._is_dynamic_expression(expression): + raise DSLRuntimeError( + f"The function `const_expr({expression})` received a dynamic expression (non compile-time constant).", + context={ + "const_expr": "Accepts only constexpr (compile-time constant)", + "If your expression depends on dynamic values": "Avoid marking it as `const_expr()`", + "If the expression could be either dynamic or constexpr": "Omit explicit `const_expr()` marker; the DSL will infer the correct handling automatically", + }, + ) + return expression + + +def dynamic_expr(expression): + raise DSLRuntimeError("dynamic_expr should be always preprocessed to IR") + + +# ============================================================================= +# Assertion & casting +# ============================================================================= + + +def assert_executor(test, msg=None): + from .typing import Numeric + + fail = False + # Implicit convert dynamic expression to bool is not allowed + # So here explicitly do a None check + if test is not None and executor._is_dynamic_expression(test): + if isinstance(test, Numeric): + try: + test = test.to(bool) + except: + fail = True + else: + fail = True + + if not fail: + assert test, msg + else: + raise DSLRuntimeError( + "Only constexpr (Python Value) is allowed here, but 
got non-constexpr (IR Values) expression.", + suggestion = "Please replace with runtime assert." + ) + + +def bool_cast(value): + if executor._is_dynamic_expression(value): + raise DSLRuntimeError( + "Only constexpr (Python Value) is allowed here, but got non-constexpr (IR Values) expression.", + suggestion = "Please explicitly convert to boolean with expressions like comparision." + ) + return bool(value) diff --git a/python/CuTeDSL/base_dsl/ast_preprocessor.py b/python/CuTeDSL/base_dsl/ast_preprocessor.py new file mode 100644 index 00000000..e165c1db --- /dev/null +++ b/python/CuTeDSL/base_dsl/ast_preprocessor.py @@ -0,0 +1,1459 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module defines the `DSLPreprocessor` class, which acts as a Python preprocessor. +It uses Python's AST and rewrites specific Python statements such as `for` and `if-else`. + +The preprocessor operates on the following constructs: + - `for` loops: + - Rewrites `for` loops with the `@loop_selector` decorator. + - Supports `range`, `range_dynamic`, and `range_constexpr` for loop iteration. + - `if-elif-else` statements: + - Rewrites conditional statements with the `@if_selector` decorator. + - Supports `dynamic_expr` and `const_expr` in the condition expressions. + +Additionally, both `for` loops and `if-else` statements require `yield` +operation generation. The preprocessor handles this by: + - Using a `ScopeManager` to track symbols across different scopes during AST traversal. 
+ - Identifying read-only, read-write, and active variables for DSL constructs. + - Generating `yield` operations for symbols that are classified as read-write or write. + +It is designed to be generic and can handle `for` and `if` constructs from other dialects. +In such cases, the user's DSL should implement `@loop_selector` and `@if_selector` +to generate dialect-specific operations for `for` and `if` statements. +""" + +import ast +import importlib +import inspect +import textwrap +from dataclasses import dataclass +from typing import List, Set, Dict, Any, Callable, Optional +from types import ModuleType + +from .common import * +from .utils.logger import log + + +class OrderedSet: + """ + A deterministic set implementation for ordered operations. + """ + + def __init__(self, iterable=None): + self._dict = dict.fromkeys(iterable or []) + + def add(self, item): + self._dict[item] = None + + def __iter__(self): + return iter(self._dict) + + def __and__(self, other): + return OrderedSet(key for key in self._dict if key in other) + + def __or__(self, other): + new_dict = self._dict.copy() + new_dict.update(dict.fromkeys(other)) + return OrderedSet(new_dict) + + def __sub__(self, other): + return OrderedSet(key for key in self._dict if key not in other) + + +@dataclass +class ScopeManager: + """ + Manages symbol scopes during AST traversal. + Manage nested scopes during transformations. + """ + + scopes: List[Set[str]] + current_scope: Set[str] + + @classmethod + def create(cls) -> "ScopeManager": + return cls([], set()) + + def enter_scope(self) -> None: + self.scopes.append(self.current_scope.copy()) + + def exit_scope(self) -> None: + self.current_scope = self.scopes.pop() + + def add_to_scope(self, name: str) -> None: + self.current_scope.add(name) + + def get_active_symbols(self) -> Set[str]: + return set(self.current_scope) + + +class DSLPreprocessor(ast.NodeTransformer): + """ + A preprocessor for transforming Python ASTs. 
It supports: + + - Rewriting `for` loops with the `@loop_selector` decorator. + - Rewriting `if-elif-else` statements with the `@if_selector` decorator. + - Generating `yield` operations for read-write or write symbols. + """ + + DECORATOR_FOR_STATEMENT = "loop_selector" + DECORATOR_IF_STATEMENT = "if_selector" + DECORATOR_WHILE_STATEMENT = "while_selector" + IF_EXECUTOR = "if_executor" + WHILE_EXECUTOR = "while_executor" + ASSERT_EXECUTOR = "assert_executor" + BOOL_CAST = "bool_cast" + IMPLICIT_DOWNCAST_NUMERIC_TYPE = "implicitDowncastNumericType" + SUPPORTED_FOR_RANGE_STATEMENTS = {"range", "range_dynamic", "range_constexpr"} + + def __init__(self): + super().__init__() + self.counter = 0 # Unique function names for multiple loops + self.scope_manager = ScopeManager.create() + self.processed_functions = set() + self.function_counter = 0 + self.function_name = "" + self.class_name = None + self.file_name = "" + self.function_depth = 0 + self.local_closures = set() + self.function_globals = None + + def _get_module_imports(self, decorated_func): + """Extract imports from the module containing the decorated function""" + # Get the module containing the decorated function + module = inspect.getmodule(decorated_func) + if module is None: + return {} + + # Get the module source code + try: + source = inspect.getsource(module) + module_ast = ast.parse(source) + + # Extract imports from the full module + imports = {} + for node in ast.walk(module_ast): + if isinstance(node, ast.Import): + for name in node.names: + imports[name.name] = name.asname if name.asname else name.name + elif isinstance(node, ast.ImportFrom): + module_name = node.module + for name in node.names: + if name.name == "*": + # Handle wildcard imports + try: + imported_module = importlib.import_module(module_name) + imports[module_name] = imported_module + except ImportError: + pass + else: + full_name = f"{module_name}.{name.name}" + imports[full_name] = ( + name.asname if name.asname else name.name + 
) + return imports + except (IOError, TypeError): + return {} + + def exec(self, function_name, original_function, code_object, exec_globals): + # Get imports from the original module + module_imports = self._get_module_imports(original_function) + + # Import all required modules + for module_path, alias in module_imports.items(): + try: + if "." in module_path: + base_module, attribute = module_path.rsplit(".", 1) + module = importlib.import_module(base_module) + if hasattr(module, attribute): + attr = getattr(module, attribute) + exec_globals[alias] = attr + else: + path = importlib.import_module(module_path) + exec_globals[alias] = path + except (ImportError, AttributeError) as e: + raise ImportError(f"Failed to import {module_path}: {str(e)}") + + # Execute the transformed code + log().info( + "ASTPreprocessor Executing transformed code for function [%s]", + function_name, + ) + exec(code_object, exec_globals) + return exec_globals.get(function_name) + + @staticmethod + def print_ast(transformed_tree=None): + print("#", "-" * 40, "Transformed AST", "-" * 40) + unparsed_code = ast.unparse(transformed_tree) + print(unparsed_code) + print("#", "-" * 40, "End Transformed AST", "-" * 40) + + def make_func_param_name(self, base_name, used_names): + """Generate a unique parameter name that doesn't collide with existing names.""" + if base_name not in used_names: + return base_name + + i = 0 + while f"{base_name}_{i}" in used_names: + i += 1 + return f"{base_name}_{i}" + + def transform_function(self, func_name, function_pointer): + """ + Transforms a function. + """ + # Skip if the function has already been processed + if function_pointer in self.processed_functions: + log().info( + "ASTPreprocessor Skipping already processed function [%s]", func_name + ) + return [] + + # Step 1. 
Parse the given function + file_name = inspect.getsourcefile(function_pointer) + lines, start_line = inspect.getsourcelines(function_pointer) + dedented_source = textwrap.dedent("".join(lines)) + tree = ast.parse(dedented_source, filename=file_name) + # Bump the line numbers so they match the real source file + ast.increment_lineno(tree, start_line - 1) + + # Step 1.2 Check the decorator + if not self.check_decorator(tree.body[0]): + log().info( + "[%s] - Skipping function due to missing decorator", + func_name, + ) + return [] + + self.processed_functions.add(function_pointer) + log().info("ASTPreprocessor Transforming function [%s]", func_name) + + # Step 2. Transform the function + transformed_tree = self.visit(tree) + ast.fix_missing_locations(transformed_tree) + combined_body = transformed_tree.body + + # Step 3. Return the transformed tree + return combined_body + + def check_early_exit(self, tree): + """ + Checks if a given region or scope in the provided Python code has early exits. 
+ """ + + class EarlyExitChecker(ast.NodeVisitor): + def __init__(self): + self.has_early_exit = False + self.early_exit_node = None + self.early_exit_type = None + + def visit_Return(self, node): + self.has_early_exit = True + self.early_exit_node = node + self.early_exit_type = "return" + + def visit_Break(self, node): + self.has_early_exit = True + self.early_exit_node = node + self.early_exit_type = "break" + + def visit_Continue(self, node): + self.has_early_exit = True + self.early_exit_node = node + self.early_exit_type = "continue" + + def visit_Raise(self, node): + self.has_early_exit = True + self.early_exit_node = node + self.early_exit_type = "raise" + + checker = EarlyExitChecker() + checker.visit(tree) + if not checker.has_early_exit: + return + raise DSLAstPreprocessorError( + message=f"Early exit ({checker.early_exit_type}) is not allowed in `{self.function_name}`" + + (f" in `{self.class_name}`" if self.class_name else ""), + filename=self.file_name, + snippet=ast.unparse(tree), + suggestion=( + "If predicates are constant expression, write like " + "`if const_expr(...)` or `for ... in range_constexpr`. " + "In that case, early exit will be executed by Python " + "interpreter, so it's supported." + ), + ) + + def is_node_constexpr(self, node) -> bool: + """ + Determines if the node is a constexpr. + Supported nodes are if, for, while statements. 
+ """ + if isinstance(node, ast.If) or isinstance(node, ast.While): + if isinstance(node.test, ast.Call): + func = node.test.func + + if isinstance(func, ast.Attribute) and func.attr == "const_expr": + return True + + elif isinstance(func, ast.Name) and func.id == "const_expr": + return True + elif isinstance(node, ast.For): + if isinstance(node.iter, ast.Call): + func = node.iter.func + if isinstance(func, ast.Attribute) and func.attr == "range_constexpr": + return True + + elif isinstance(func, ast.Name) and func.id == "range_constexpr": + return True + return False + + def transform(self, original_function, exec_globals): + """ + Transforms the provided function using the preprocessor. + """ + self.file_name = inspect.getsourcefile(original_function) + self.function_globals = exec_globals + transformed_tree = self.transform_function( + original_function.__name__, original_function + ) + unified_tree = ast.Module(body=transformed_tree, type_ignores=[]) + unified_tree = ast.fix_missing_locations(unified_tree) + + return unified_tree + + def analyze_region_variables(self, node: Union[ast.For, ast.If], active_symbols): + """ + Analyze variables in different code regions to identify read-only, write-only, + and active variables for DSL constructs. + """ + + # we need orderedset to keep the insertion order the same. otherwise generated IR is different each time + read_args = OrderedSet() + write_args = OrderedSet() + local_closure = self.local_closures + file_name = self.file_name + region_node = node + + class RegionAnalyzer(ast.NodeVisitor): + + def visit_Name(self, node): + """ + Mark every load as read, and every store as write. 
+ """ + if isinstance(node.ctx, ast.Load): + read_args.add(node.id) + elif isinstance(node.ctx, ast.Store): + write_args.add(node.id) + + @staticmethod + def get_call_base(func_node): + if isinstance(func_node, ast.Attribute): + # If the .value is another Attribute, keep digging + if isinstance(func_node.value, ast.Attribute): + return RegionAnalyzer.get_call_base(func_node.value) + # If the .value is a Name, that's our base + elif isinstance(func_node.value, ast.Name): + return func_node.value.id + else: + # Could be something else (lambda, call, etc.) + return None + elif isinstance(func_node, ast.Name): + return None + return None + + @staticmethod + def get_function_name(func_node: ast.Call): + if isinstance(func_node.func, ast.Name): + function_name = func_node.func.id + # Check if it's a method or attribute call + elif isinstance(func_node.func, ast.Attribute): + function_name = func_node.func.attr + else: + function_name = None + return function_name + + def visit_Call(self, node): + base_name = RegionAnalyzer.get_call_base(node.func) + + if isinstance(node.func, ast.Name): + func_name = node.func.id + if func_name in local_closure: + raise DSLAstPreprocessorError( + f"Function `{func_name}` is a closure and is not supported in for/if statements", + filename=file_name, + snippet=ast.unparse(region_node), + ) + + # Classes are mutable by default. Mark them as write. If they are + # dataclass(frozen=True), treat them as read in runtime. + if base_name is not None and base_name not in ("self"): + write_args.add(base_name) + + self.generic_visit(node) + + analyzer = RegionAnalyzer() + analyzer.visit(ast.Module(body=node)) + + # Argument can be Load and Store. We should just mark it as Store. 
+ read_args = read_args - write_args + + used_args = read_args & active_symbols + iter_args = write_args & active_symbols + flattend_args = used_args | iter_args + + return list(used_args), list(iter_args), list(flattend_args) + + def extract_range_args(self, iter_node): + args = iter_node.args + if len(args) == 1: + return ast.Constant(value=0), self.visit(args[0]), ast.Constant(value=1) + elif len(args) == 2: + return self.visit(args[0]), self.visit(args[1]), ast.Constant(value=1) + elif len(args) == 3: + return self.visit(args[0]), self.visit(args[1]), self.visit(args[2]) + else: + raise DSLAstPreprocessorError( + "Unsupported number of arguments in range", filename=self.file_name + ) + + def extract_unroll_args(self, iter_node): + keywords = {kw.arg: kw.value for kw in iter_node.keywords} + return ( + keywords.get("unroll", ast.Constant(value=-1)), + keywords.get("unroll_full", ast.Constant(value=False)), + ) + + def create_loop_function( + self, + func_name, + node, + start, + stop, + step, + unroll, + unroll_full, + used_args, + iter_args, + flattened_args, + is_loop_constexpr, + ): + """ + Creates a loop body function with the `loop_selector` decorator. 
+ """ + + func_args = [ast.arg(arg=node.target.id, annotation=None)] + func_args += [ast.arg(arg=var, annotation=None) for var in flattened_args] + + # Create the loop body + transformed_body = [] + for stmt in node.body: + transformed_stmt = self.visit(stmt) # Recursively visit inner statements + if isinstance(transformed_stmt, list): + transformed_body.extend(transformed_stmt) + else: + transformed_body.append(transformed_stmt) + + # Handle the return for a single iterated argument correctly + if len(iter_args) == 0: + transformed_body.append(ast.Return()) + else: + transformed_body.append( + ast.Return( + value=ast.List( + elts=[ast.Name(id=var, ctx=ast.Load()) for var in iter_args], + ctx=ast.Load(), + ) + ) + ) + + # Define the decorator with parameters + decorator = ast.copy_location( + ast.Call( + func=ast.Name(id=self.DECORATOR_FOR_STATEMENT, ctx=ast.Load()), + args=[start, stop, step], + keywords=[ + ast.keyword(arg="unroll", value=unroll), + ast.keyword(arg="unroll_full", value=unroll_full), + ast.keyword(arg="constexpr", value=is_loop_constexpr), + ast.keyword( + arg="used_args", + value=ast.List( + elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in used_args], + ctx=ast.Load(), + ), + ), + ast.keyword( + arg="iter_args", + value=ast.List( + elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in iter_args], + ctx=ast.Load(), + ), + ), + ast.keyword( + arg="iter_arg_names", + value=ast.List( + elts=[ast.Constant(value=arg) for arg in iter_args], + ctx=ast.Load(), + ), + ), + ], + ), + node, + ) + + return ast.copy_location( + ast.FunctionDef( + name=func_name, + args=ast.arguments( + posonlyargs=[], + args=func_args, + kwonlyargs=[], + kw_defaults=[], + defaults=[], + ), + body=transformed_body, + decorator_list=[decorator], + ), + node, + ) + + def create_loop_call(self, func_name, iter_args): + """ + Assigns the returned value from the loop function directly (without a tuple unpacking). 
+ """ + if len(iter_args) == 0: + return ast.Expr(value=ast.Name(id=func_name, ctx=ast.Load())) + elif len(iter_args) == 1: + return ast.Assign( + targets=[ast.Name(id=iter_args[0], ctx=ast.Store())], + value=ast.Name(id=func_name, ctx=ast.Load()), + ) + else: + return ast.Assign( + targets=[ + ast.Tuple( + elts=[ast.Name(id=var, ctx=ast.Store()) for var in iter_args], + ctx=ast.Store(), + ) + ], + value=ast.Name(id=func_name, ctx=ast.Load()), + ) + + def is_supported_range_call(self, node): + return ( + isinstance(node, ast.For) + and isinstance(node.iter, ast.Call) + and ( + ( + isinstance(node.iter.func, ast.Name) + and node.iter.func.id in self.SUPPORTED_FOR_RANGE_STATEMENTS + ) + or ( + isinstance(node.iter.func, ast.Attribute) + and node.iter.func.attr in self.SUPPORTED_FOR_RANGE_STATEMENTS + ) + ) + ) + + def get_loop_constexpr(self, node): + if not self.is_supported_range_call(node): + return None + + # Map function names to their constexpr values + constexpr_map = {"range": None, "range_dynamic": False, "range_constexpr": True} + range_name = ( + node.iter.func.id + if isinstance(node.iter.func, ast.Name) + else node.iter.func.attr + ) + return ast.Constant(value=constexpr_map[range_name]) + + def transform_for_loop(self, node, active_symbols): + # Constexpr doesn't get preprocessed + if self.is_node_constexpr(node): + self.generic_visit(node) + return node + + # We only support range, range_constexpr, range_dynamic + if self.is_supported_range_call(node): + constexpr_val = self.get_loop_constexpr(node) + # Check for early exit and raise exception + self.check_early_exit(node) + start, stop, step = self.extract_range_args(node.iter) + unroll, unroll_full = self.extract_unroll_args(node.iter) + used_args, iter_args, flat_args = self.analyze_region_variables( + node, active_symbols + ) + + func_name = f"loop_body_{self.counter}" + self.counter += 1 + + func_def = self.create_loop_function( + func_name, + node, + start, + stop, + step, + unroll, + 
unroll_full, + used_args, + iter_args, + flat_args, + constexpr_val, + ) + + assign = ast.copy_location( + self.create_loop_call(func_name, iter_args), node + ) + + # This should work fine as it modifies the AST structure + return [func_def, assign] + + self.generic_visit(node) + + return node + + def visit_BoolOp(self, node): + # Visit child nodes first + self.generic_visit(node) + + # It is necessary to expand short circuit evaluation explicit here + # Although we do not support inline if-else for IR generation, this is actually evaluated in Python + # So it's fine here + # Transform "and" to "and_" + if isinstance(node.op, ast.And): + # Create an if-else statement in AST form + # if type(lhs) == bool and lhs == False: + # return lhs + # else + # return and_(lhs, rhs) + short_circuit_value = ast.Constant(value=False) + helper_func = ast.Name(id="and_", ctx=ast.Load()) + # Transform "or" to "or_" + elif isinstance(node.op, ast.Or): + # Create an if-else statement in AST form + # if type(lhs) == bool and lhs == True: + # return lhs + # else + # return or_(lhs, rhs) + short_circuit_value = ast.Constant(value=True) + helper_func = ast.Name(id="or_", ctx=ast.Load()) + else: + # BoolOp should be either And or Or + raise DSLAstPreprocessorError( + f"Unsupported boolean operation: {node.op}", + filename=self.file_name, + snippet=ast.unparse(node), + ) + + test = ast.BoolOp( + op=ast.And(), + values=[ + ast.Compare( + left=ast.Call( + func=ast.Name(id="type", ctx=ast.Load()), + args=[node.values[0]], + keywords=[], + ), + ops=[ast.Eq()], + comparators=[ast.Name(id="bool", ctx=ast.Load())], + ), + ast.Compare( + left=node.values[0], + ops=[ast.Eq()], + comparators=[short_circuit_value], + ), + ], + ) + return ast.copy_location( + ast.IfExp( + test=test, + body=node.values[0], + orelse=ast.Call( + func=helper_func, + args=node.values, + keywords=[], + ), + ), + node, + ) + + def visit_UnaryOp(self, node): + # Visit child nodes first + self.generic_visit(node) + + # 
Transform "not" to "~" as we overload __invert__ + if isinstance(node.op, ast.Not): + func_name = ast.Name(id="not_", ctx=ast.Load()) + return ast.copy_location( + ast.Call(func=func_name, args=[node.operand], keywords=[]), node + ) + + return node + + def visit_For(self, node): + active_symbols = self.scope_manager.get_active_symbols() + self.scope_manager.enter_scope() + + if isinstance(node.target, ast.Name): + self.scope_manager.add_to_scope(node.target.id) + + new_for_node = self.transform_for_loop(node, active_symbols) + self.scope_manager.exit_scope() + return new_for_node + + def visit_Name(self, node): + self.generic_visit(node) + return node + + def visit_Assert(self, node): + test = self.visit(node.test) + + args = [ast.keyword(arg="test", value=test)] + if node.msg: + msg = self.visit(node.msg) + args.append(ast.keyword(arg="msg", value=msg)) + + # Rewrite to assert_executor(test, msg) + new_node = ast.Expr( + ast.Call( + func=ast.Name(id=self.ASSERT_EXECUTOR, ctx=ast.Load()), + args=[], + keywords=args, + ) + ) + + # Propagate line number from original node to new node + ast.copy_location(new_node, node) + return new_node + + def visit_Call(self, node): + func = node.func + self.generic_visit(node) + + # Check if the function is 'bool' + if isinstance(func, ast.Name) and func.id == "bool": + return ast.copy_location( + ast.Call( + func=ast.Name(id=self.BOOL_CAST, ctx=ast.Load()), + args=[node.args[0]], + keywords=[], + ), + node, + ) + elif isinstance(func, ast.Attribute) and isinstance(func.value, ast.Name): + def create_downcast_call(arg): + return ast.copy_location( + ast.Call( + func=ast.Name( + id=self.IMPLICIT_DOWNCAST_NUMERIC_TYPE, ctx=ast.Load() + ), + args=[arg], + keywords=[], + ), + arg, + ) + module = self.function_globals.get(func.value.id) + if isinstance(module, ModuleType) and module.__package__.endswith( + "._mlir.dialects" + ): + # Check if argument is Numeric, if so, call ir_value() + args = [] + for arg in node.args: + 
args.append(create_downcast_call(arg)) + kwargs = [] + for kwarg in node.keywords: + kwargs.append( + ast.copy_location( + ast.keyword( + arg=kwarg.arg, + value=create_downcast_call(kwarg.value), + ), + kwarg, + ) + ) + return ast.copy_location( + ast.Call(func=func, args=args, keywords=kwargs), node + ) + + return node + + def visit_ClassDef(self, node): + self.class_name = node.name + self.generic_visit(node) + self.class_name = None + return node + + def _visit_target(self, target): + if isinstance(target, ast.Name): + self.scope_manager.add_to_scope(target.id) + elif isinstance(target, ast.Tuple): + for t in target.elts: + if isinstance(t, ast.Name): + self.scope_manager.add_to_scope(t.id) + + def visit_Assign(self, node): + for target in node.targets: + self._visit_target(target) + self.generic_visit(node) + return node + + def visit_AugAssign(self, node): + self._visit_target(node.target) + self.generic_visit(node) + return node + + def check_decorator(self, node: ast.AST) -> bool: + """ + Check if the function has the correct decorator for preprocessing. + """ + if not isinstance(node, ast.FunctionDef): + return False + decorator_list = node.decorator_list + if len(decorator_list) == 0: + return False + + for d in decorator_list: + if isinstance(d, ast.Call): + if isinstance(d.func, ast.Attribute): + if d.func.attr in ["jit", "kernel"]: + if d.keywords == []: + return True + for keyword in d.keywords: + if keyword.arg == "preprocess": + try: + if isinstance(keyword.value, ast.Constant): + return keyword.value.value + else: + return ast.literal_eval(keyword.value) + except: + pass + + elif isinstance(d, ast.Attribute): + if d.attr in ["jit", "kernel"]: + return True + + return False + + def remove_dsl_decorator(self, decorator_list): + """ + Remove .jit and .kernel decorators + The decorator can be in two forms: + - @jit(...) 
+ - @jit + """ + new_decorator_list = [] + decorator_names = ["jit", "kernel"] + for d in decorator_list: + is_jit_or_kernel = False + if isinstance(d, ast.Call): + if isinstance(d.func, ast.Attribute): + if d.func.attr in decorator_names: + is_jit_or_kernel = True + elif isinstance(d, ast.Attribute): + if d.attr in decorator_names: + is_jit_or_kernel = True + + if not is_jit_or_kernel: + new_decorator_list.append(d) + return new_decorator_list + + def visit_FunctionDef(self, node): + self.scope_manager.enter_scope() + self.function_counter += 1 + self.function_name = node.name + if self.function_depth > 0: + self.local_closures.add(node.name) + + self.function_depth += 1 + + # Add function name and arguments + self.scope_manager.add_to_scope(node.name) + for arg in node.args.args: + self.scope_manager.add_to_scope(arg.arg) + + self.generic_visit(node) + self.scope_manager.exit_scope() + + self.function_depth -= 1 + + # Remove .jit and .kernel decorators + node.decorator_list = self.remove_dsl_decorator(node.decorator_list) + return node + + def visit_With(self, node): + self.scope_manager.enter_scope() + + for item in node.items: + if isinstance(item.optional_vars, ast.Name): + self.scope_manager.add_to_scope(item.optional_vars.id) + self.generic_visit(node) + + self.scope_manager.exit_scope() + return node + + def visit_While(self, node): + active_symbols = self.scope_manager.get_active_symbols() + self.scope_manager.enter_scope() + + # Constexpr doesn't get preprocessed + if self.is_node_constexpr(node): + self.generic_visit(node) + self.scope_manager.exit_scope() + return node + + # Check for early exit and raise exception + self.check_early_exit(node) + + used_args, yield_args, flat_args = self.analyze_region_variables( + node, active_symbols + ) + func_name = f"while_region_{self.counter}" + self.counter += 1 + + func_def = self.create_while_function( + func_name, node, used_args, yield_args, flat_args + ) + assign = 
ast.copy_location(self.create_loop_call(func_name, yield_args), node) + + self.scope_manager.exit_scope() + return [func_def, assign] + + def visit_Try(self, node): + self.scope_manager.enter_scope() + self.generic_visit(node) + self.scope_manager.exit_scope() + return node + + def visit_ExceptHandler(self, node): + self.scope_manager.enter_scope() + if node.name: # Exception variable + self.scope_manager.add_to_scope(node.name) + self.generic_visit(node) + self.scope_manager.exit_scope() + return node + + def create_if_call(self, func_name, yield_args, flat_args): + """Creates the assignment statement for the if function call""" + if not yield_args: + return ast.Expr(value=ast.Name(id=func_name, ctx=ast.Load())) + elif len(yield_args) == 1: + return ast.Assign( + targets=[ast.Name(id=yield_args[0], ctx=ast.Store())], + value=ast.Name(id=func_name, ctx=ast.Load()), + ) + else: + return ast.Assign( + targets=[ + ast.Tuple( + elts=[ast.Name(id=var, ctx=ast.Store()) for var in yield_args], + ctx=ast.Store(), + ) + ], + value=ast.Name(id=func_name, ctx=ast.Load()), + ) + + def visit_IfExp(self, node): + """ + Visits an inline if-else expression (ternary operator). + This is the Python equivalent of `x if condition else y`. 
+ """ + # Check if the condition is constexpr + constexpr_val, test = self.is_constexpr(node) + + node.test = test + node.body = self.visit(node.body) + node.orelse = self.visit(node.orelse) + + # If it's a constexpr node, we don't need to transform it + if constexpr_val.value is True: + return node + + # Emit + # node if type(pred) == bool else select_(pred, body, orelse) + # so if pred is a python bool, use python to short-circuit and avoid emit arith.select + return ast.copy_location( + ast.IfExp( + test=ast.Compare( + left=ast.Call( + func=ast.Name(id="type", ctx=ast.Load()), + args=[node.test], + keywords=[], + ), + ops=[ast.Eq()], + comparators=[ast.Name(id="bool", ctx=ast.Load())], + ), + body=node, # Original ternary expression + orelse=ast.Call( + func=ast.Name(id="select_", ctx=ast.Load()), + args=[ + node.test, + node.body, + node.orelse, + ], + keywords=[], + ), + ), + node, + ) + + def visit_If(self, node): + active_symbols = self.scope_manager.get_active_symbols() + self.scope_manager.enter_scope() + + # Constexpr doesn't get preprocessed + if self.is_node_constexpr(node): + self.generic_visit(node) + self.scope_manager.exit_scope() + return node + + # Check for early exit and raise exception + self.check_early_exit(node) + + used_args, yield_args, flat_args = self.analyze_region_variables( + node, active_symbols + ) + func_name = f"if_region_{self.counter}" + self.counter += 1 + + func_def = self.create_if_function( + func_name, node, used_args, yield_args, flat_args + ) + assign = ast.copy_location( + self.create_if_call(func_name, yield_args, flat_args), node + ) + + self.scope_manager.exit_scope() + return [func_def, assign] + + def is_constexpr(self, node): + """Determines if the if condition is wrapped in const_expr or dynamic_expr""" + if isinstance(node.test, ast.Call): + func = node.test.func + + # Check if the function is 'const_expr' + if isinstance(func, ast.Name) and func.id == "const_expr": + return ast.Constant(value=True), 
node.test.args[0] + + # Check if the function is 'dynamic_expr' + elif isinstance(func, ast.Name) and func.id == "dynamic_expr": + return ast.Constant(value=False), self.visit(node.test.args[0]) + + # Check if it's an attribute access for 'const_expr' or 'dynamic_expr' + elif isinstance(func, ast.Attribute): + if func.attr == "const_expr": + return ast.Constant(value=True), node.test.args[0] + elif func.attr == "dynamic_expr": + return ast.Constant(value=False), self.visit(node.test.args[0]) + + return ast.Constant(value=None), self.visit(node.test) + + def create_if_function( + self, func_name, node, used_args, yield_args, flattened_args + ): + is_constexpr, test_expr = self.is_constexpr(node) + pred_name = self.make_func_param_name("pred", flattened_args) + func_args = [ast.arg(arg=pred_name, annotation=None)] + func_args += [ast.arg(arg=var, annotation=None) for var in flattened_args] + func_args_then_else = [ + ast.arg(arg=var, annotation=None) for var in flattened_args + ] + + then_body = [] + for stmt in node.body: + transformed_stmt = self.visit(stmt) # Recursively visit inner statements + if isinstance(transformed_stmt, list): + then_body.extend(transformed_stmt) + else: + then_body.append(transformed_stmt) + + # Create common return list for all blocks + return_list = ast.List( + elts=[ast.Name(id=var, ctx=ast.Load()) for var in yield_args], + ctx=ast.Load(), + ) + + # Create common function arguments + func_decorator_arguments = ast.arguments( + posonlyargs=[], args=func_args, kwonlyargs=[], kw_defaults=[], defaults=[] + ) + func_then_else_arguments = ast.arguments( + posonlyargs=[], + args=func_args_then_else, + kwonlyargs=[], + kw_defaults=[], + defaults=[], + ) + + then_block_name = f"then_block_{self.counter}" + else_block_name = f"else_block_{self.counter}" + elif_region_name = f"elif_region_{self.counter}" + self.counter += 1 + + # Create then block + then_block = ast.copy_location( + ast.FunctionDef( + name=then_block_name, + 
args=func_then_else_arguments, + body=then_body + [ast.Return(value=return_list)], + decorator_list=[], + ), + node, + ) + + # Decorator keywords + decorator_keywords = [ + ast.keyword( + arg="pred", value=test_expr + ), # ast.Name(id="pred", ctx=ast.Load()) + ast.keyword( + arg="used_args", + value=ast.List( + elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in used_args], + ctx=ast.Load(), + ), + ), + ast.keyword( + arg="yield_args", + value=ast.List( + elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in yield_args], + ctx=ast.Load(), + ), + ), + ] + + # Create decorator + decorator = ast.copy_location( + ast.Call( + func=ast.Name(id=self.DECORATOR_IF_STATEMENT, ctx=ast.Load()), + args=[], + keywords=decorator_keywords, + ), + node, + ) + + # Executor keywords + execute_keywords = [ + ast.keyword(arg="pred", value=ast.Name(id=pred_name, ctx=ast.Load())), + ast.keyword( + arg="used_args", + value=ast.List( + elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in used_args], + ctx=ast.Load(), + ), + ), + ast.keyword( + arg="yield_args", + value=ast.List( + elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in yield_args], + ctx=ast.Load(), + ), + ), + ast.keyword( + arg="yield_arg_names", + value=ast.List( + elts=[ast.Constant(value=arg) for arg in yield_args], + ctx=ast.Load(), + ), + ), + ast.keyword( + arg="then_block", value=ast.Name(id=then_block_name, ctx=ast.Load()) + ), + ] + + # Handle different cases + if not yield_args and node.orelse == []: + # No yield_args case - only then_block needed + execute_call = ast.copy_location( + ast.Call( + func=ast.copy_location( + ast.Name(id=self.IF_EXECUTOR, ctx=ast.Load()), node + ), + args=[], + keywords=execute_keywords, + ), + node, + ) + func_body = [then_block, ast.Return(value=execute_call)] + else: + # Create else block based on node.orelse + if node.orelse: + if len(node.orelse) == 1 and isinstance(node.orelse[0], ast.If): + # Handle elif case + elif_node = node.orelse[0] + nested_if_name = elif_region_name + # Recursion 
for nested elif + nested_if = self.create_if_function( + nested_if_name, elif_node, used_args, yield_args, flattened_args + ) + else_block = ast.FunctionDef( + name=else_block_name, + args=func_then_else_arguments, + body=[ + nested_if, + ast.Return( + value=ast.Name(id=nested_if_name, ctx=ast.Load()) + ), + ], + decorator_list=[], + ) + else: + + else_body = [] + for stmt in node.orelse: + transformed_stmt = self.visit( + stmt + ) # Recursively visit inner statements + if isinstance(transformed_stmt, list): + else_body.extend(transformed_stmt) + else: + else_body.append(transformed_stmt) + + # Regular else block + else_block = ast.FunctionDef( + name=else_block_name, + args=func_then_else_arguments, + body=else_body + [ast.Return(value=return_list)], + decorator_list=[], + ) + else: + # Default else block + else_block = ast.FunctionDef( + name=else_block_name, + args=func_then_else_arguments, + body=[ast.Return(value=return_list)], + decorator_list=[], + ) + + # Add else_block to execute keywords + execute_keywords.append( + ast.keyword( + arg="else_block", value=ast.Name(id=else_block_name, ctx=ast.Load()) + ) + ) + # Add constexpr + execute_keywords.append(ast.keyword(arg="constexpr", value=is_constexpr)) + + execute_call = ast.copy_location( + ast.Call( + func=ast.Name(id=self.IF_EXECUTOR, ctx=ast.Load()), + args=[], + keywords=execute_keywords, + ), + node, + ) + func_body = [ + then_block, + ast.copy_location(else_block, node), + ast.Return(value=execute_call), + ] + + return ast.copy_location( + ast.FunctionDef( + name=func_name, + args=func_decorator_arguments, + body=func_body, + decorator_list=[decorator], + ), + node, + ) + + def create_while_function( + self, func_name, node, used_args, yield_args, flattened_args + ): + """Create a while function that looks like: + + @while_selector(pred, used_args=[], yield_args=[]) + def while_region(pred, flattened_args): + def while_before_block(*used_args, *yield_args): + # Note that during eval of pred can 
possibly alter yield_args + return *pred, yield_args + def while_after_block(*used_args, yield_args): + ...loop_body_transformed... + return yield_args + return self.while_executor(pred, used_args, yield_args, + while_before_block, while_after_block, constexpr) + yield_args = while_region(pred, flattened_args) + + Which will later be executed as pseudo-code: + + # Dynamic mode: + scf.WhileOp(types(yield_args), yield_args) + with InsertionPoint(before_block): + cond, yield_args = while_before_block(*flattened_args) + scf.ConditionOp(cond, yield_args) + with InsertionPoint(after_block): + yield_args = while_after_block(yield_args) + scf.YieldOp(yield_args) + return while_op.results_ + + # Const mode: + cond, yield_args = while_before_block(yield_args) + while pred: + yield_args = body_block(yield_args) + cond, yield_args = while_before_block(yield_args) + return yield_args + """ + is_constexpr, test_expr = self.is_constexpr(node) + pred_name = self.make_func_param_name("pred", flattened_args) + + # Section: decorator construction + decorator_keywords = [ + ast.keyword(arg="pred", value=test_expr), + ast.keyword( + arg="used_args", + value=ast.List( + elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in used_args], + ctx=ast.Load(), + ), + ), + ast.keyword( + arg="yield_args", + value=ast.List( + elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in yield_args], + ctx=ast.Load(), + ), + ), + ] + decorator = ast.copy_location( + ast.Call( + func=ast.Name(id=self.DECORATOR_WHILE_STATEMENT, ctx=ast.Load()), + args=[], + keywords=decorator_keywords, + ), + node, + ) + + # Section: Shared initialization for before and after blocks + while_before_block_name = f"while_before_block_{self.counter}" + while_after_block_name = f"while_after_block_{self.counter}" + self.counter += 1 + block_args_args = [ast.arg(arg=var, annotation=None) for var in used_args] + block_args_args += [ast.arg(arg=var, annotation=None) for var in yield_args] + block_args = ast.arguments( + posonlyargs=[], + 
args=block_args_args, + kwonlyargs=[], + kw_defaults=[], + defaults=[], + ) + + yield_args_ast_name_list = ast.List( + elts=[ast.Name(id=var, ctx=ast.Load()) for var in yield_args], + ctx=ast.Load(), + ) + + # Section: while_before_block FunctionDef, which contains condition + while_before_return_list = ast.List( + elts=[test_expr, yield_args_ast_name_list], + ctx=ast.Load(), + ) + while_before_stmts = [ast.Return(value=while_before_return_list)] + while_before_block = ast.copy_location( + ast.FunctionDef( + name=while_before_block_name, + args=block_args, + body=while_before_stmts, + decorator_list=[], + ), + test_expr, + ) + + # Section: while_after_block FunctionDef, which contains loop body + while_after_stmts = [] + for stmt in node.body: + transformed_stmt = self.visit(stmt) # Recursively visit inner statements + if isinstance(transformed_stmt, list): + while_after_stmts.extend(transformed_stmt) + else: + while_after_stmts.append(transformed_stmt) + while_after_stmts.append(ast.Return(value=yield_args_ast_name_list)) + + while_after_block = ast.copy_location( + ast.FunctionDef( + name=while_after_block_name, + args=block_args, + body=while_after_stmts, + decorator_list=[], + ), + node, + ) + + # Section: Execute via executor + execute_keywords = [ + ast.keyword(arg="pred", value=ast.Name(id=pred_name, ctx=ast.Load())), + ast.keyword( + arg="used_args", + value=ast.List( + elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in used_args], + ctx=ast.Load(), + ), + ), + ast.keyword( + arg="yield_args", + value=ast.List( + elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in yield_args], + ctx=ast.Load(), + ), + ), + ast.keyword( + arg="while_before_block", + value=ast.Name(id=while_before_block_name, ctx=ast.Load()), + ), + ast.keyword( + arg="while_after_block", + value=ast.Name(id=while_after_block_name, ctx=ast.Load()), + ), + ast.keyword(arg="constexpr", value=is_constexpr), + ast.keyword( + arg="yield_arg_names", + value=ast.List( + elts=[ast.Constant(value=arg) 
for arg in yield_args], + ctx=ast.Load(), + ), + ), + ] + + execute_call = ast.Call( + func=ast.Name(id=self.WHILE_EXECUTOR, ctx=ast.Load()), + args=[], + keywords=execute_keywords, + ) + + # Putting everything together, FunctionDef for while_region + func_args_args = [ast.arg(arg=pred_name, annotation=None)] + func_args_args += [ast.arg(arg=var, annotation=None) for var in flattened_args] + func_args = ast.arguments( + posonlyargs=[], + args=func_args_args, + kwonlyargs=[], + kw_defaults=[], + defaults=[], + ) + + return ast.copy_location( + ast.FunctionDef( + name=func_name, + args=func_args, + body=[ + while_before_block, + while_after_block, + ast.Return(value=execute_call), + ], + decorator_list=[decorator], + ), + node, + ) diff --git a/python/CuTeDSL/base_dsl/cache_helpers.py b/python/CuTeDSL/base_dsl/cache_helpers.py new file mode 100644 index 00000000..8ea08874 --- /dev/null +++ b/python/CuTeDSL/base_dsl/cache_helpers.py @@ -0,0 +1,154 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +""" +This module provides jit cache load/dump helper functions +""" + +import os +import uuid +import random +import tempfile +import pwd +import time +from pathlib import Path +import hashlib + +from .utils.logger import log +from .jit_executor import JitExecutor + +from .._mlir import ir + +# ============================================================================= +# Jit Cache Helper functions +# ============================================================================= + + +def get_current_user(): + # Try to get the user from the environment variable first + user = os.getenv("USER") or os.getenv("USERNAME") + if not user: + # Fallback for Unix-like systems + user = pwd.getpwuid(os.getuid()).pw_name + return user + + +try: + default_generated_ir_path = f"/tmp/{get_current_user()}/cutlass_python_cache/" +except Exception as e: + # If all else fails, provide a default fallback path + default_generated_ir_path = "/tmp/cutlass_python_cache/" + print(f"Could not determine user, using default path. 
Error: {e}") + + +def load_ir(file, asBytecode=False): + """Load generated IR from a file.""" + assert "mlir" in file + func_name = file.split(".mlir")[0].split("dsl_")[-1] + with ir.Context() as ctx: + with open(file, "rb" if asBytecode else "r") as f: + module = ir.Module.parse(f.read()) + + return func_name, module + + +def make_unique_filename(fpath: Path, new_ext: str = None) -> Path: + """Generate a unique filename with an optional new extension.""" + random_part = random.randint(0, 999999) + timestamp = time.time() + hash_input = f"{fpath}_{timestamp}_{random_part}".encode() + hash_code = hashlib.md5(hash_input).hexdigest()[:16] # Shorter hash for readability + stem_with_hash = f"{fpath.stem}_{hash_code}" + return fpath.with_name(stem_with_hash).with_suffix(new_ext or fpath.suffix) + + +def save_ir( + dsl_name: str, + module: object, + fname: str, + isTemp: bool = False, + asBytecode: bool = False, +) -> str: + """Save generated IR to a file.""" + initial_name = f"{dsl_name.lower()}_{fname}.mlir" + save_path = Path(tempfile.gettempdir() if isTemp else os.getcwd()) + save_fname = save_path / initial_name + # Random ID to avoid any collisions + rnd_id = str(uuid.uuid4()) + pid = os.getpid() + # use temp dir to be robust against program interruptions + temp_dir = os.path.join(save_path, f"tmp.pid_{pid}_{rnd_id}") + # If the process exits abnormally, may leave a temporary folder. Needs to be removed manually. 
+ os.makedirs(temp_dir, exist_ok=False) + temp_fname = os.path.join(temp_dir, initial_name) + + if asBytecode: + with open(temp_fname, "wb") as f: + module.operation.write_bytecode(f) + else: + with open(temp_fname, "w") as f: + print(module, file=f) + # os.replace is guaranteed to be atomic on POSIX systems if it succeeds + # so filepath cannot see a partial write + os.replace(temp_fname, save_fname) + os.removedirs(temp_dir) + log().debug("Generated IR saved into %s", save_fname) + return save_fname + + +def check_func_name(jit_cache, func_name): + if not func_name in jit_cache: + jit_cache[func_name] = JitExecutor(None, None, None, None, None, None) + return jit_cache + + +def load_cache_from_path(dsl_name, cache_limit, path=default_generated_ir_path): + """Load cache from a directory path.""" + if not os.path.exists(path): + return dict() + files = os.listdir(path) + jit_cache = dict() + try: + for idx, file in enumerate(files): + if idx >= int(cache_limit): + break + # identify dsl prefix + if not file.startswith(f"{dsl_name.lower()}"): + continue + if ".mlir" in file: + func_name, ir_module = load_ir( + os.path.join(path, file), asBytecode=True + ) + jit_cache = check_func_name(jit_cache, func_name) + jit_cache[func_name].ir_module = ir_module + except Exception as e: + print(f"{dsl_name} failed with loading generated IR cache.", e) + jit_cache = dict() + return jit_cache + + +def dump_cache_to_path( + dsl_name, jit_cache, cache_limit, path=default_generated_ir_path +): + log().info("JIT cache : dumping [%s] items=[%s]", dsl_name, len(jit_cache)) + if not os.path.exists(path): + os.makedirs(path) + original_path = os.getcwd() + try: + os.chdir(path) + for idx, [key, value] in enumerate(jit_cache.items()): + if idx >= int(cache_limit): + break + save_ir(dsl_name, value.ir_module, key, asBytecode=True) + except Exception as e: + print(f"{dsl_name} failed with caching generated IR", e) + finally: + os.chdir(original_path) diff --git 
a/python/CuTeDSL/base_dsl/common.py b/python/CuTeDSL/base_dsl/common.py new file mode 100644 index 00000000..3cf413ed --- /dev/null +++ b/python/CuTeDSL/base_dsl/common.py @@ -0,0 +1,268 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +import os +from typing import Any, Dict, Iterable, Optional, Union + +""" +This module provides the DSL Exception classes, usable by any Dialect. +""" + + +# Add color codes at the top of the file after imports +class Colors: + """ANSI color codes for error messages""" + + RED = "\033[91m" + YELLOW = "\033[93m" + BLUE = "\033[94m" + GREEN = "\033[92m" + BOLD = "\033[1m" + RESET = "\033[0m" + + +# ============================================================================= +# DSL Exceptions +# ============================================================================= + + +class DSLBaseError(Exception): + """ + Base exception for DSL-related errors. + Provides optional contextual metadata to aid in debugging. 
+ """ + + def __init__( + self, + message: str, + line: Optional[int] = None, + snippet: Optional[str] = None, + filename: Optional[str] = None, + error_code: Optional[Union[str, int]] = None, + context: Optional[Union[Dict[str, Any], str]] = None, + suggestion: Optional[str] = None, + cause: Optional[BaseException] = None, + ) -> None: + self.message = message + self.line = line + self.filename = filename + self.snippet = snippet + self.error_code = error_code + self.context = context + self.suggestion = suggestion + self.cause = cause + + super().__init__(self._format_message()) + + def _format_message(self): + """ + Formats the complete error message with available metadata. + Override this in subclasses if you want to change formatting logic. + """ + parts = [f"{self.__class__.__name__}: {self.message}"] + + if self.error_code is not None: + parts.append(f"{Colors.BOLD}Error Code:{Colors.RESET} {self.error_code}\n") + + if self.line is not None: + parts.append(f" Line: {self.line}") + + if self.filename is not None: + parts.append(f" File: {self.filename}") + + if self.snippet: + # Optionally truncate long snippets for readability + parts.append(f" Snippet: \n {self.snippet}") + + if self.cause: + parts.append(f" Caused exception: {self.cause}") + + if self.context: + if isinstance(self.context, dict): + parts.append(f"{Colors.BLUE}🔍 Additional Context:{Colors.RESET}\n") + for key, value in self.context.items(): + parts.append(f" {key}: {value}") + else: + parts.append( + f"{Colors.BLUE}🔍 Additional Context:{Colors.RESET} {self.context}" + ) + + if self.suggestion: + parts.append(f"{Colors.GREEN}💡 Suggestions:{Colors.RESET}") + if isinstance(self.suggestion, (list, tuple)): + for suggestion in self.suggestion: + parts.append(f" {Colors.GREEN}{suggestion}{Colors.RESET}") + else: + parts.append(f" {self.suggestion}") + + return "\n".join(parts) + + +class DSLRuntimeError(DSLBaseError): + """ + Raised when an error occurs during JIT-time code generation in the 
DSL. + """ + + # Inherits all logic from DSLBaseError; override methods if you need + # specialized behavior or formatting for runtime errors. + pass + + +def _get_friendly_cuda_error_message(error_code, error_name): + # Avoid circular dependency + from .runtime.cuda import get_device_info + + """Get a user-friendly error message for common CUDA errors.""" + # Strip the byte string markers if present + if isinstance(error_name, bytes): + error_name = error_name.decode("utf-8") + elif ( + isinstance(error_name, str) + and error_name.startswith("b'") + and error_name.endswith("'") + ): + error_name = error_name[2:-1] + + # Add target architecture info + target_arch = os.getenv("CUTE_DSL_ARCH", "unknown") + + error_messages = { + "CUDA_ERROR_INVALID_SOURCE": ( + f"{Colors.RED}❌ Failed to load CUDA kernel - likely architecture mismatch.{Colors.RESET}\n\n" + ), + "CUDA_ERROR_NO_BINARY_FOR_GPU": ( + f"{Colors.RED}❌ CUDA kernel not compatible with your GPU.{Colors.RESET}\n\n" + ), + "CUDA_ERROR_OUT_OF_MEMORY": ( + f"{Colors.RED}💾 CUDA out of memory error.{Colors.RESET}\n\n" + ), + "CUDA_ERROR_INVALID_DEVICE": ( + f"{Colors.RED}❌ Invalid CUDA device.{Colors.RESET}\n\n" + ), + "CUDA_ERROR_NOT_INITIALIZED": ( + f"{Colors.RED}❌ CUDA context not initialized.{Colors.RESET}\n\n" + ), + "CUDA_ERROR_INVALID_VALUE": ( + f"{Colors.RED}⚠️ Invalid parameter passed to CUDA operation.{Colors.RESET}\n\n" + f"{Colors.YELLOW}This is likely a bug - please report it with:{Colors.RESET}" + ), + } + + error_suggestions = { + "CUDA_ERROR_INVALID_SOURCE": ( + f"1. Ensure env CUTE_DSL_ARCH matches your GPU architecture", + f"2. Clear the compilation cache and regenerate the kernel", + f"3. Check CUDA toolkit installation", + ), + "CUDA_ERROR_NO_BINARY_FOR_GPU": ( + f"Set env CUTE_DSL_ARCH to match your GPU architecture", + ), + "CUDA_ERROR_OUT_OF_MEMORY": ( + f"1. Reduce batch size", + f"2. Reduce model size", + f"3. Free unused GPU memory", + ), + "CUDA_ERROR_INVALID_DEVICE": ( + f"1. 
Check if CUDA device is properly initialized", + f"2. Verify GPU is detected: nvidia-smi", + f"3. Check CUDA_VISIBLE_DEVICES environment variable", + ), + "CUDA_ERROR_NOT_INITIALIZED": ( + f"1. Check CUDA driver installation", + f"2. call `cuda.cuInit(0)` before any other CUDA operation", + f"3. Run nvidia-smi to confirm GPU status", + ), + "CUDA_ERROR_INVALID_VALUE": ( + f"1. Your GPU model", + f"2. SM ARCH setting", + f"3. Steps to reproduce", + ), + } + + message = error_messages.get( + error_name, f"{Colors.RED}Unknown CUDA error{Colors.RESET}" + ) + + # Add debug information + debug_info = f"\n- {Colors.BOLD}Error name: {error_name}\n" + debug_info += f"- CUDA_TOOLKIT_PATH: {os.getenv('CUDA_TOOLKIT_PATH', 'not set')}\n" + debug_info += ( + f"- Target SM ARCH: {os.getenv('CUTE_DSL_ARCH', 'not set')}{Colors.RESET}\n" + ) + + try: + # Get GPU information using CUDA Python API + debug_info += f"\n{Colors.BLUE}📊 GPU Information:{Colors.RESET}\n" + gpu_info = get_device_info() + debug_info += gpu_info.pretty_str() + + if target_arch and gpu_info.compatible_archs: + debug_info += f"\n{Colors.BOLD}Compatibility Check:{Colors.RESET}\n" + + if target_arch not in gpu_info.compatible_archs: + debug_info += ( + f"{Colors.RED}❌ Error: Target SM ARCH {target_arch} is not compatible\n" + f"💡 Please use one of SM ARCHs: " + f"{Colors.GREEN}{', '.join(gpu_info.compatible_archs or [])}{Colors.RESET}\n" + ) + elif target_arch != gpu_info.sm_arch: + debug_info += ( + f"{Colors.YELLOW}⚠️ Warning: Using compatible but non-optimal architecture\n" + f"• Current: {target_arch}\n" + f"• Recommended: {Colors.GREEN}{gpu_info.sm_arch}{Colors.RESET} (native)\n" + ) + else: + debug_info += f"{Colors.GREEN}✓ Using optimal architecture: {gpu_info.sm_arch}{Colors.RESET}\n" + + except Exception as e: + debug_info += ( + f"\n{Colors.YELLOW}ℹ️ Could not retrieve GPU info: {str(e)}{Colors.RESET}" + ) + + return message, debug_info, error_suggestions.get(error_name, "") + + +class 
DSLCudaRuntimeError(DSLBaseError): + """ + Raised when an error occurs during CUDA runtime code generation in the DSL. + """ + + # Inherits all logic from DSLRuntimeError; override methods if you need + # specialized behavior or formatting for runtime errors. + def __init__(self, error_code, error_name) -> None: + self._error_code = error_code + self._error_name = error_name + message, debug_info, suggestion = _get_friendly_cuda_error_message( + error_code, error_name + ) + + super().__init__( + message, error_code=error_code, context=debug_info, suggestion=suggestion + ) + + +class DSLAstPreprocessorError(DSLBaseError): + """ + Raised when an error occurs during AST preprocessing or visiting in the DSL. + """ + + # Same approach: You could override _format_message if you want + # to emphasize AST node details or anything specific to preprocessing. + pass + + +class DSLNotImplemented(DSLBaseError): + """ + Raised when a feature of the DSL is not implemented yet. + """ + + # Useful for stubs in your DSL that you plan to implement in the future. + pass diff --git a/python/CuTeDSL/base_dsl/compiler.py b/python/CuTeDSL/base_dsl/compiler.py new file mode 100644 index 00000000..2e5b75cd --- /dev/null +++ b/python/CuTeDSL/base_dsl/compiler.py @@ -0,0 +1,221 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides a class that compiles generated IR using MLIR's PassManager +and executes it using MLIR's ExecutionEngine. 
+ +""" + +from typing import Sequence, Optional, Tuple +import os +import sys +import inspect +from .common import DSLRuntimeError + +_SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_PATH) + +from .._mlir import ir + + +# ============================================================================= +# Compiler Class +# ============================================================================= + + +class CompilationError(RuntimeError): + """Custom error class for compilation failures""" + + # Add ANSI color codes + RED = "\033[91m" + YELLOW = "\033[93m" + BLUE = "\033[94m" + GREEN = "\033[92m" + BOLD = "\033[1m" + RESET = "\033[0m" + + def __init__( + self, + message: str, + nvvm_error: Optional[str] = None, + ir_context: Optional[str] = None, + cuda_toolkit: Optional[str] = None, + arch: Optional[str] = None, + ): + self.nvvm_error = nvvm_error + self.ir_context = ir_context + self.cuda_toolkit = cuda_toolkit + self.arch = arch + # Call parent with formatted error to avoid showing class name + super().__init__("") # Empty string to avoid class name + # Store formatted error for str() representation + self._formatted_error = self._format_error() + + def __str__(self) -> str: + """Override string representation to avoid showing class name""" + return self._formatted_error + + def __repr__(self) -> str: + """Override repr representation to avoid showing class name""" + return self._formatted_error + + def _format_error(self) -> str: + if not self.nvvm_error: + return str(self.args[0]) + + return f"""NVVM Compilation Error: +---------------------- + +{self.BLUE}⚙️ Current Settings:{self.RESET} +{self.BOLD}- CUDA Toolkit Path: {self.cuda_toolkit or "Not Set"} +- Target Architecture: {self.arch}{self.RESET} + +IR Context (truncated): +{self.ir_context} + +{self.YELLOW}💡 Possible Solutions:{self.RESET} +{self.GREEN}1. Check if CUDA_TOOLKIT_PATH is set correctly +2. 
Verify target architecture ({self.arch}) is supported by your CUDA toolkit +3. Make sure CUDA toolkit version matches the target architecture requirements{self.RESET}""" + + +class Compiler: + """Compiler class for compiling and building MLIR modules.""" + + def __init__(self, passmanager, execution_engine): + self.passmanager = passmanager + self.execution_engine = execution_engine + + def __call__(self, module): + """Convenience application method.""" + self.compile(module) + + def _process_error(self, error_msg: str) -> Tuple[Optional[str], Optional[str]]: + """Process error message to extract NVVM error and IR context""" + nvvm_error = None + ir_msg = "" + + if "NVVM_ERROR" in error_msg: + # Extract the specific NVVM error + nvvm_error = ( + error_msg.split("libNVVM extra log:")[1].strip() + if "libNVVM extra log:" in error_msg + else error_msg + ) + + # Extract IR context + if "see current operation:" in error_msg: + # Get the IR section + ir_section = error_msg.split("see current operation:")[1].strip() + # Remove duplicate IR section + ir_section = ir_section.split("error: unknown: Failed translating")[ + 0 + ].strip() + + # Get first few lines and last few lines of the IR + ir_lines = ir_section.split("\n") + if len(ir_lines) > 10: + ir_msg = "\n".join(ir_lines[:5] + [" ..."] + ir_lines[-5:]) + else: + ir_msg = ir_section + + return nvvm_error, ir_msg + + def compile( + self, + module, + pipeline: str, + cuda_toolkit: str = "", + arch: str = "", + enable_verifier=False, + ): + """Compiles the module by invoking the pipeline.""" + try: + pm = self.passmanager.PassManager.parse(pipeline) + pm.enable_verifier(enable_verifier) + pm.run(module.operation) + except Exception as e: + error_msg = str(e) + nvvm_error, ir_msg = self._process_error(error_msg) + + if nvvm_error: + raise CompilationError( + error_msg, + nvvm_error=nvvm_error, + ir_context=ir_msg, + cuda_toolkit=cuda_toolkit, + arch=arch, + ) from e + raise e + + def jit(self, module, opt_level: int = 2, 
shared_libs: Sequence[str] = ()): + """Wraps the module in a JIT execution engine.""" + return self.execution_engine.ExecutionEngine( + module, opt_level=opt_level, shared_libs=shared_libs + ) + + def compile_and_jit( + self, + module, + pipeline: str, + shared_libs: Sequence[str] = (), + opt_level: int = 2, + cuda_toolkit: str = "", + arch: str = "", + ): + """Compiles and jits the module.""" + self.compile( + module, + pipeline, + cuda_toolkit, + arch, + ) + return self.jit(module, opt_level, shared_libs) + + +def compile(func, *args, **kwargs): + if func is None: + raise DSLRuntimeError("Function is not set or invalid.") + + if not callable(func): + raise DSLRuntimeError("Object is not callable.") + + kwargs["compile_only"] = True + kwargs["no_cache"] = True + + if inspect.isfunction(func): + # regular function + pass + elif inspect.ismethod(func): + # if it's a method, add the instance to the first argument + args = [func.__self__] + list(args) + func = func.__func__ + elif inspect.isclass(type(func)) and hasattr(func, "__call__"): + # If it's a class instance, get the class's __call__ method + args = [func] + list(args) + # Get the actual function from the class definition + func = func.__call__.__func__ + else: + raise DSLRuntimeError( + "Invalid function type, only function, method and module are supported, but got", + func, + ) + + # If it's a wrapped function created by jit decorator, get the original function + if hasattr(func, "__wrapped__"): + func = func.__wrapped__ + + if not hasattr(func, "_dsl_object"): + raise DSLRuntimeError("Function is not decorated with jit decorator.") + + fcn_ptr = func._dsl_object._preprocess_and_execute(func) + return func._dsl_object._func(fcn_ptr, *args, **kwargs) diff --git a/python/CuTeDSL/base_dsl/dsl.py b/python/CuTeDSL/base_dsl/dsl.py new file mode 100644 index 00000000..619ed4c8 --- /dev/null +++ b/python/CuTeDSL/base_dsl/dsl.py @@ -0,0 +1,1637 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides a main DSL class for any Dialect. +The DSL should be inherited as a new class, and its initialization requires dialects. +It handles most of the mechanics for the DSL in an agnostic way, +for example, it can handle various dialect-specific tasks. +""" + + +# Standard library imports +from dataclasses import dataclass, field +import atexit +import os +import io +import sys +import errno +import ctypes +import re +import inspect +import argparse +import hashlib +from functools import lru_cache, wraps +from collections import namedtuple +from abc import ABC, abstractmethod +from typing import Any, Union, Tuple, get_origin, get_args +from types import FunctionType +import warnings + +from . 
import typing as t +from .env_manager import EnvironmentVarManager + +# ============================================================================= +# CUDA Python +# ============================================================================= + +from ..base_dsl._mlir_helpers.arith import const + +# ============================================================================= +# Local module imports +# ============================================================================= + +from .cache_helpers import * +from .jit_executor import JitExecutor +from .utils.timer import timer +from .utils.logger import setup_log, log +from .utils.stacktrace import filter_exception, walk_to_top_module, filter_stackframe +from .runtime.jit_arg_adapters import is_argument_constexpr, JitArgAdapterRegistry +from .runtime.tensor_descriptor import TensorDescriptor +from .ast_preprocessor import DSLPreprocessor +from .common import * +from .typing import ( + get_c_pointers, + get_mlir_types, +) + +# ============================================================================= +# MLIR modules +# ============================================================================= + +from .._mlir import ir +from .._mlir import runtime as rt +from .._mlir.extras import types as T +from .._mlir.dialects import arith, math, func + +# ============================================================================= +# cutlass.dlpack_runtime +# ============================================================================= + +from .runtime.dlpack_runtime import dlpack_to_tensor_desc, mark_layout_dynamic + +# ============================================================================= +# Global Variables +# ============================================================================= + +MLIR_DYNAMIC = -9223372036854775808 + +# ============================================================================= +# Codegen Utils +# ============================================================================= + + 
+def _numpy_type_to_mlir_type(dtype): + if dtype == np.float64: + return T.f64() + if dtype == np.float16: + return T.f16() + if dtype == np.float32: + return T.f32() + if dtype == np.int64: + return T.i64() + if dtype == np.int32: + return T.i32() + if dtype == np.int16: + return T.i16() + if dtype == np.int8: + return T.i8() + if dtype == np.uint64: + return T.ui64() + if dtype == np.uint32: + return T.ui32() + if dtype == np.uint16: + return T.ui16() + if dtype == np.uint8: + return T.ui8() + if dtype == np.bool_: + return T.bool() + if dtype == f8E5M2: + return T.f8E5M2() + if dtype == f8E4M3FN: + return T.f8E4M3FN() + if dtype == f8E8M0FNU: + return T.f8E8M0FNU() + if dtype == f6E3M2FN: + return T.f6E3M2FN() + if dtype == f6E2M3FN: + return T.f6E2M3FN() + if dtype == f4E2M1FN: + return T.f4E2M1FN() + assert False, f"Unknown type {type}" + + +def _mlir_type_to_numpy_type(type): + if type == T.f64(): + return np.float64 + if type == T.f16(): + return np.float16 + if type == T.f32(): + return np.float32 + if type == T.i64(): + return np.int64 + if type == T.i32(): + return np.int32 + if type == T.i16(): + return np.int16 + if type == T.i8(): + return np.int8 + if type == T.ui64(): + return np.uint64 + if type == T.ui32(): + return np.uint32 + if type == T.ui16(): + return np.uint16 + if type == T.ui8(): + return np.uint8 + if type == T.bool(): + return np.bool_ + assert False, f"Unknown type {type}" + + +# ============================================================================= +# Main DSL Class +# ============================================================================= + + +def is_dynamic_expression(value): + """ + Check if the value is an MLIR's SSA value. 
+ """ + # Case 1: If the value has MLIR's SSA value, return True + # Case 2: If the value supports __extract_mlir_values__ then it's possible to get SSA value + return ( + isinstance(value, ir.Value) + or hasattr(value, "__extract_mlir_values__") + or len(extract_mlir_values(value)) > 0 + ) + + +def extract_mlir_values(obj): + """ + Given the `obj`, recursively go through it to extract all contained IR values as list of MLIR values + """ + res = [] + if hasattr(obj, "__extract_mlir_values__"): + res = obj.__extract_mlir_values__() + elif isinstance(obj, (tuple, list)): + res = sum((extract_mlir_values(x) for x in obj), []) + # Can't call is_dynamic_expression as _is_dynamic_expression depends on extract_mlir_values + elif isinstance(obj, set): + raise DSLRuntimeError( + "Sets are not supported in extract_mlir_values to ensure order preservation", + context="The DSL attempted to generate JIT function argument(s) for an argument of type set but failed.", + suggestion="Consider using a list or tuple instead", + ) + elif isinstance(obj, ir.Value): + res = [obj] + elif isinstance(obj, ir.BlockArgumentList): + res = list(obj) # type: ignore + + return res + + +def new_from_mlir_values(obj, values): + """ + Create a new python object by populating containing MLIR values with list of new values + """ + if hasattr(obj, "__new_from_mlir_values__"): + return obj.__new_from_mlir_values__(values) + elif isinstance(obj, (tuple, list)): + res = [] + for x in obj: + n_items = len(get_mlir_types(x)) + res.append(new_from_mlir_values(x, values[:n_items])) + values = values[n_items:] + obj_ty = type(obj) + return obj_ty(res) + elif isinstance(obj, set): + raise DSLRuntimeError( + "Sets are not supported in new_from_mlir_values to ensure order preservation", + context="The DSL attempted to generate JIT function argument(s) for an argument of type set but failed.", + suggestion="Consider using a list or tuple instead", + ) + elif is_dynamic_expression(obj): + + if len(values) == 0: + 
return obj + + assert len(values) == 1 + return values[0] + else: + assert len(values) == 0, f"{obj} expects 0 values, but got {values}" + return obj + + +class BaseDSL: + gpu_module = None + + def __init__( + self, + name: str, + compiler_provider: Any, + pass_sm_arch_name: str, + device_compilation_only=False, + preprocess=False, + ): + """ + Constructor for initializing the class with required providers and environment settings. + + Parameters: + - name (str): Name of DSL, used for environment variables and logging. + - compiler_provider (MLIR dialect): Provider for compiler. + - pass_sm_arch_name (str): The keyword name of the SM. + - device_compilation_only (bool) : Only device code, and call it via cuda driver + - preprocess (bool): Enable AST transformation. + + This constructs a DSL instance and sets up environment management, + warning configurations, and logging functionalities. It reads + environment variables using `EnvironmentVarManager` and configures + a logger with settings from the environment. If environment warnings + are detected, they are escalated to errors to ensure strict handling. + """ + # Enforcing initialization of instance variables + if not all([name, compiler_provider, pass_sm_arch_name]): + raise DSLRuntimeError( + "All required parameters must be provided and non-empty" + ) + + self.name = name + self.compiler_provider = compiler_provider + self.pass_sm_arch_name = pass_sm_arch_name + self.frame = None + self.no_cache = False + self.device_compilation_only = device_compilation_only + self.num_kernels = 0 + # Read environment variables + self.envar = EnvironmentVarManager(self.name) + self.enable_preprocessor = preprocess + # This cache uses hash of original ir and env as key, allows dump/load to/from file. 
Enabled by default + self.jit_cache = ( + dict() + if self.envar.disable_file_caching + else load_cache_from_path(self.name, self.envar.file_caching_capacity) + ) + self.host_jit_decorator_name = f"@{BaseDSL.jit.__name__}" + self.device_jit_decorator_name = f"@{BaseDSL.kernel.__name__}" + + # set warning + if self.envar.warnings_as_errors: + warnings.filterwarnings("error") + if self.envar.warnings_ignore: + warnings.filterwarnings("ignore") + + # Initialize logger + if self.envar.log_to_console == False and self.envar.jitTimeProfiling: + self.envar.log_to_console = True + self.envar.log_level = 20 # info level + setup_log( + self.name, + self.envar.log_to_console, + self.envar.log_to_file, + f"{self.name}.log", + self.envar.log_level, + ) + + # kernel symbols are temporary symbol string variables, their values are valid until the compilation is done. + self.kernel_symbols = [] + # used to generate unique name for gpu.launch + self.launch_inner_count = 0 + + if preprocess: + self.preprocessor = DSLPreprocessor() + log().info(f"Initializing {name} DSL") + log().debug(f"Logger initialized for {self.name}") + + # Hook excepthook + if self.envar.filterStacktrace: + origin_excepthook = sys.excepthook + module_dir = walk_to_top_module(os.path.dirname(os.path.abspath(__file__))) + + def excepthook(excep_type, value, traceback): + filter_exception(value, module_dir) + if hasattr(value, "__traceback__"): + origin_excepthook(excep_type, value, value.__traceback__) + else: + origin_excepthook( + excep_type, value, filter_stackframe(traceback, module_dir) + ) + + sys.excepthook = excepthook + + # Restore original excepthook + def restore_excepthook(hook): + sys.excepthook = hook + + atexit.register(restore_excepthook, origin_excepthook) + + def dump_cache(self): + if not self.envar.disable_file_caching: + dump_cache_to_path( + self.name, self.jit_cache, self.envar.file_caching_capacity + ) + + @lru_cache(maxsize=1) + def print_warning_once(self, message): + 
log().warning(f"Warning: {message}") + warnings.warn(message, UserWarning) + + def print_warning(self, message): + log().warning(f"Warning: {message}") + warnings.warn(message, UserWarning) + + @classmethod + @lru_cache(maxsize=1) + def _get_dsl(cls): + # Instantiate the DSL Class once + main_dsl = cls() + if not main_dsl.no_cache: + # register atexit callback + atexit.register(main_dsl.dump_cache) + return main_dsl + + @staticmethod + def _can_preprocess(**dkwargs): + """ + Check if AST transformation is enabled or not for `jit` and `kernel` decorators. + """ + return dkwargs.pop("preprocess", True) + + @staticmethod + def _get_original_function(fcn_ptr, name): + """ + Get the original function from the decorated function + """ + while fcn_ptr.__name__ != name: + # If the function is wrapped with functools, get from __wrapped__ + if hasattr(fcn_ptr, "__wrapped__"): + fcn_ptr = fcn_ptr.__wrapped__ + # If the function is wrapped manually, it's the first in clousure + elif callable(fcn_ptr.__closure__[0].cell_contents): + fcn_ptr = fcn_ptr.__closure__[0].cell_contents + else: + raise DSLRuntimeError( + f"Cannot find the original function {name} in the closure chain" + ) + return fcn_ptr + + @staticmethod + def _preprocess_and_execute(func): + """ + Run ast transformation and return the materialized function pointer + """ + if hasattr(func, "_transformed_ast"): + # If the function ptr is already materialized, use the existing one + func._dsl_object.frame = func._decorator_frame + + if func._transformed_ast is None: + func._transformed_ast = func._dsl_object.run_preprocessor(func) + if func._transformed_ast is None: + del func._decorator_frame + del func._transformed_ast + return func + + fcn_ptr = func._dsl_object.get_function_ptr(func, func._transformed_ast) + # If the function is decorated, de-decorate it + fcn_ptr = BaseDSL._get_original_function(fcn_ptr, func.__name__) + return fcn_ptr + return func + + def jit_runner(self, frame, executor, *dargs, **dkwargs): + 
""" + Decorator to mark a function for JIT compilation. + """ + # Set the frame, that can be used AST preprocessor + self.frame = frame + log().info("jit_runner") + + def jit_runner_decorator(func): + func._dsl_object = self + # Run preprocessor that alters AST + if self.enable_preprocessor and BaseDSL._can_preprocess(**dkwargs): + # For an annotated function, add some DSL attributes + # When materializing the AST, we need decorator's frame + func._decorator_frame = frame + # No transformed ast at this point + func._transformed_ast = None + + @wraps(func) + def jit_wrapper(*args, **kwargs): + func_ptr = BaseDSL._preprocess_and_execute(func) + return executor(func_ptr, *args, **kwargs) + + return jit_wrapper + + if len(dargs) == 1 and callable(dargs[0]): + return jit_runner_decorator(dargs[0]) + else: + return jit_runner_decorator + + @classmethod + def jit(cls, *dargs, **dkwargs): + """ + Decorator to mark a function for JIT compilation for Host code. + """ + frame = inspect.currentframe().f_back + # Instantiate the DSL Class + main_dsl = cls._get_dsl() + return main_dsl.jit_runner(frame, main_dsl._func, *dargs, **dkwargs) + + @classmethod + def kernel(cls, *dargs, **dkwargs): + """ + Decorator to mark a function for JIT compilation for GPU. + """ + frame = inspect.currentframe().f_back + # Instantiate the DSL Class + main_dsl = cls._get_dsl() + return main_dsl.jit_runner(frame, main_dsl._kernel_helper, *dargs, **dkwargs) + + @abstractmethod + def _kernel_helper(self, func, *args, **kwargs): + """ + Helper function to handle kernel generation logic + """ + pass + + @abstractmethod + def _build_gpu_module(self, attrs): + """ + Build the module op that contains the kernels. + """ + pass + + @abstractmethod + def _get_pipeline(self, pipeline): + """ + Get the pipeline from the other configuration options. 
+ """ + if pipeline != None: + return pipeline + return None + + @staticmethod + def log_additions(func_type, operands=None, types=None, arg_attrs=None): + if operands is not None and operands != []: + log().debug( + f"Added {func_type} operands: [%s]", ", ".join(map(str, operands)) + ) + if types is not None: + log().debug( + f"Added {func_type} arg_types: [%s]", ", ".join(map(str, types)) + ) + if arg_attrs is not None: + log().debug( + f"Added {func_type} arg_attrs: [%s]", ", ".join(map(str, arg_attrs)) + ) + + def mangle_name(self, function_name, args, args_spec: inspect.FullArgSpec): + """Does simple name mangling""" + + for spec_arg, arg in zip(args_spec.args, args): + spec_ty = args_spec.annotations.get(spec_arg, None) + if spec_ty != None: + if issubclass(type(spec_ty), (t.IRValue, t.IRVariadic)): + continue + if isinstance(spec_ty, (ir.Type, ir.Value)): + continue + if isinstance(arg, (ir.Type, ir.Value, ir.OpResult)): + continue + if isinstance(type(arg), (ir.Type, ir.Value, ir.OpResult)): + continue + if self._is_tensor_descriptor(arg): + continue + if inspect.isclass(spec_ty): + class_name = str(arg).replace("class", "") + class_name = class_name.replace(" ", "") + function_name = f"{function_name}_{class_name}" + elif isinstance(arg, (list, tuple)): + function_name = f"{function_name}_{'_'.join(map(str, arg))}" + else: + function_name = f"{function_name}_{arg}" + # we would need a dedicated MR to follow up + unwanted_chars = r"'-![]#,.<>()\":{}=%?@;" + translation_table = str.maketrans("", "", unwanted_chars) + function_name = function_name.translate(translation_table) + # identify address and drop + function_name = re.sub(r"0x[a-f0-9]{8,16}", "", function_name) + function_name = re.sub(r"\s+", " ", function_name) + function_name = function_name.replace(" ", "_") + function_name = function_name.replace("\n", "_") + # max fname is 256 character, leave space + function_name = function_name[:180] + log().info(f"Final mangled function name: 
{function_name}") + return function_name + + def _generate_execution_arguments_for_known_types( + self, arg, arg_spec, arg_name, i, fop_args, iv_block_args + ): + """ + Generate MLIR arguments for known types. + + Sub-DSLs can override this method to handle types that are not + natively supported by the Base DSL. + """ + ir_arg = [] + if is_argument_constexpr(arg, arg_spec, arg_name, i, func): + ir_arg.append(arg) + + return ir_arg, iv_block_args + + def generate_execution_arguments( + self, + args, + kwargs, + fop, + args_spec: inspect.FullArgSpec, + ): + """Create list of arguments that will be passed to MLIR's func.func op""" + + def gen_exec_args(input_args, arg_names, annotations, fop_args): + assert len(input_args) == len(arg_names) + + ir_args = [] + iv_block_args = 0 + for i, arg in enumerate(input_args): + arg_name = arg_names[i] + arg_spec = annotations.get(arg_name, None) + log().debug("Processing [%d] Argument [%s : %s]", i, arg_name, arg_spec) + + # Implicit cast to NumericMeta + if isinstance(arg_spec, t.NumericMeta): + arg = t.cast(arg, arg_spec) + + ir_arg, iv_block_args = ( + self._generate_execution_arguments_for_known_types( + arg, arg_spec, arg_name, i, fop_args, iv_block_args + ) + ) + + if not ir_arg: + # If it's not a known type, try JIT argument adapter + # to convert the argument if possible + adapter = JitArgAdapterRegistry.get_registered_adapter(type(arg)) + arg = adapter(arg) if adapter else arg + + n_args = len(get_mlir_types(arg)) + blk_args = fop_args[iv_block_args : iv_block_args + n_args] + ir_arg.append(new_from_mlir_values(arg, blk_args)) + iv_block_args += n_args + + self.log_additions(ir_arg) + ir_args.extend(ir_arg) + + return ir_args + + fop_args = list(fop.regions[0].blocks[0].arguments) + ir_args = gen_exec_args(args, args_spec.args, args_spec.annotations, fop_args) + ir_kwargs = gen_exec_args( + [kwargs[arg] for arg in args_spec.kwonlyargs], + args_spec.kwonlyargs, + args_spec.annotations, + fop_args[len(ir_args) :], + ) + 
ir_kwargs = {k: v for k, v in zip(args_spec.kwonlyargs, ir_kwargs)} + + log().debug("execution args: %s", ", ".join(map(str, ir_args))) + log().debug("execution kwargs: %s", ", ".join(map(str, ir_kwargs))) + return ir_args, ir_kwargs + + @abstractmethod + def _generate_mlir_type_for_tensor_descriptor(self, tensor: TensorDescriptor): + """ + Generate MLIR type for the tensor descriptor. + """ + pass + + @abstractmethod + def _generate_executable_arg_for_tensor_descriptor( + self, mlir_value=None, ptr_tensor_ty=None, tensor=None + ): + """ + Generates executable value for the given tensor descriptor. + """ + pass + + @abstractmethod + def _get_globals(self): + """ + Combines global and local variables from the current context and the + caller's frame comes. This includes the current module's globals, the + global variables from the caller's frame, and the local variables from + the caller's frame. + + "self.frame" is used to fetch the caller's frame. + + AST preprocessor generates a new python code, so the resulting globals + dictionary is used to execute the python code. + """ + pass + + def _is_tensor_descriptor(self, maybe_tensor_descriptor) -> bool: + return isinstance( + maybe_tensor_descriptor, TensorDescriptor + ) or TensorDescriptor.can_transformed_to_dlpack(maybe_tensor_descriptor) + + def _handle_tensor_descriptor( + self, maybe_tensor, arg_name: str, need_gpu_memory: bool + ) -> TensorDescriptor: + if self._is_tensor_descriptor(maybe_tensor): + tensor = ( + maybe_tensor + if isinstance(maybe_tensor, TensorDescriptor) + else TensorDescriptor(maybe_tensor) + ) + if need_gpu_memory and not tensor.is_in_device: + log().info( + "FAIL name=[%s] tensor=[%s] in_gpu=[%s]", + arg_name, + tensor, + tensor.is_in_device, + ) + raise DSLRuntimeError( + f'Tensor "{arg_name}" is tensor "{tensor}" ' + "is not in the GPU memory. " + ) + + return tensor + + raise DSLRuntimeError( + f"Argument {arg_name} could not be transformed into a TensorDescriptor." 
+ ) + + def _validate_arg(self, arg, arg_index, arg_name, arg_spec): + """ + Validates if the arg is really of the annotated type for type safety. + + The default implementation is empty. Subclasses can override this method to add more validation logic. + Returns None if validation passes, otherwise returns an error derived from DSLBaseError. + """ + pass + + def _generate_jit_func_args_for_known_types( + self, + func, + arg, + arg_name, + arg_spec, + arg_index, + *, + is_host=True, + ): + """ + Generate JIT function arguments for known types. + + Sub-DSLs can override this method to handle types that are not + natively supported by the Base DSL. + """ + + jit_arg_type, jit_arg_attr, jit_exec_arg = [], [], [] + default_attr = ir.DictAttr.get({}) + + if is_argument_constexpr(arg, arg_spec, arg_name, arg_index, func): + jit_exec_arg = jit_arg_type = jit_arg_attr = None + + return jit_exec_arg, jit_arg_type, jit_arg_attr + + def _generate_jit_func_args( + self, + func, + function_name, + args, + kwargs, + args_spec: inspect.FullArgSpec, + *, + is_host=True, + ): + """Generate JIT function arguments.""" + + assert len(args) == len(args_spec.args) and len(kwargs) == len( + args_spec.kwonlyargs + ), f"Input args {len(args)=} and kwargs {len(kwargs)=} must match arg_spec.args " + f"{len(args_spec.args)=} and arg_spec.kwonlyargs {len(args_spec.kwonlyargs)=}" + + jit_arg_types, jit_arg_attrs, jit_exec_args = [], [], [] + default_attr = ir.DictAttr.get({}) + + input_args = [*args, *kwargs.values()] + input_arg_names = [*args_spec.args, *args_spec.kwonlyargs] + for i, (arg_name, arg) in enumerate(zip(input_arg_names, input_args)): + spec_ty = args_spec.annotations.get(arg_name, None) + log().debug("Processing [%d] Argument [%s : %s]", i, arg_name, spec_ty) + + # Implicitly convert into Numeric type if possible + if isinstance(spec_ty, t.NumericMeta): + arg = t.cast(arg, spec_ty) + + # Type safety check + if spec_ty is not None: + err = self._validate_arg(arg, i, arg_name, 
spec_ty) + if err is not None: + raise err + + jit_exec_arg, jit_arg_type, jit_arg_attr = ( + self._generate_jit_func_args_for_known_types( + func, + arg, + arg_name, + spec_ty, + i, + is_host=is_host, + ) + ) + + if jit_arg_type is not None and len(jit_arg_type) == 0: + # If not any known type, try JIT argument adapter + # to convert the argument + adapter = JitArgAdapterRegistry.get_registered_adapter(type(arg)) + arg = adapter(arg) if adapter else arg + + if is_host: + jit_exec_arg.extend(get_c_pointers(arg)) + jit_arg_type.extend(get_mlir_types(arg)) + else: + dyn_vals = extract_mlir_values(arg) + jit_exec_arg.extend(dyn_vals) + jit_arg_type.extend([v.type for v in dyn_vals]) + + if not jit_arg_type or not jit_exec_arg: + if (is_host and hasattr(arg, "__c_pointers__")) or ( + not is_host + and hasattr(arg, "__extract_mlir_values__") + and hasattr(arg, "__new_from_mlir_values__") + ): + pass + else: + raise DSLRuntimeError( + f"failed to generate argument #{i+1} ({arg_name}) for JIT function '{function_name}'.", + context={ + f"Argument {arg_name}": "The DSL attempted to convert it into Dynamic Expression (aka MLIR values) but failed.", + f"Call-site argument value": arg, + f"Call-site argument type": type(arg), + }, + suggestion=f"Consider annotating the argument with `{arg_name} : Constexpr` " + "if it's a value known at compile-time. 
" + f"Otherwise, implement the {'`JitArgument`' if is_host else '`DynamicExpression`'} " + f"protocol or register a custom JIT argument adapter for type `{type(arg)}` to " + "enable dynamic value conversion at runtime.", + ) + + jit_arg_attr.extend([default_attr] * len(jit_arg_type)) + + if jit_arg_type is not None: + jit_exec_args.extend(jit_exec_arg) + jit_arg_types.extend(jit_arg_type) + jit_arg_attrs.extend(jit_arg_attr) + + return jit_exec_args, jit_arg_types, jit_arg_attrs + + def generate_mlir_function_types( + self, func, function_name, input_args, kwargs, args_spec: inspect.FullArgSpec + ): + """Convert input arguments to MLIR function signature also convert numpy arrays to memref.""" + + exe_args, types, _ = self._generate_jit_func_args( + func, function_name, input_args, kwargs, args_spec, is_host=True + ) + + log().debug("Execution Arguments: %s", ", ".join(map(str, exe_args))) + log().debug("Types: %s", ", ".join(map(str, types))) + + assert len(exe_args) == len( + types + ), "expects the same number of arguments and function parameters" + + return exe_args, types + + @dataclass + class LaunchConfig: + cluster: list = None + grid: list = field(default_factory=lambda: [1, 1, 1]) + block: list = field(default_factory=lambda: [1, 1, 1]) + smem: int = 0 + async_deps: list = field(default_factory=list) + has_cluster: bool = False + min_blocks_per_mp: int = 0 + + def __post_init__(self): + if len(self.grid) != 3: + raise DSLRuntimeError(f"Expect 3d grid!") + if len(self.block) != 3: + raise DSLRuntimeError(f"Expect 3d block!") + + self.has_cluster = self.cluster is not None + if self.cluster is None: + self.cluster = [None, None, None] + elif len(self.cluster) != 3: + raise DSLRuntimeError(f"Expect 3d cluster!") + + def diagnostic(self): + """Check command line parameters and enables diagnostic""" + # Check command line arguments "-diagnostic" + parser = argparse.ArgumentParser(description="Process diagnostic status.") + parser.add_argument( + "-diagnostic", 
+ nargs="?", + const="all", + choices=["all", "fail", "success", "info", "suggestion"], + help="Set diagnostic status (fail, success, info, suggestion).", + ) + + args, _ = parser.parse_known_args() + ctx = ir.Context.current + + def callback(d): + print(f" [{self.name} Diagnostic] : {d.message}") + + ctx.attach_diagnostic_handler(callback) + + # Early return, don't enable diagnostics + if args.diagnostic is None: + return + + # Enable MLIR Flags + ctx.emit_error_diagnostics = True + ir._GlobalDebug.flag = True + if args.diagnostic == "all": + ir._GlobalDebug.set_types("diagnostic") + else: + ir._GlobalDebug.set_types(f"diagnostic-{args.diagnostic}") + + def get_location(self): + """ + Get python location information and generate MLIR location + """ + + frame = self.frame + if frame is None: + print("Frame is None") + return None + + file_loc = ir.Location.file(frame.f_code.co_filename, frame.f_lineno, 0) + + def print_all_frames(): + for i, frame in enumerate(inspect.stack()): + print( + f"Frame {i}: {frame.function} in {frame.filename}, line {frame.lineno}" + ) + + loc = ir.Location.name(frame.f_code.co_name, childLoc=file_loc) + return loc + + def compile_and_jit(self, module, pipeline, shared_libs, function_name=""): + """ + Compile and JIT an MLIR module. + """ + + try: + self.diagnostic() + + orig_stdout = sys.stdout + orig_stderr = sys.stderr + sys.stderr = redirect_stderr = io.StringIO() + sys.stdout = redirect_stdout = io.StringIO() + + try: + kernel = self.compiler_provider.compile_and_jit( + module, + pipeline, + shared_libs=shared_libs, + cuda_toolkit=self.envar.cuda_toolkit, + arch=self.envar.arch, + ) + + finally: + sys.stdout = orig_stdout + sys.stderr = orig_stderr + ir._GlobalDebug.flag = False + + # Print captured output. 
+ print(redirect_stdout.getvalue(), file=sys.stdout, end="") + print(redirect_stderr.getvalue(), file=sys.stderr, end="") + + return kernel + + except Exception as e: + raise DSLRuntimeError("🧊🧊🧊 ICE 🧊🧊🧊", cause=e) + finally: + pass + + def preprocess_pipeline(self, pipeline, arch) -> str: + + if self.envar.cuda_toolkit is None: + self.print_warning( + "CUDA_TOOLKIT_PATH environment variable is not set. Cannot set toolkitPath." + ) + + options = { + "toolkitPath": self.envar.cuda_toolkit if self.envar.cuda_toolkit else None, + self.pass_sm_arch_name: arch, + } + + opt_str = "" + for k, v in options.items(): + if v: + opt_str += f"{k}={v} " + + if opt_str: + # Automatically append the pipeline options if any is specified through env var + pattern = re.compile(r"{(.+)}") + match = pattern.search(pipeline) + if match: + opt_str = f"{{{match[1]} {opt_str}}}" + pipeline = re.sub(r"{.+}", opt_str, pipeline) + else: + pipeline = pipeline.rstrip(")") + f"{{{opt_str}}})" + log().debug(f"Using pipeline = {pipeline}") + return pipeline + + def get_shared_libs(self) -> list: + shared_libs = [] + support_libs = self.envar.shared_libs + if support_libs is not None: + _libs = support_libs.split(":") + for lib in _libs: + if not os.path.exists(lib): + raise FileNotFoundError( + errno.ENOENT, os.strerror(errno.ENOENT), lib + ) + shared_libs.append(lib) + else: + self.print_warning(f"{self.name}_LIBS environment variable is not set") + + return shared_libs + + @lru_cache(maxsize=1) + def get_version(self): + version_hash = hashlib.sha256() + + return version_hash + + def get_module_hash(self, module, function_name): + s = io.BytesIO() + module.operation.write_bytecode(s) + for attr, value in self.envar.__dict__.items(): + if value is not None: + s.write(str(value).encode()) + module_hash = self.get_version().copy() + module_hash.update(s.getvalue()) + module_hash = module_hash.hexdigest() + + log().debug("Bytecode=[%s]", s.getvalue().hex()) + log().debug("Version=[%s]", 
self.get_version().hexdigest()) + log().info( + "Function=[%s] Computed module_hash=[%s]", function_name, module_hash + ) + return module_hash + + def build_module(self, module, function_name: str): + """ + Build the MLIR module, verify and return the module + """ + + # Save IR in a file + if self.envar.keepIR: + save_ir(self.name, module, function_name) + + if self.envar.printIR: + print("\n//===--- ------ Generated IR ------ ---====\n") + module.operation.print( + enable_debug_info=self.envar.generate_source_location + ) + print("\n//===--- --- End of Generated IR -- ---====\n") + + # Verify the module + try: + module.operation.verify() + except Exception as e: + raise DSLRuntimeError(f"🧊🧊🧊 ICE IR Verification Failed 🧊🧊🧊", cause=e) + + return module + + def generate_original_ir( + self, + ir, + func, + funcBody, + kwargs, + function_name, + func_types, + gpu_module_attrs, + args, + args_spec, + ): + # This location is set to None for now; otherwise, calls to the same + # function on different lines would produce different line numbers, + # which would break the cache. + loc = None # self.get_location() + + def build_ir_module(): + module = ir.Module.create(loc=loc) + unit_attr = ir.UnitAttr.get() + module.operation.attributes["gpu.container_module"] = unit_attr + + with ir.InsertionPoint(module.body): + # Always generate gpu module. It's canonicalized by the compiler when it's not used. 
+ self._build_gpu_module(gpu_module_attrs) + + fop = func.FuncOp(function_name, (func_types, []), loc=loc) + fop.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() + log().debug("Generated Function OP [%s]", fop) + with ir.InsertionPoint(fop.add_entry_block()): + ir_args, ir_kwargs = self.generate_execution_arguments( + args, kwargs, fop, args_spec + ) + # Call user function body + try: + result = funcBody(*ir_args, **ir_kwargs) + func.ReturnOp([]) + except DSLAstPreprocessorError as pp_error: + raise pp_error + except NameError as name_error: + raise DSLRuntimeError( + f"💥💥💥 Error during runtime code generation for function `{funcBody.__name__}` 💥💥💥", + cause=name_error, + suggestion="Using variables defined in dynamic control flow is not supported. Please give an initial value before control flow.", + ) + except DSLRuntimeError as dsl_error: + # Throw it's already a DSL error + raise dsl_error + except Exception as general_e: + # Transform internal error to a DSL error + raise DSLRuntimeError( + f"💥💥💥 Error during runtime code generation for function `{funcBody.__name__}` 💥💥💥" + ) from general_e + return module, result + + # Build IR module + profiler = timer(enable=self.envar.jitTimeProfiling) + module, result = profiler(build_ir_module)() + module_hash = self.get_module_hash(module, function_name) + + module = self.build_module(module, function_name) + + return module, module_hash, result + + def compile_and_cache( + self, module, module_hash, function_name, pipeline, args_spec, no_cache + ): + arch = self.envar.arch + pipeline = self.preprocess_pipeline(self._get_pipeline(pipeline), arch) + shared_libs = self.get_shared_libs() + profiler = timer(enable=self.envar.jitTimeProfiling) + if ( + no_cache + or module_hash not in self.jit_cache + or self.jit_cache[module_hash].ir_module is None + ): + log().info( + "JIT cache miss function=[%s] module_hash=[%s]", + function_name, + module_hash, + ) + # Compile and JIT MLIR module + engine = 
profiler(self.compile_and_jit)( + module, pipeline, shared_libs, function_name=function_name + ) + else: + log().info( + "JIT cache hit IN-FILE function=[%s] module_hash=[%s]", + function_name, + module_hash, + ) + module = self.jit_cache[module_hash].ir_module + engine = self.compiler_provider.jit(module, shared_libs=shared_libs) + capi_func = profiler(engine.lookup)(function_name) + jit_executor = JitExecutor( + self, + engine, + capi_func, + module, + args_spec, + function_name, + jit_time_profiling=self.envar.jitTimeProfiling, + ) + jit_executor = jit_executor.update_jit_cuda_modules(self.kernel_symbols) + + if not no_cache: + # module stored in cache is compiled. + self.jit_cache[module_hash] = jit_executor + + return jit_executor + + def post_compilation_cleanup(self): + """Clean up some internal state after one compilation is completed.""" + # clear the kernel symbols after the compilation is done. + self.kernel_symbols = [] + self.launch_inner_count = 0 + # reset num_kernels to 0 for next compilation. + self.num_kernels = 0 + + def generate_mlir( + self, + funcBody, + kwargs, + function_name, + gpu_module_attrs, + args, + args_spec, + pipeline, + no_cache, + compile_only, + loc=None, + ): + """Generate MLIR module and compile iself.T_provider.""" + with ir.Context(), ir.Location.unknown(): + # Convert input arguments to MLIR arguments + exe_args, func_types = self.generate_mlir_function_types( + funcBody, function_name, args, kwargs, args_spec + ) + + # Generate original ir module and its hash value. 
+ module, module_hash, result = self.generate_original_ir( + ir, + func, + funcBody, + kwargs, + function_name, + func_types, + gpu_module_attrs, + args, + args_spec, + ) + + # dryrun is used to only generate IR + if self.envar.dryrun: + return result + + if ( + no_cache + or module_hash not in self.jit_cache + or self.jit_cache[module_hash].capi_func is None + ): + # no cache or cache miss, do ir generation/compilation/jit engine + jit_executor = self.compile_and_cache( + module, module_hash, function_name, pipeline, args_spec, no_cache + ) + else: + # cache hit + log().info( + "JIT cache hit IN-MEMORY function=[%s] module_hash=[%s]", + function_name, + module_hash, + ) + jit_executor = self.jit_cache[module_hash] + + self.post_compilation_cleanup() + # If compile_only is set, bypass execution return the jit_executor directly + if compile_only: + return jit_executor + # Run the compiled program + jit_executor.run_compiled_program(exe_args) + + return result + + def run_preprocessor(self, funcBody): + if not hasattr(funcBody, "_preprocessed"): + function_name = funcBody.__name__ + self.funcBody = funcBody + log().info("Started preprocessing [%s]", function_name) + exec_globals = self._get_globals() + transformed_ast = self.preprocessor.transform(funcBody, exec_globals) + if self.envar.print_after_preprocessor: + log().info( + f"# Printing unparsed AST after preprocess of func=`{function_name}` id=`{id(funcBody)}`" + ) + DSLPreprocessor.print_ast(transformed_ast) + funcBody._preprocessed = True + return transformed_ast + return None + + def get_function_ptr(self, original_function, transformed_ast): + file_name = inspect.getsourcefile(original_function) + code_object = compile(transformed_ast, filename=file_name, mode="exec") + return self.preprocessor.exec( + original_function.__name__, + original_function, + code_object, + self._get_globals(), + ) + + @lru_cache(maxsize=None) + def _get_function_signature(self, func): + return inspect.signature(func) + + def 
_get_function_bound_args(self, sig, func_name, *args, **kwargs): + """ + Binds provided arguments to a function's signature and applies default values. + + E.g. given a function signature `def foo(a, b=2, c=3)`, and at call-site if we do + `foo(a=1, c=4)`, the returned BoundArguments object will have args = `[1]` + and kwargs = `{'b': 2, 'c': 4}` + + An exception will be raised if binding fails. + """ + try: + bound_args = sig.bind_partial(*args, **kwargs) + bound_args.apply_defaults() + except Exception as e: + raise DSLRuntimeError( + f"Failed to bind arguments to function `{func_name}` with signature `{sig}`", + cause=e, + ) + return bound_args + + def _canonicalize_args(self, *args, **kwargs): + """ + Canonicalize the input arguments so that returned args only contain + positional arguments and kwargs only contain keyword arguments. + """ + sig = self._get_function_signature(self.funcBody) + function_name = self.funcBody.__name__ + bound_args = self._get_function_bound_args(sig, function_name, *args, **kwargs) + canonicalized_args = bound_args.args + canonicalized_kwargs = bound_args.kwargs + return canonicalized_args, canonicalized_kwargs + + def _check_arg_count(self, *args, **kwargs): + if not self.funcBody: + raise DSLRuntimeError("Function body is not set.") + + # Pass the actual function object to _get_function_signature. + sig = self._get_function_signature(self.funcBody) + function_name = self.funcBody.__name__ + + bound_args = self._get_function_bound_args(sig, function_name, *args, **kwargs) + + # Check if all non-default arguments are provided + for param in sig.parameters.values(): + if ( + param.default is inspect.Parameter.empty + and param.name not in bound_args.arguments + ): + raise DSLRuntimeError( + f"Missing required argument in `{function_name}`: '{param.name}'" + ) + + def _func(self, funcBody, *args, **kwargs): + """Decorator for MLIR functions. + It cuts the boilerplate code, does the following: + 1. Generates `func.func` + 2. 
Types translation (numpy arrays -> cute.memref, Python float -> MLIR float, etc.)
+ assert kernel_name is not None, "kernel name is empty" + pass + + @abstractmethod + def generate_func_ret_op(self): + pass + + @abstractmethod + def generate_launch_op(self, *args, **kwargs): + pass + + @abstractmethod + def get_func_body_start(self): + pass + + @abstractmethod + def enter_gpu_module(module): + """Compute the insertion point into the given module.""" + pass + + @lru_cache(maxsize=1) + def _get_default_stream(self): + """Returns the default stream 0""" + from .runtime import cuda as cuda_helpers + + return cuda_helpers.stream_create() + + def _execute_cuda( + self, fname_cubin, kernel_name, grid_size, block_size, stream=None + ): + """ + Executes a specified CUDA kernel from a cubin file, handling module loading, + kernel retrieval, stream creation, kernel launch, and synchronization. + """ + from .runtime import cuda as cuda_helpers + + # Step 1. Load CUDA Module + module = cuda_helpers.load_cubin_module(fname_cubin) + # Step 2. Find CUDA function + kernel_ptr = cuda_helpers.get_kernel_function(module, kernel_name) + + sync_execution_default = False + if stream is None: + stream = self._get_default_stream() + sync_execution_default = True + + # Step 4. Launch the kernel + cuda_helpers.launch_kernel( + kernel_ptr, + grid_size, + block_size, + stream, + smem_size=16000, + kernel_args=self.exe_args, + ) + + if sync_execution_default: + # Step 5. Optional Sync cuda stream + cuda_helpers.stream_sync(stream) + + def _execute_by_cuda_driver( + self, kernel_generator, generate_cubin, grid_size, block_size, stream=None + ): + """ + This function builds IR and execute the module using cuda driver. + It doesn't use mlir's cuda runtime + """ + ret = None + + # Step 1. 
Build IR + with ir.Context(), ir.Location.unknown(): + loc = self.get_location() + module = ir.Module.create(loc=loc) + unit_attr = ir.UnitAttr.get() + module.operation.attributes["gpu.container_module"] = unit_attr + with ir.InsertionPoint(module.body): + self._build_gpu_module() + ret, kernel_name = kernel_generator() + log().debug( + f"Kernel generator returned: ret={ret}, kernel_name={kernel_name}" + ) + + module = self.build_module(module, kernel_name) + + # dryrun is used to only generate IR + if self.envar.dryrun: + return ret + + # Generate cubin + fname_cubin = generate_cubin(module, kernel_name) + + # Execute a cuda kernel from cubin + if block_size is None: + # The TileIR driver should set this automatically. + block_size = self.block_size + self._execute_cuda(fname_cubin, kernel_name, grid_size, block_size, stream) + + return ret + + def generate_kernel_operands_and_types( + self, kernel_func, kernel_name, args_spec, args, kwargs + ): + """ + Generate the operands and types for the kernel function + """ + + kernel_operands, kernel_arg_types, kernel_arg_attrs = [], [], [] + + log().debug( + "Processing GPU kernel call in [%s] mode", + ( + f"Only {self.device_jit_decorator_name}" + if self.device_compilation_only + else f"{self.host_jit_decorator_name} + {self.device_jit_decorator_name}" + ), + ) + + if self.device_compilation_only: + return kernel_operands, kernel_arg_types, kernel_arg_attrs + + kernel_operands, kernel_arg_types, kernel_arg_attrs = ( + self._generate_jit_func_args( + kernel_func, kernel_name, args, kwargs, args_spec, is_host=False + ) + ) + + log().debug("Final kernel_operands: %s", ", ".join(map(str, kernel_operands))) + log().debug("Final kernel_arg_types: %s", ", ".join(map(str, kernel_arg_types))) + log().debug("Final kernel_arg_attrs: %s", ", ".join(map(str, kernel_arg_attrs))) + + assert ( + len(kernel_operands) == len(kernel_arg_types) == len(kernel_arg_attrs) + ), "Size of kernel_operands, kernel_arg_types and kernel_arg_attrs 
must be equal" + + return kernel_operands, kernel_arg_types, kernel_arg_attrs + + def kernel_launcher(self, *dargs, **dkwargs): + def decorator(funcBody): + @wraps(funcBody) + def kernel_wrapper(*args, **kwargs): + """ + Base decorator for generating kernel function + + This decorator provides a template for kernel function generation + including kernel function header/body and kernel launch op at call site + + Optional arguments (with default value in <>): + - requiredArgs <[]>: specifies the mandatory arguments that must present in kernel function signature + the args will be validated and collected as a namedtuple + - optionalArgs <[]>: specifies the optional arguments that might present in kernel function signature + the args will be collected (if present) as a namedtuple + - unitAttrNames <[]>: specifies the name(s) of ir.UnitAttr to be set for kernel function op + - valueAttrDict <{}>: specifies the name(s) and value(s) of ir.Attribute to be set for kernel function op + - kernelGenHelper : specifies the mandatory customized kernel generation helper class (derived from _KernelGenHelper) + + Return value: + A namedtuple "KernelReturns" is returned with following fields: + - kernel_func_ret: the return of the kernel function + - launch_op_ret: the return of the launch op + """ + + requiredArgs = dkwargs.get("requiredArgs", []) + optionalArgs = dkwargs.get("optionalArgs", []) + unitAttrNames = dkwargs.get("unitAttrNames", []) + valueAttrDict = dkwargs.get("valueAttrDict", {}) + kernelGenHelper = dkwargs.get("kernelGenHelper", None) + + kernel_name = funcBody.__name__ + args_spec = inspect.getfullargspec(funcBody) + self.funcBody = funcBody + + # Give each kernel a unique name. (The same kernel may be + # called multiple times, resulting in multiple kernel traces.) + # The mangled name of Python function is part of the name to + # improve readability. 
+ kernel_name = f"kernel_{self.mangle_name(kernel_name, args, args_spec)}_{self.num_kernels}" + self.num_kernels += 1 + + # Step 0. Preprocess the arguments + def extract_args(argNames, assertIfNone=False) -> list: + extracted = [] + for name in argNames: + value = kwargs.pop(name, None) + if assertIfNone and value is None: + raise DSLRuntimeError( + f"{name} is required for {kernel_name}" + ) + extracted.append(value) + + return extracted + + RequiredArgs = namedtuple("RequiredArgs", requiredArgs) + req_args = ( + RequiredArgs._make(extract_args(requiredArgs, assertIfNone=True)) + if requiredArgs + else None + ) + OptionalArgs = namedtuple("OptionalArgs", optionalArgs) + opt_args = ( + OptionalArgs._make(extract_args(optionalArgs)) + if optionalArgs + else None + ) + assert ( + kernelGenHelper is not None + ), "kernelGenHelper should be explicitly specified!" + + # check arguments + self._check_arg_count(*args, **kwargs) + + # Canonicalize the input arguments + canonicalized_args, canonicalized_kwargs = self._canonicalize_args( + *args, **kwargs + ) + + kernel_operands, kernel_types, kernel_arg_attrs = ( + self.generate_kernel_operands_and_types( + funcBody, + kernel_name, + args_spec, + canonicalized_args, + canonicalized_kwargs, + ) + ) + + with self._enter_gpu_module(): + log().debug("Generating device kernel") + if self.device_compilation_only: + log().debug("Generating cuda-python arguments") + # Convert input arguments to MLIR arguments + self.exe_args, kernel_types = self.generate_mlir_function_types( + funcBody, + kernel_name, + canonicalized_args, + canonicalized_kwargs, + args_spec, + ) + + helper = kernelGenHelper() + loc = self.get_location() + fop = helper.generate_func_op( + kernel_types, kernel_arg_attrs, kernel_name, loc + ) + log().debug(f"Kernel function op: {fop}") + for attr in unitAttrNames: + fop.attributes[attr] = ir.UnitAttr.get() + for key, val in valueAttrDict.items(): + fop.attributes[key] = val + + fop.sym_visibility = 
ir.StringAttr.get("public") + with ir.InsertionPoint(helper.get_func_body_start()): + ir_args, ir_kwargs = self.generate_execution_arguments( + canonicalized_args, canonicalized_kwargs, fop, args_spec + ) + log().debug( + f"IR arguments - args: {ir_args} ; kwargs: {ir_kwargs}" + ) + # Call user function body + kernel_ret = funcBody(*ir_args, **ir_kwargs) + helper.generate_func_ret_op() + + # Step 3. Generate call site `launch_func` + kernel_sym = ir.SymbolRefAttr.get(["kernels", kernel_name]) + launch_ret = helper.generate_launch_op( + kernelSym=kernel_sym, + kernelOperands=kernel_operands, + requiredArgs=req_args, + optionalArgs=opt_args, + ) + + KernelReturns = namedtuple( + "KernelReturns", ["kernel_func_ret", "launch_op_ret"] + ) + result = KernelReturns( + kernel_func_ret=kernel_ret, launch_op_ret=launch_ret + ) + log().debug(f"Kernel result: {result}, kernel name: {kernel_name}") + return result, kernel_name + + return kernel_wrapper + + if len(dargs) == 1 and callable(dargs[0]): + return decorator(dargs[0]) + else: + return decorator diff --git a/python/CuTeDSL/base_dsl/env_manager.py b/python/CuTeDSL/base_dsl/env_manager.py new file mode 100644 index 00000000..ef1fea7a --- /dev/null +++ b/python/CuTeDSL/base_dsl/env_manager.py @@ -0,0 +1,303 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides utilities for the environment variables setup. 
+ +It provides an EnvironmentVarManager, which reads environment variables for the DSL +and caches them for efficient access. + +It also provides utilities to automatically setup a subset of environment variables +based on heuristics. +""" + +import os +import sys +import shutil +import glob +from pathlib import Path +from functools import lru_cache +from typing import Any + +from ..base_dsl.runtime.cuda import get_compute_capability_major_minor +from .utils.logger import log + +IS_WINDOWS = sys.platform == "win32" +CLIB_EXT = ".dll" if IS_WINDOWS else ".so" + +# ============================================================================= +# Environment Variable Helpers +# ============================================================================= + + +@lru_cache(maxsize=None) +def get_str_env_var(var_name, default_value=None): + value = os.getenv(var_name) + return value if value is not None else default_value + + +@lru_cache(maxsize=None) +def get_bool_env_var(var_name, default_value=False): + value = get_str_env_var(var_name) + if value is None: + return default_value + return value not in {"False", "0", ""} + + +@lru_cache(maxsize=None) +def get_int_env_var(var_name, default_value=0): + value = get_str_env_var(var_name) + return int(value) if value and value.isdigit() else default_value + + +def detect_gpu_arch(prefix): + """ + Attempts to detect the machine's GPU architecture. + + Returns: + A string representing the GPU architecture (e.g. "70" for compute capability 7.0), + or a default value(e.g. "sm_100") if the GPU architecture cannot be determined. 
+ """ + arch = (None, None) + try: + arch = get_compute_capability_major_minor() + except Exception as e: + log().info(f"Failed to get CUDA compute capability: {e}") + + if arch == (None, None): + # default to sm_100 + arch = (10, 0) + + major, minor = arch + suffix = "" + if major >= 9 and minor >= 0: + suffix = "a" + elif minor != 0: + # e.g sm_86, belong with sm_80 family + minor = 0 + return f"sm_{major}{minor}{suffix}" + + +def find_libs_in_ancestors(start, target_libs, lib_folder_guesses): + """ + Search ancestor directories for a candidate library folder containing all required libraries. + + Starting from the given path, this function traverses up through each parent directory. + For every ancestor, it checks candidate subdirectories (specified by lib_folder_guesses) + for files that match the required library extension (CLIB_EXT). Library file names are + canonicalized by removing the "lib" prefix from their stem. If a candidate directory contains + all of the required libraries (as specified in target_libs), the function returns a list of + absolute paths to these library files. + + Parameters: + start (str or Path): The starting directory from which to begin the search. + target_libs (iterable of str): A collection of required library names (without the "lib" prefix). + lib_folder_guesses (iterable of str): Relative paths from an ancestor directory that may contain the libraries. + + Returns: + list[str] or None: A list of resolved paths to the required library files if found; otherwise, None. + """ + # Traverse through all parent directories of the resolved starting path. + for ancestor in Path(start).resolve().parents: + # Iterate over each candidate relative directory path. + for rel_path in lib_folder_guesses: + target_dir = ancestor / rel_path + # Skip if the candidate directory does not exist. + if not target_dir.is_dir(): + continue + + # Initialize a list to hold the resolved paths of matching library files. 
+ libs_cand = [] + # Create a set of the remaining libraries we need to find. + remaining_libs = set(target_libs) + + # Iterate over all items in the candidate directory. + for p in target_dir.iterdir(): + # Consider only files with the expected library extension. + if p.suffix == CLIB_EXT: + # Canonicalize the library name by removing the "lib" prefix. + lib_name = p.stem.removeprefix("lib") + # If this library is required, add its resolved path and mark it as found. + if lib_name in remaining_libs: + libs_cand.append(str(p.resolve())) + remaining_libs.remove(lib_name) + + # If all required libraries have been found, return the list of library paths. + if len(remaining_libs) == 0: + return libs_cand + + # Return None if no candidate directory contains all required libraries. + return None + + +def _find_cuda_home(): + """Find the CUDA installation path using a series of heuristic methods. + Methods below are checked in order, and the function returns on first match: + 1. Checking the environment variables CUDA_HOME and CUDA_PATH. + 2. Searching for the 'nvcc' compiler in the system PATH and deriving the path of cuda. + 3. Scanning common installation directories based on the operating system. + - On Windows systems (when IS_WINDOWS is True), it searches in: + C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v*.* + - On Unix-like systems, it searches in: + /usr/local/cuda* + + Returns: + Optional[str]: The absolute CUDA installation path if found; otherwise, None. + + Note: + The variable IS_WINDOWS is defined in the module scope. 
+ """ + # Guess #1 + cuda_home = get_str_env_var("CUDA_HOME") or get_str_env_var("CUDA_PATH") + if cuda_home is None: + # Guess #2 + nvcc_path = shutil.which("nvcc") + if nvcc_path is not None: + cuda_home = os.path.dirname(os.path.dirname(nvcc_path)) + else: + # Guess #3 + if IS_WINDOWS: + glob_pat = "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v*.*" + else: + glob_pat = "/usr/local/cuda*" + cuda_homes = glob.glob(glob_pat) + if len(cuda_homes) == 0: + cuda_home = "" + else: + cuda_home = cuda_homes[0] + if not os.path.exists(cuda_home): + cuda_home = None + return cuda_home + + +def get_cuda_toolkit_path(): + """ + Get cuda_toolkit_path. It returns get_str_env_var('CUDA_TOOLKIT_PATH') if + set. Otherwise, attempts to discover a valid CUDA toolkit location and + return. If not found, return None. + """ + # Check if the environment variable is already set, if so, return it immediately. + try: + cuda_toolkit_path_existing = get_str_env_var("CUDA_TOOLKIT_PATH") + if cuda_toolkit_path_existing: + return cuda_toolkit_path_existing + + found_cuda_home = _find_cuda_home() + if found_cuda_home: + return found_cuda_home + except Exception as e: + log().info("default_env: exception on get_cuda_toolkit_path", e) + return None + + +def get_prefix_dsl_libs(prefix: str): + """ + Returns get_str_env_var('{prefix}_LIBS') if set. + Otherwise, attempts to discover libs based on heuristics and return + If not found, return None. + """ + # Check if the environment variable is already set, if so, return it immediately. 
+ try: + prefix_libs_existing = get_str_env_var(f"{prefix}_LIBS") + if prefix_libs_existing: + return prefix_libs_existing + + def get_libs_cand(start): + target_libs = { + "mlir_c_runner_utils", + "mlir_runner_utils", + "mlir_cuda_runtime", + } + lib_folder_guesses = [ + "lib", + ] + + libs_cand = find_libs_in_ancestors(start, target_libs, lib_folder_guesses) + if libs_cand: + dsl_libs = ":".join(libs_cand) + return dsl_libs + + return None + + # find from install folder + dsl_libs = get_libs_cand(__file__) + + if not dsl_libs: + # try to find from build folder structure + dsl_libs = get_libs_cand(Path(__file__).parent.parent.resolve()) + + return dsl_libs + + except Exception as e: + log().info(f"default_env: exception on get_prefix_dsl_libs", e) + return None + + +class EnvironmentVarManager: + """Manages environment variables for configuration options. + + Printing options: + - [DSL_NAME]_LOG_TO_CONSOLE: Print logging to stderr (default: False) + - [DSL_NAME]_PRINT_AFTER_PREPROCESSOR: Print after preprocess (default: False) + - [DSL_NAME]_PRINT_IR: Print generated IR (default: False) + - [DSL_NAME]_FILTER_STACKTRACE: Filter internal stacktrace (default: True) + File options: + - [DSL_NAME]_KEEP_IR: Save generated IR in a file (default: False) + - [DSL_NAME]_LOG_TO_FILE: Store all logging into a file, excluding COMPILE_LOGS (default: False) + Other options: + - [DSL_NAME]_LOG_LEVEL: Logging level to set, for LOG_TO_CONSOLE or LOG_TO_FILE (default: 1). 
+ - [DSL_NAME]_DRYRUN: Generates IR only (default: False) + - [DSL_NAME]_ARCH: GPU architecture (default: "sm_100") + - [DSL_NAME]_WARNINGS_AS_ERRORS: Enable warnings as error (default: False) + - [DSL_NAME]_WARNINGS_IGNORE: Ignore warnings (default: False) + - [DSL_NAME]_JIT_TIME_PROFILING: Whether or not to profile the IR generation/compilation/execution time (default: False) + - [DSL_NAME]_DISABLE_FILE_CACHING: Disable file caching (default: False) + - [DSL_NAME]_FILE_CACHING_CAPACITY: Limits the number of the cache save/load files (default: 1000) + - [DSL_NAME]_LIBS: Path to dependent shared libraries (default: None) + - [DSL_NAME]_NO_SOURCE_LOCATION: Generate source location (default: False) + """ + + def __init__(self, prefix="DSL"): + self.prefix = prefix # change if needed + + # Printing options + self.log_to_console = get_bool_env_var(f"{prefix}_LOG_TO_CONSOLE", False) + self.print_after_preprocessor = get_bool_env_var( + f"{prefix}_PRINT_AFTER_PREPROCESSOR", False + ) + self.printIR = get_bool_env_var(f"{prefix}_PRINT_IR", False) + self.filterStacktrace = get_bool_env_var(f"{prefix}_FILTER_STACKTRACE", True) + # File options + self.keepIR = get_bool_env_var(f"{prefix}_KEEP_IR", False) + self.log_to_file = get_bool_env_var(f"{prefix}_LOG_TO_FILE", False) + # Other options + self.log_level = get_int_env_var(f"{prefix}_LOG_LEVEL", 1) + self.dryrun = get_bool_env_var(f"{prefix}_DRYRUN", False) + self.arch = get_str_env_var(f"{prefix}_ARCH", detect_gpu_arch(prefix)) + self.warnings_as_errors = get_bool_env_var( + f"{prefix}_WARNINGS_AS_ERRORS", False + ) + self.warnings_ignore = get_bool_env_var(f"{prefix}_WARNINGS_IGNORE", False) + self.jitTimeProfiling = get_bool_env_var(f"{prefix}_JIT_TIME_PROFILING", False) + self.disable_file_caching = get_bool_env_var( + f"{prefix}_DISABLE_FILE_CACHING", False + ) + self.file_caching_capacity = get_int_env_var( + f"{prefix}_FILE_CACHING_CAPACITY", 1000 + ) + self.generate_source_location = not get_bool_env_var( + 
f"{prefix}_NO_SOURCE_LOCATION", False + ) + # set cuda + self.cuda_toolkit = get_cuda_toolkit_path() + + # set mlir shared libraries + self.shared_libs = get_prefix_dsl_libs(prefix) diff --git a/python/CuTeDSL/base_dsl/jit_executor.py b/python/CuTeDSL/base_dsl/jit_executor.py new file mode 100644 index 00000000..2c997be3 --- /dev/null +++ b/python/CuTeDSL/base_dsl/jit_executor.py @@ -0,0 +1,301 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides jit executor related classes +""" +import io +import inspect +import ctypes +import numpy as np +from typing import get_origin + +# Local modules imports +from .utils.timer import timer +from .utils.logger import log +from .common import DSLRuntimeError +from .runtime import cuda as cuda_helpers +from .runtime.jit_arg_adapters import JitArgAdapterRegistry, is_arg_spec_constexpr +from .typing import get_c_pointers +from . 
import typing as t + +# MLIR modules imports +from .._mlir import ir + + +class CudaSingleModule: + def __init__(self, cuda_module, kernel_ptr): + self.cuda_module = cuda_module + self.kernel_ptr = kernel_ptr + + +class CudaModules: + def __init__(self, modules, args): + # list of CudaSingleModule + self.modules = modules + # extra kernel ptr arguments for launch + self.args = args + + +class JitExecutor: + def __init__( + self, + dsl, + engine, + capi_func, + ir_module, + args_spec, + function_name, + cuda_modules: CudaModules = None, + jit_time_profiling=False, + ): + self.dsl = dsl + self.engine = engine + self.capi_func = capi_func + self.ir_module = ir_module + self.args_spec = args_spec + self.function_name = function_name + if args_spec is not None: + self.args_spec = self.filter_runtime_arg_spec(args_spec) + # cuda kernels + self.cuda_modules = cuda_modules + self.jit_time_profiling = jit_time_profiling + + def filter_runtime_arg_spec(self, arg_spec: inspect.FullArgSpec): + runtime_args = [] + runtime_annotations = {} + runtime_defaults = [] + + # Calculate the offset where defaults start in the original args + if arg_spec.defaults: + defaults_start_idx = len(arg_spec.args) - len(arg_spec.defaults) + else: + defaults_start_idx = len(arg_spec.args) + + # Filter arguments and maintain their properties + for i, arg_name in enumerate(arg_spec.args): + arg_type = arg_spec.annotations.get(arg_name, None) + + # Skip compile-time arguments + if is_arg_spec_constexpr(arg_type, arg_name, i, self.function_name): + continue + + # Keep runtime arguments + runtime_args.append(arg_name) + if arg_name in arg_spec.annotations: + runtime_annotations[arg_name] = arg_type + + # Keep corresponding default if it exists + if i >= defaults_start_idx: + default_idx = i - defaults_start_idx + runtime_defaults.append(arg_spec.defaults[default_idx]) + + # Filter kwonlyargs and their defaults + runtime_kwonlyargs = [] + runtime_kwonlydefaults = {} + + if arg_spec.kwonlyargs: + for 
kwarg in arg_spec.kwonlyargs: + arg_type = arg_spec.annotations.get(kwarg, None) + + # Apply same filtering logic + if is_arg_spec_constexpr(arg_type, kwarg, i, self.function_name): + continue + + runtime_kwonlyargs.append(kwarg) + if kwarg in arg_spec.annotations: + runtime_annotations[kwarg] = arg_type + if arg_spec.kwonlydefaults and kwarg in arg_spec.kwonlydefaults: + runtime_kwonlydefaults[kwarg] = arg_spec.kwonlydefaults[kwarg] + + # Convert runtime_defaults to tuple if not empty (as expected by FullArgSpec) + runtime_defaults = tuple(runtime_defaults) if runtime_defaults else None + + return inspect.FullArgSpec( + args=runtime_args, + varargs=arg_spec.varargs, # Keep original varargs + varkw=arg_spec.varkw, # Keep original varkw + defaults=runtime_defaults, + kwonlyargs=runtime_kwonlyargs, + kwonlydefaults=runtime_kwonlydefaults if runtime_kwonlydefaults else None, + annotations=runtime_annotations, + ) + + def __del__(self): + if self.cuda_modules: + cuda_modules = [module.cuda_module for module in self.cuda_modules.modules] + for module in set(cuda_modules): + cuda_helpers.unload_cubin_module(module) + + def generate_execution_args(self, args, kwargs, args_spec: inspect.FullArgSpec): + """ + This function is the prune version of `generate_mlir_function_types` which only generates execution args + to get rid of mlir context. 
+ """ + + # args/kwargs must match arg_specs + # No canonicalization of args/kwargs to avoid extra latency + if len(args) != len(args_spec.args) or len(kwargs) != len(args_spec.kwonlyargs): + raise DSLRuntimeError( + "input args/kwargs length does not match runtime function signature!", + context={ + "input args length": len(args), + "input kwargs length": len(kwargs), + "function signature args length": len(args_spec.args), + "function signature kwonlyargs length": len(args_spec.kwonlyargs), + }, + ) + + exe_args = [] + input_args = [*args, *kwargs.values()] + input_arg_names = [*args_spec.args, *args_spec.kwonlyargs] + for i, arg in enumerate(input_args): + arg_type = args_spec.annotations.get(input_arg_names[i], None) + + # Implicit cast to NumericMeta + if isinstance(arg_type, t.NumericMeta): + arg = t.cast(arg, arg_type) + + # If not any known type, try registered adapter to do the conversion + adapter = JitArgAdapterRegistry.get_registered_adapter(type(arg)) + adapted_arg = adapter(arg) if adapter else arg + exe_args.extend(get_c_pointers(adapted_arg)) + + return exe_args + + def __call__(self, *args, **kwargs): + exe_args = self.generate_execution_args(args, kwargs, self.args_spec) + + self.run_compiled_program(exe_args) + + # Assume each execution args has type `c_void_p` to reduce the overhead of `ctypes.cast`. 
+ def get_invoke_packed_args(self, exe_args): + if self.cuda_modules: + exe_args += self.cuda_modules.args + packed_args = (ctypes.c_void_p * len(exe_args))() + for argNum in range(len(exe_args)): + packed_args[argNum] = exe_args[argNum] + return packed_args + + def run_compiled_program(self, exe_args): + if self.jit_time_profiling: + profiler = timer(enable=True) + try: + packed_args = profiler(self.get_invoke_packed_args)(exe_args) + profiler(self.capi_func)(packed_args) + except Exception as e: + raise DSLRuntimeError(f"💥💥💥 Runtime Crash 💥💥💥", cause=e) + else: + try: + packed_args = self.get_invoke_packed_args(exe_args) + self.capi_func(packed_args) + except Exception as e: + raise DSLRuntimeError(f"💥💥💥 Runtime Crash 💥💥💥", cause=e) + + def update_jit_cuda_modules(self, kernel_symbols): + # preload cuda module from compiled cubin in ir and store to jit_executor.kernels. + if len(kernel_symbols) > 0: + extra_args = [] + module = self.ir_module + cuda_kernel_cache = dict() + cuda_driver_version = cuda_helpers.get_driver_version() + for sym in kernel_symbols: + if sym not in cuda_kernel_cache: + log().debug(f"Loading CUDA module for symbol: {sym}") + + # load cuda module/get function pointer from module and cache + def walk_callback(sym, func_sym, cubin_data): + cubin_module = cuda_helpers.load_cubin_module_data(cubin_data) + kernel_ptr = cuda_helpers.get_kernel_function( + cubin_module, func_sym + ) + # Enable non-portable cluster size for CUDA version 11.8 or higher. + if cuda_driver_version >= 11080: + cuda_helpers.set_kernel_attribute( + kernel_ptr, + cuda_helpers.cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, + 1, + ) + cuda_kernel_cache[sym] = CudaSingleModule( + cubin_module, kernel_ptr + ) + + self.walk_module_and_get_cubin_data(module, sym, walk_callback) + else: + log().debug(f"Symbol {sym} already in cache") + # check if kernel is empty. 
+ if sym in cuda_kernel_cache: + extra_args.append( + ctypes.c_void_p(cuda_kernel_cache[sym].kernel_ptr.getPtr()) + ) + # store to the jit result if jit result is cached. + self.cuda_modules = CudaModules(cuda_kernel_cache.values(), extra_args) + + return self + + def _get_escaped_cubin_bytes(self, cubin_data): + """This function escapes cubin data from mlir raw bytecode to executable binary bytes""" + + def ishex(inp): + return ( + inp in range(0x30, 0x3A) + or inp in range(0x61, 0x67) + or inp in range(0x41, 0x47) + ) + + converted = bytearray() + idx = 0 + while idx < len(cubin_data): + # escape the original bytes + if cubin_data[idx] == 0x5C: + # if data of idx is b'\\' + if ishex(cubin_data[idx + 1]) and ishex(cubin_data[idx + 2]): + converted += bytearray.fromhex( + cubin_data[idx + 1 : idx + 3].decode() + ) + idx += 3 + elif cubin_data[idx + 1] == 0x5C: + converted.append(cubin_data[idx]) + idx += 2 + else: + # no escape, directly write + converted.append(cubin_data[idx]) + idx += 1 + return bytes(converted) + + def walk_module_and_get_cubin_data(self, module, sym, callback): + """This function is used to walk gpu binary op, extract the cubin inside, and process cubin data with callback.""" + + def walk_gpu_binary_op(op): + if op.name != "gpu.binary": + return ir.WalkResult.ADVANCE + s = io.BytesIO() + op.write_bytecode(s) + cubin_data = s.getvalue() + if sym.encode() not in cubin_data: + return ir.WalkResult.ADVANCE + + if ( + "kernels" != op.opview.sym_name.value + and sym != op.opview.sym_name.value + ): + return ir.WalkResult.ADVANCE + # function symbol of kernel(gpu.launch_func) is equal to sym name in mlir + func_sym = sym + if sym == op.opview.sym_name.value and not sym.endswith("_kernel"): + func_sym = sym.rsplit("_", 1)[0] + + cubin_data = cubin_data.split(b'bin = "')[1].split(b'">')[0] + cubin_data = self._get_escaped_cubin_bytes(cubin_data) + callback(sym, func_sym, cubin_data) + return ir.WalkResult.ADVANCE + + 
module.operation.walk(walk_gpu_binary_op) diff --git a/python/CuTeDSL/base_dsl/runtime/__init__.py b/python/CuTeDSL/base_dsl/runtime/__init__.py new file mode 100644 index 00000000..6f8e2feb --- /dev/null +++ b/python/CuTeDSL/base_dsl/runtime/__init__.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides a runtime utility functions that are needed for +the DSL. +""" + +from . import device_tensor +from . import dlpack_types +from . import cuda +from . import tensor_descriptor +from . import jit_arg_adapters + +__all__ = [ + "device_tensor", + "dlpack_types", + "cuda", + "tensor_descriptor", + "jit_arg_adapters", +] diff --git a/python/CuTeDSL/base_dsl/runtime/cuda.py b/python/CuTeDSL/base_dsl/runtime/cuda.py new file mode 100644 index 00000000..c4f88b58 --- /dev/null +++ b/python/CuTeDSL/base_dsl/runtime/cuda.py @@ -0,0 +1,470 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +""" +This module provides CUDA Python helper functions +""" + + +from functools import lru_cache +from dataclasses import dataclass +from typing import List, Optional +import numpy as np +import os +import ctypes + +import cuda.bindings.driver as cuda +import cuda.bindings.nvrtc as nvrtc + +# MLIR imports +from ..._mlir import ir +from ..._mlir.dialects import gpu + +# Local module imports +from ..utils.logger import log as _log +from ..common import * +from .jit_arg_adapters import JitArgAdapterRegistry + + +# ============================================================================= +# Utils +# ============================================================================= + + +def _cudaGetErrorEnum(error): + if isinstance(error, cuda.CUresult): + err, name = cuda.cuGetErrorName(error) + return name if err == cuda.CUresult.CUDA_SUCCESS else "" + elif isinstance(error, nvrtc.nvrtcResult): + return nvrtc.nvrtcGetErrorString(error)[1] + else: + raise DSLRuntimeError("Unknown error type: {}".format(error)) + + +def _get_gpu_arch_info(major, minor): + """Get GPU architecture information and compatibility details.""" + gpu_arch_map = { + (7, 0): ("Volta", "sm_70", ["sm_70"]), # V100 + (7, 5): ("Turing", "sm_75", ["sm_75"]), # RTX 20 Series, Quadro RTX + (8, 0): ("Ampere", "sm_80", ["sm_80"]), # A100 + (8, 6): ("Ampere", "sm_86", ["sm_86", "sm_80"]), # RTX 30 Series + (8, 9): ("Ada", "sm_89", ["sm_89", "sm_86"]), # RTX 40 Series + (8, 7): ("Ampere", "sm_87", ["sm_87", "sm_86", "sm_80"]), # A10, A40 + (9, 0): ("Hopper", "sm_90a", ["sm_90a"]), # H100 + (10, 0): ("Blackwell", "sm_100a", ["sm_100a"]), # B200 + } + return gpu_arch_map.get( + (major, minor), ("Unknown", f"sm_{major}{minor}", [f"sm_{major}{minor}"]) + ) + + +def get_compute_capability_major_minor(device_id: int = 0): + """ + Returns the compute capability of the CUDA device as a tuple of (major, minor). + For example: (8, 0) for Ampere, (9, 0) for Hopper, (10, 0) for Blackwell. + Returns None on failure. 
+ """ + try: + checkCudaErrors(cuda.cuInit(0)) + device = checkCudaErrors(cuda.cuDeviceGet(device_id)) + major = checkCudaErrors( + cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device, + ) + ) + minor = checkCudaErrors( + cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + device, + ) + ) + return major, minor + except RuntimeError as e: + _log().info(f"Failed to get CUDA compute capability: {e}") + return None, None + + +@dataclass +class DeviceInfo: + """Data class to store CUDA device information.""" + + device_count: int = 0 + current_device: int = 0 + device_name: Optional[str] = None + major_version: Optional[int] = None + minor_version: Optional[int] = None + arch_name: Optional[str] = None + sm_arch: Optional[str] = None + compatible_archs: Optional[List[str]] = None + memory_gb: Optional[float] = None + target_arch: Optional[str] = None + error_message: Optional[str] = None + initialization_failed: bool = False + + def pretty_str(self) -> str: + """ + Convert DeviceInfo to a formatted string for display. 
+ """ + info = "" + + if self.initialization_failed: + return f"{Colors.BOLD}- CUDA initialization failed{Colors.RESET}" + + if self.error_message: + return f"{Colors.BOLD}- Failed to get GPU info: {self.error_message}{Colors.RESET}" + + if self.device_count > 0: + info += f"{Colors.BOLD}- CUDA devices available: {self.device_count} (current: {self.current_device})\n" + + if self.major_version is not None and self.minor_version is not None: + info += f"- Architecture: {Colors.BLUE}{self.arch_name}{Colors.RESET} ({Colors.GREEN}{self.sm_arch}{Colors.RESET})\n" + info += f"- Compatible SM archs: {Colors.GREEN}{', '.join(self.compatible_archs or [])}{Colors.RESET}\n" + + if self.memory_gb is not None: + info += f"- Total Memory: {Colors.BLUE}{self.memory_gb:.2f} GB{Colors.RESET}\n" + + else: + info += f"- Compute capability: unknown\n" + info += f"- SM arch: unknown{Colors.RESET}\n" + else: + info += f"- No devices available\n" + + return info + + +def get_device_info() -> DeviceInfo: + """ + Get detailed information about CUDA devices. + Returns a DeviceInfo dataclass with device information. 
+ """ + device_info = DeviceInfo() + + # Initialize CUDA if not already initialized + try: + result = cuda.cuInit(0) + if result[0].value: # Check for error + device_info.initialization_failed = True + return device_info + except: + pass + + try: + # Get device count + result = cuda.cuDeviceGetCount() + device_info.device_count = result[1] if result[0].value == 0 else 0 + + if device_info.device_count > 0: + # Get current device + try: + result = cuda.cuCtxGetDevice() + if result[0].value == 0: + device_info.current_device = result[1] + except: + pass + + # Get device name + try: + name_result = cuda.cuDeviceGetName(100, device_info.current_device) + if name_result[0].value == 0: + device_info.device_name = name_result[1] + except: + pass + + # Get compute capability and architecture info + try: + major, minor = get_compute_capability_major_minor( + device_info.current_device + ) + + # Check if we successfully got the compute capability + if major is not None and minor is not None: + device_info.major_version = major + device_info.minor_version = minor + + arch_name, sm_arch, compatible_archs = _get_gpu_arch_info( + device_info.major_version, device_info.minor_version + ) + + device_info.arch_name = arch_name + device_info.sm_arch = sm_arch + device_info.compatible_archs = compatible_archs + + # Get memory info + try: + total_mem = cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_TOTAL_MEMORY, + device_info.current_device, + ) + if total_mem[0].value == 0: + device_info.memory_gb = total_mem[1] / ( + 1024 * 1024 * 1024 + ) # Convert to GB + except: + pass + + except Exception as e: + pass # Compute capability info will remain None + + except Exception as e: + device_info.error_message = str(e) + + return device_info + + +def checkCudaErrors(result): + """Check CUDA errors and provide detailed error messages.""" + if result[0].value: + error_code = result[0].value + error_name = _cudaGetErrorEnum(result[0]) + + raise 
DSLCudaRuntimeError(error_code, error_name) + + if len(result) == 1: + return None + elif len(result) == 2: + return result[1] + else: + return result[1:] + + +# ============================================================================= +# Driver Helpers +# ============================================================================= + + +@lru_cache(maxsize=1) +def initialize_cuda_context(device_id: int = 0, flags: int = 0): + """ + Initializes the CUDA context for a specified device. + """ + # Initialize CUDA Driver API + _log().info(f"cuInit {flags}") + checkCudaErrors(cuda.cuInit(flags)) + # Retrieve handle for device + _log().info(f"cuDeviceGet {device_id}") + cuDevice = checkCudaErrors(cuda.cuDeviceGet(device_id)) + _log().info(f"{cuDevice} <-- cuDeviceGet") + # Create context + _log().info(f"cuCtxCreate {0} {cuDevice}") + context = checkCudaErrors(cuda.cuCtxCreate(0, cuDevice)) + _log().info(f"{context} <-- cuCtxCreate") + + return context + + +def load_cubin_module(cubin_file): + """ + Loads a CUBIN file and returns the module. + """ + # Load CUBIN file as binary data + _log().info(f"read cubin {cubin_file}") + with open(cubin_file, "rb") as f: + cubin_data = f.read() + # Load module data + _log().info(f"cuModuleLoadData {np.char.array(cubin_data).ctypes.data}") + module = checkCudaErrors( + cuda.cuModuleLoadData(np.char.array(cubin_data).ctypes.data) + ) + return module + + +def unload_cubin_module(module): + """ + Unloads a CUBIN module. + """ + _log().info(f"cuModuleUnload {module}") + checkCudaErrors(cuda.cuModuleUnload(module)) + + +def load_cubin_module_data(cubin_data): + """ + Loads a CUBIN from data and returns the module. + """ + # Load module data + _log().info(f"cuModuleLoadData {np.char.array(cubin_data).ctypes.data}") + module = checkCudaErrors( + cuda.cuModuleLoadData(np.char.array(cubin_data).ctypes.data) + ) + return module + + +def get_kernel_function(module, kernel_name): + """ + Retrieves the kernel function from the module. 
+ """ + _log().info(f"cuModuleGetFunction {module} {kernel_name}") + kernel = checkCudaErrors( + cuda.cuModuleGetFunction(module, bytes(kernel_name, "utf-8")) + ) + _log().info(f"{kernel} <-- cuModuleGetFunction") + return kernel + + +def launch_kernel(kernel, grid_dims, block_dims, stream, smem_size=0, kernel_args=None): + """ + Launches the CUDA kernel. + """ + _log().info( + f"cuLaunchKernel {kernel} grid={grid_dims} blocks={block_dims} smem_size={smem_size} stream={stream} {kernel_args}" + ) + checkCudaErrors( + cuda.cuLaunchKernel( + kernel, + grid_dims[0], + grid_dims[1], + grid_dims[2], + block_dims[0], + block_dims[1], + block_dims[2], + smem_size, # Shared memory size + stream, + kernel_args, + 0, # Extra parameters + ) + ) + + +def stream_sync(stream): + """ + Synchronizes the CUDA stream. + """ + _log().info(f"cuStreamSynchronize {stream}") + checkCudaErrors(cuda.cuStreamSynchronize(stream)) + + +def stream_create(id=0): + """ + Creates the CUDA stream. + """ + _log().info(f"cuStreamCreate {id}") + stream = checkCudaErrors(cuda.cuStreamCreate(id)) + _log().info(f"{stream} <-- cuStreamCreate") + return stream + + +def stream_destroy(stream): + """ + Destroys the CUDA stream. + """ + _log().info(f"cuStreamDestroy {stream}") + checkCudaErrors(cuda.cuStreamDestroy(stream)) + + +def context_destroy(context): + """ + Destroys the CUDA context. + """ + _log().info(f"cuCtxDestroy {context}") + checkCudaErrors(cuda.cuCtxDestroy(context)) + + +def allocate(size_in_bytes: int, stream=None): + """ + Allocate device memory based on numpy host array size. 
+ """ + _log().info("Allocate size_in_bytes=[%s] stream=[%s]", size_in_bytes, stream) + if stream is None: + device_memory = checkCudaErrors(cuda.cuMemAlloc(size_in_bytes)) + else: + device_memory = checkCudaErrors(cuda.cuMemAllocAsync(size_in_bytes, stream)) + _log().info("Allocated [%s]", device_memory) + return device_memory + + +def deallocate(device_pointer, stream=None): + """ + Deallocate the specified device memory pointer. + """ + _log().info( + "Deallocate device_pointer=[%s] stream=[%s]", hex(int(device_pointer)), stream + ) + if stream is None: + checkCudaErrors(cuda.cuMemFree(device_pointer)) + else: + checkCudaErrors(cuda.cuMemFreeAsync(device_pointer, stream)) + + +def memcpy_h2d(host_pointer, device_pointer, size_in_bytes, stream=None): + """ + Copy data from host to device memory. + """ + _log().info( + "Copy host-to-device host_pointer[%s] device_ptr=[%s] size_in_bytes=[%s] stream=[%s]", + hex(host_pointer), + hex(int(device_pointer)), + size_in_bytes, + stream, + ) + if stream is None: + checkCudaErrors(cuda.cuMemcpyHtoD(device_pointer, host_pointer, size_in_bytes)) + else: + checkCudaErrors( + cuda.cuMemcpyHtoDAsync(device_pointer, host_pointer, size_in_bytes, stream) + ) + + +def memcpy_d2h(host_pointer, device_pointer, size_in_bytes, stream=None): + """ + Copy data from device to host memory. + """ + _log().info( + "Copy device-host-to device_pointer=[%s] host_pointer[%s] size_in_bytes=[%s] stream=[%s]", + hex(int(device_pointer)), + hex(host_pointer), + size_in_bytes, + stream, + ) + if stream is None: + checkCudaErrors(cuda.cuMemcpyDtoH(host_pointer, device_pointer, size_in_bytes)) + else: + checkCudaErrors( + cuda.cuMemcpyDtoHAsync(host_pointer, device_pointer, size_in_bytes, stream) + ) + + +def default_stream(): + return cuda.CUstream(0) + + +def get_driver_version(): + """ + Returns the CUDA driver version. 
+ """ + return checkCudaErrors(cuda.cuDriverGetVersion()) + + +def set_kernel_attribute(kernel, attribute, value): + """ + Sets a CUDA kernel attribute. + """ + return checkCudaErrors(cuda.cuFuncSetAttribute(kernel, attribute, value)) + + +@JitArgAdapterRegistry.register_jit_arg_adapter(cuda.CUstream) +class StreamAdapter: + """ + Convert a CUDA stream to a stream representation for JIT arg generation. + """ + + def __init__(self, arg): + self._arg = arg + self._c_pointer = ctypes.cast(self._arg.getPtr(), ctypes.c_void_p) + + def __new_from_mlir_values__(self, values): + assert len(values) == 1 + return values[0] + + def __c_pointers__(self): + return [self._c_pointer] + + def __get_mlir_types__(self): + return [gpu.AsyncTokenType.get()] diff --git a/python/CuTeDSL/base_dsl/runtime/device_tensor.py b/python/CuTeDSL/base_dsl/runtime/device_tensor.py new file mode 100644 index 00000000..5addb275 --- /dev/null +++ b/python/CuTeDSL/base_dsl/runtime/device_tensor.py @@ -0,0 +1,121 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +import copy + +from . import cuda as cuda_helpers +from .tensor_descriptor import * +from ..common import * + + +def allocate(tensor: TensorDescriptor, stream=None): + """ + Allocates GPU memory + """ + if tensor._check_is_managed_by_framework(): + raise DSLRuntimeError( + "GPU tensors are managed by the framework and cannot be modified." 
+ ) + if not tensor.device_pointer is None: + raise DSLRuntimeError("Tensor is already allocated on the device.") + + tensor.device_pointer = cuda_helpers.allocate(tensor.size_in_bytes, stream) + + log().info("Allocate done tensor=[%s] dev_ptr=[%s]", tensor, tensor.device_pointer) + + +def deallocate(tensor: TensorDescriptor, stream=None): + """ + Deallocates GPU memory + """ + if tensor._check_is_managed_by_framework(): + raise DSLRuntimeError( + "GPU tensors are managed by the framework and cannot be modified." + ) + if tensor.device_pointer is None: + raise DSLRuntimeError("Tensor is not allocated on the device.") + + log().info( + "Deallocating done tensor=[%s] dev_ptr=[%s]", tensor, tensor.device_pointer + ) + + cuda_helpers.deallocate(tensor.device_pointer, stream) + tensor.device_pointer = None + + +def copy_to_gpu(tensor: TensorDescriptor, do_allocate=True, stream=None): + """ + Copies data from host memory to the GPU memory. + If do_allocate is True, it first calls allocate + """ + log().info("copyin tensor=[%s] dev_ptr=[%s]", tensor, tensor.device_pointer) + if do_allocate: + allocate(tensor, stream) + cuda_helpers.memcpy_h2d( + tensor.data_ptr, tensor.device_pointer, tensor.size_in_bytes, stream + ) + log().info("copyin done tensor=[%s] dev_ptr=[%s]", tensor, tensor.device_pointer) + return tensor + + +def copy_from_gpu(tensor: TensorDescriptor, do_deallocate=True, stream=None): + """ + Copies data from GPU memory back to the host. + If do_deallocate is True, it calls deallocate + """ + log().info("copyout tensor=[%s] dev_ptr=[%s]", tensor, tensor.device_pointer) + if tensor._check_is_managed_by_framework(): + raise DSLRuntimeError( + "GPU tensors are managed by the framework and cannot be modified." 
+ ) + if tensor.device_pointer is None: + raise DSLRuntimeError("Tensor is not allocated on the device.") + + cuda_helpers.memcpy_d2h( + tensor.data_ptr, tensor.device_pointer, tensor.size_in_bytes, stream + ) + if do_deallocate: + deallocate(tensor, stream) + log().info("copyout done tensor=[%s] dev_ptr=[%s]", tensor, tensor.device_pointer) + + +def to_gpu(tensor, stream=None) -> TensorDescriptor: + """ + Copies the tensor to the GPU memory from Host memory + """ + if isinstance(tensor, TensorDescriptor): + new_tensor = copy.copy(tensor) + copy_to_gpu(new_tensor, stream=stream) + return new_tensor + + if TensorDescriptor.can_transformed_to_dlpack(tensor): + new_tensor = TensorDescriptor(tensor) + copy_to_gpu(new_tensor, stream=stream) + return new_tensor + + raise DSLRuntimeError("Unsupported type") + + +def from_gpu(tensor, stream=None) -> TensorDescriptor: + """ + Copies the tensor to the GPU memory from Host memory + """ + if isinstance(tensor, TensorDescriptor): + new_tensor = copy.copy(tensor) + copy_from_gpu(new_tensor, stream=stream) + return new_tensor + + if TensorDescriptor.can_transformed_to_dlpack(tensor): + new_tensor = TensorDescriptor(tensor) + copy_from_gpu(new_tensor, stream=stream) + return new_tensor + + raise DSLRuntimeError("Unsupported type") diff --git a/python/CuTeDSL/base_dsl/runtime/dlpack_types.py b/python/CuTeDSL/base_dsl/runtime/dlpack_types.py new file mode 100644 index 00000000..168c2a99 --- /dev/null +++ b/python/CuTeDSL/base_dsl/runtime/dlpack_types.py @@ -0,0 +1,76 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides helper structs for dlpack. +DLPack is an open standard for in-memory tensor structures, enabling +seamless sharing of tensors across different frameworks. +Learn more at: https://github.com/dmlc/dlpack +""" + +import ctypes +import enum + + +class DLDeviceType(enum.IntEnum): + """Enums for device types based on the DLPack specification.""" + + kDLCPU = 1 + kDLGPU = 2 + kDLCPUPinned = 3 + + +class DLDataTypeCode: + """Enums for data type codes based on the DLPack specification. + + see https://github.com/dmlc/dlpack/blob/main/include/dlpack/dlpack.h + """ + + kDLInt = 0 + kDLUInt = 1 + kDLFloat = 2 + kDLOpaqueHandle = 3 + kDLBfloat = 4 + kDLComplex = 5 + kDLBool = 6 + + +class DLDevice(ctypes.Structure): + """Structure representing the device information in DLPack.""" + + _fields_ = [ + ("device_type", ctypes.c_int), # kDLCPU, kDLGPU, etc. 
+ ("device_id", ctypes.c_int), # Device ID (e.g., GPU ID) + ] + + +class DLDataType(ctypes.Structure): + """Structure representing the data type in DLPack.""" + + _fields_ = [ + ("code", ctypes.c_uint8), # Data type code (e.g., kDLFloat) + ("bits", ctypes.c_uint8), # Number of bits per value + ("lanes", ctypes.c_uint16), # Number of lanes + ] + + +class DLTensor(ctypes.Structure): + """Structure representing the DLTensor in DLPack.""" + + _fields_ = [ + ("data", ctypes.c_void_p), # Pointer to tensor data + ("device", DLDevice), # Device info + ("ndim", ctypes.c_int), # Number of dimensions + ("dtype", DLDataType), # Data type + ("shape", ctypes.POINTER(ctypes.c_int64)), # Shape of tensor + ("strides", ctypes.POINTER(ctypes.c_int64)), # Strides of tensor + ("byte_offset", ctypes.c_uint64), # Byte offset to tensor data + ] diff --git a/python/CuTeDSL/base_dsl/runtime/jit_arg_adapters.py b/python/CuTeDSL/base_dsl/runtime/jit_arg_adapters.py new file mode 100644 index 00000000..eb998d16 --- /dev/null +++ b/python/CuTeDSL/base_dsl/runtime/jit_arg_adapters.py @@ -0,0 +1,188 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides runtime utilities for JIT argument conversion in DSL. 
+""" + +from functools import wraps +from typing import get_origin + +# Local modules imports +from ..common import DSLRuntimeError +from ..typing import ( + Constexpr, + Int32, + Float32, + Boolean, +) + + +def is_arg_spec_constexpr(arg_spec, arg_name, arg_index, owning_func): + """ + Check if the argument spec is a constexpr. + """ + + def _is_reserved_python_func_arg(arg_index, arg_name, func): + """ + Check if the argument is a reserved python function argument. + """ + + if arg_index != 0: + return False + + if arg_name == "self": + return True + + is_classmethod = isinstance(func, classmethod) or ( + hasattr(func, "__func__") and isinstance(func.__func__, classmethod) + ) + return arg_name == "cls" and is_classmethod + + return ( + _is_reserved_python_func_arg(arg_index, arg_name, owning_func) + or (isinstance(arg_spec, type) and issubclass(arg_spec, Constexpr)) + or (get_origin(arg_spec) is Constexpr) + ) + + +def is_argument_constexpr(arg, arg_spec, arg_name, arg_index, owning_func): + """ + Check if the argument is a constexpr. + """ + + def _is_type_argument(arg, arg_annotation): + """ + Check if the argument is a type argument like Type[X] + """ + + return isinstance(arg, type) and ( + arg_annotation is None or get_origin(arg_annotation) is type + ) + + return ( + is_arg_spec_constexpr(arg_spec, arg_name, arg_index, owning_func) + or _is_type_argument(arg, arg_spec) + or arg is None + ) + + +class JitArgAdapterRegistry: + """ + A registry to keep track of the JIT argument adapters. + + An adapter is a callable that converts a Python type to a type with following protocols supported: + - JitArgument + - DynamicExpression + The converted type can then be further processed by DSL to generate arguments for JIT functions. 
+ """ + + # A dictionary with key=type and value=callable + jit_arg_adapter_registry = {} + + @classmethod + def register_jit_arg_adapter(cls, *dargs, **dkwargs): + """ + Register a JIT argument adapter callable + + This can be used as a decorator on any callable like: + + @register_jit_arg_adapter(my_py_type) + def my_adapter_for_my_py_type(arg): + ... + + @register_jit_arg_adapter(my_py_type) + class MyAdapterForMyPythonType: + ... + + The adapters are registered per type. If a type is already registerd, an error will be raised. + """ + + def decorator(*dargs, **dkwargs): + darg_python_ty = dargs[0] + + @wraps(darg_python_ty) + def wrapper(*args, **kwargs): + if len(args) != 1 or not callable(args[0]): + raise DSLRuntimeError( + "a callable must be provided for registering JIT argument adapter" + ) + adapter = args[0] + + if darg_python_ty in cls.jit_arg_adapter_registry: + raise DSLRuntimeError( + f"JIT argument adapter for {darg_python_ty} is already registered!", + context={ + "Registered adapter": cls.jit_arg_adapter_registry[ + darg_python_ty + ], + "Adapter to be registered": adapter, + }, + ) + cls.jit_arg_adapter_registry[darg_python_ty] = adapter + return adapter + + return wrapper + + if len(dargs) > 0: + return decorator(*dargs, **dkwargs) + else: + raise DSLRuntimeError( + "a Python type must be provided for registering JIT argument adapter" + ) + + @classmethod + def get_registered_adapter(cls, ty): + """ + Get the registered JIT argument adapter for the given type. 
+ """ + return cls.jit_arg_adapter_registry.get(ty, None) + + +# ============================================================================= +# JIT Argument Adapters +# ============================================================================= + + +@JitArgAdapterRegistry.register_jit_arg_adapter(int) +@JitArgAdapterRegistry.register_jit_arg_adapter(float) +@JitArgAdapterRegistry.register_jit_arg_adapter(bool) +def _convert_python_scalar(arg): + """ + Convert a Python scalar to a DSL type. + """ + conversion_map = { + int: Int32, + float: Float32, + bool: Boolean, + } + return conversion_map.get(type(arg))(arg) + + +@JitArgAdapterRegistry.register_jit_arg_adapter(tuple) +@JitArgAdapterRegistry.register_jit_arg_adapter(list) +def _convert_python_sequence(arg): + """ + Go through each element in the sequence and convert it to a type that can be + further processed by DSL to generate the corresponding JIT argument(s). + """ + adapted_arg = [] + for elem in arg: + adapter = JitArgAdapterRegistry.get_registered_adapter(type(elem)) + if adapter is not None: + converted_elem = adapter(elem) + adapted_arg.append(converted_elem) + else: + # If no registered adapter is found, just return the original element + adapted_arg.append(elem) + + assert len(adapted_arg) == len(arg) + return type(arg)(adapted_arg) diff --git a/python/CuTeDSL/base_dsl/runtime/tensor_descriptor.py b/python/CuTeDSL/base_dsl/runtime/tensor_descriptor.py new file mode 100644 index 00000000..b09d2fcb --- /dev/null +++ b/python/CuTeDSL/base_dsl/runtime/tensor_descriptor.py @@ -0,0 +1,201 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +# Helpers +import itertools, operator +import ctypes +from . import dlpack_types as _dpack +from .dlpack_runtime import ( + dlpack_to_tensor_desc, + get_tensor_desc_data_ptr, + get_tensor_desc_is_in_device, + get_tensor_desc_element_type, + get_tensor_desc_shape, + get_tensor_desc_stride, + get_tensor_desc_element_size_in_bytes, + get_tensor_desc_ndim, + get_tensor_desc_dtype_code, + get_tensor_desc_dtype_bits, + get_tensor_desc_device_type, + get_tensor_desc_device_id, +) + +from ..utils.logger import log +from ..common import * +from ..typing import ( + Boolean, + Float8E5M2, + Int64, + Int32, + Int16, + Int8, + Uint64, + Uint32, + Uint16, + Uint8, + Float64, + Float32, + Float16, + BFloat16, +) + + +class TensorDescriptor: + def __init__(self, tensor): + """Initialize with a tensor that supports the DLPack protocol. 
+ + Args: + tensor: Any tensor object that implements __dlpack__ and __dlpack_device__ + """ + + self.tensor = tensor + self._capsule = dlpack_to_tensor_desc(tensor) + + self.data_ptr = get_tensor_desc_data_ptr(self._capsule) + self.device_type = get_tensor_desc_device_type(self._capsule) + self.device_type = _dpack.DLDeviceType(self.device_type) + + if self.device_type == _dpack.DLDeviceType.kDLGPU: + self.device_pointer = self.data_ptr + elif self.device_type == _dpack.DLDeviceType.kDLCPU: + self.device_pointer = None + else: + raise DSLRuntimeError( + f"DLPack device type is not supported {self.dl_tensor.device.device_type}" + ) + + log().info("TensorDescriptor is created = [%s]", self) + + @staticmethod + def can_transformed_to_dlpack(dl_tensor): + if not hasattr(dl_tensor, "__dlpack__") or not hasattr( + dl_tensor, "__dlpack_device__" + ): + return False + return True + + @property + def is_in_device(self): + """Check if the tensor is stored on a device.""" + return not self.device_pointer is None + + @property + def device_id(self): + """Return device id where tensor resides.""" + if self.is_in_device: + return get_tensor_desc_device_id(self._capsule) + return -1 + + @property + def element_type(self): + """Return the corresponding Python type based on DLPack dtype metadata.""" + str_element_type = get_tensor_desc_element_type(self._capsule) + dtype_map = { + # bool is 8bit from numpy and torch + "Bool": Boolean, + "Int64": Int64, + "Int32": Int32, + "Int16": Int16, + "Int8": Int8, + "UInt64": Uint64, + "UInt32": Uint32, + "UInt16": Uint16, + "UInt8": Uint8, + "Float64": Float64, + "Float32": Float32, + "Float16": Float16, + "BFloat16": BFloat16, + "Float8E5M2": Float8E5M2, + } + + if str_element_type not in dtype_map: + raise KeyError( + f"Unsupported element type in dlpack: '{str_element_type}'. 
Supported types are: {list(dtype_map.keys())}" + ) + + return dtype_map[str_element_type] + + @property + def shape(self): + """Return the shape of the tensor.""" + return get_tensor_desc_shape(self._capsule) + + @property + def rank(self): + """Return the rank of the tensor.""" + return get_tensor_desc_ndim(self._capsule) + + @property + def strides(self): + """Return the rank of the tensor.""" + return get_tensor_desc_stride(self._capsule) + + @property + def element_size_in_bytes(self): + """Calculate the element size in bytes of the DLPack tensor.""" + return get_tensor_desc_element_size_in_bytes(self._capsule) + + @property + def size_in_bytes(self): + """Calculate the total size in bytes of the DLPack tensor.""" + # Calculate the number of elements using the shape + ndim = get_tensor_desc_ndim(self._capsule) + shape = get_tensor_desc_shape(self._capsule) + num_elements = 1 + for i in range(ndim): + num_elements *= shape[i] + + # Total bytes + total_bytes = self.element_size_in_bytes * num_elements + return total_bytes + + def __str__(self): + """Return a compact string representation of the device_tensor with a tensor prefix.""" + # Extract shape + shape = "x".join(map(str, self.shape)) + + # Extract dtype + dtype_code = get_tensor_desc_dtype_code(self._capsule) + dtype_bits = get_tensor_desc_dtype_bits(self._capsule) + dtype = ( + f"i{dtype_bits}" + if dtype_code == _dpack.DLDataTypeCode.kDLInt + else f"f{dtype_bits}" + ) + + # Extract device + device_type = "cpu" if not self.is_in_device else "gpu" + + return f"tensor<{shape}x{dtype}>_{device_type}" + + def _check_is_managed_by_framework(self): + """ + Ensure the tensor is not managed by the framework (e.g., GPU tensor). + Raises an exception if the tensor is framework-managed. 
+ """ + return self.device_type == _dpack.DLDeviceType.kDLGPU + + +def from_tensor(tensor) -> TensorDescriptor: + """Create a TensorDescriptor from a tensor object.""" + return TensorDescriptor(tensor) + + +def to_tensor(tensor_descriptor: TensorDescriptor): + """Return tensor object from tensor descriptor.""" + return tensor_descriptor.tensor + + +def is_tensor_descriptor(maybe_tensor_descriptor) -> bool: + """Check if the object is a TensorDescriptor.""" + return isinstance( + maybe_tensor_descriptor, TensorDescriptor + ) or TensorDescriptor.can_transformed_to_dlpack(maybe_tensor_descriptor) diff --git a/python/CuTeDSL/base_dsl/typing.py b/python/CuTeDSL/base_dsl/typing.py new file mode 100644 index 00000000..7fc2b4d7 --- /dev/null +++ b/python/CuTeDSL/base_dsl/typing.py @@ -0,0 +1,1897 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +import ctypes +import numpy as np +import operator +from typing_extensions import deprecated +from functools import reduce +from typing import ( + Generic, + Protocol, + Union, + Any, + List, + Type, + TypeVar, + overload, + runtime_checkable, + get_origin, +) +from types import FunctionType +from dataclasses import dataclass +from abc import ABC, abstractmethod + +from .common import * +from .ast_helpers import const_expr +from ._mlir_helpers import arith as arith_helper, lru_cache_ir +from ._mlir_helpers.arith import ArithValue + +from .._mlir import ir +from .._mlir.extras import types as T +from .._mlir.dialects import arith, math + +# ============================================================================= +# Dynamic Expression Protocol +# ============================================================================= + + +@runtime_checkable +class DynamicExpression(Protocol): + """ + This is a protocol class that provides a common interface + to generate user-defined dynamic expressions. + + The DSL checks this protocol to determine if a class is a dynamic expression (SSA value) or not. + """ + + def __extract_mlir_values__(self): + """ + Generate a dynamic expression for the current object. + + :return: List of MLIR values + :rtype: List[ir.Value] + """ + raise NotImplementedError + + def __new_from_mlir_values__(self, values): + """ + Create a new object from MLIR values. + + :param values: List of MLIR values + :type values: List[ir.Value] + :return: A new instance of the class that implements this protocol + :rtype: Any + """ + raise NotImplementedError + + +@runtime_checkable +class JitArgument(Protocol): + """ + This is a protocol class that provides a common interface + for JIT function arguments generation for Python to call JIT functions. + + The DSL checks this protocol to determine if a class is capable of providing information + needed for generating JIT function arguments. 
+ + See breakdowns below for JitArgument protocol based JIT function calls. + + .. code-block:: python + + @jit + def foo(x: CustomData): + return x.int_value + 1 + + # Emit: `%c0 = arith.constant(1, i32)` + c1 = const(1, Int32) + # `c1` tracks `%c0` defined outside of function body of `foo` + # `%c0` can't be used directly in function body of `foo` + x = CustomData(c1, ...) + + When called like ``y = foo(x)``, the following steps occur: + + 1. JIT compiler generates MLIR function definition using ``__get_mlir_types__``: + + .. code-block:: mlir + + func @foo(%arg0: i32, ...) -> i32 { + ... + } + + 2. Function is traced in Python, wrapping MLIR values with ``__new_from_mlir_values__``: + + .. code-block:: python + + # Implementation of IR tracing + new_x = CustomData(ir.Value(%arg0), ...) + y = foo(new_x) + # `x.int_value` is %arg0 rather than `c1` defined outside + + 3. For Python runtime execution, JIT engine invokes compiled function using ``__c_pointers__``: + + .. code-block:: python + + jit_engine.invoke(foo, concat([x.__c_pointers__(), ...])) + """ + + def __c_pointers__(self): + """ + Generate a list of ctypes pointers for the current object. + + :return: List of ctypes pointers + :rtype: List[ctypes.c_void_p] + """ + raise NotImplementedError + + def __get_mlir_types__(self): + """ + Generate a list of MLIR types for the current object. + + :return: List of MLIR types + :rtype: List[ir.Type] + """ + raise NotImplementedError + + def __new_from_mlir_values__(self, values): + """ + Create a new object from MLIR values. 
+ + :param values: List of MLIR values + :type values: List[ir.Value] + :return: A new object that represents the given MLIR values + :rtype: Any + """ + raise NotImplementedError + + +def get_c_pointers(obj): + """ + Given the `obj`, recursively go through it to extract all contained C pointers + """ + if hasattr(obj, "__c_pointers__"): + return obj.__c_pointers__() + elif isinstance(obj, (tuple, list)): + return sum((get_c_pointers(x) for x in obj), []) + elif isinstance(obj, set): + raise DSLRuntimeError( + "Sets are not supported in get_c_pointers to ensure order preservation", + context="The DSL attempted to generate JIT function argument(s) for an argument of type set but failed.", + suggestion="Consider using a list or tuple instead", + ) + return [] + + +def get_mlir_types(obj): + """ + Given the `obj`, recursively go through it to extract all contained MLIR types + """ + if hasattr(obj, "__get_mlir_types__"): + return obj.__get_mlir_types__() + elif hasattr(obj, "__extract_mlir_values__"): + return [v.type for v in obj.__extract_mlir_values__()] + elif isinstance(obj, ir.Value): + return [obj.type] + elif isinstance(obj, (tuple, list)): + return sum((get_mlir_types(x) for x in obj), []) + elif isinstance(obj, set): + raise DSLRuntimeError( + "Sets are not supported in get_mlir_types to ensure order preservation", + context="The DSL attempted to generate JIT function argument(s) for an argument of type set but failed.", + suggestion="Consider using a list or tuple instead", + ) + return [] + + +class DslType(type): + """Metaclass for all DSL types in the system. + + This metaclass provides type system infrastructure for DSL types, handling MLIR + type mappings and NumPy type conversions. 
+ + All data types in DSL must provide the following methods: + + :param mlir_type: Corresponding MLIR type for this DSL type + :type mlir_type: Any, optional + :param is_abstract: Whether this type is abstract, defaults to False + :type is_abstract: bool, optional + + **Required Methods** + + * ``__str__`` (classmethod): Return string representation of the type + * ``__c_pointers__`` (optional): Return list of ctypes pointers of data used to invoke JIT function + * ``__get_mlir_types__``: Return list of MLIR types of the MLIR values contained in the instance + * ``__extract_mlir_values__``: Return list of MLIR values contained in the instance + * ``__new_from_mlir_values__``: Return a new instance from list of MLIR values + + **Attributes** + + :ivar _ir: MLIR provider + :vartype _ir: Any + :ivar _T: MLIR Type system provider + :vartype _T: Any + + **Properties** + + :property mlir_type: Returns the corresponding MLIR type for this DSL type + :type mlir_type: Any + + **Examples** + + Define a custom data type: + + .. code-block:: python + + class CustomData(metaclass=DslType, ...): + def __init__(self, int_value, ...): + self.int_value = int_value + ... + + def __str__(cls): + return "CustomData[int, ...]" + + def __c_pointers__(self): + return [ctypes.pointer(ctypes.c_int32(self.int_value)), ...] + + def __get_mlir_types__(self): + return [_T.i32(), ...] + + def __extract_mlir_values__(self): + return [self.int_value, ...] + + def __new_from_mlir_values__(self, values): + return CustomData(values[0], ...) + + For JIT function calls, MLIR values are extracted with ``__extract_mlir_values__``: + + .. code-block:: python + + @jit + def caller(): + x = CustomData(1, ...) + return foo(x) + + .. code-block:: mlir + + func @caller() -> i32 { + %0 = func.call @foo(%arg0, ...) : (i32, ...) 
-> i32 + return %0 : i32 + } + """ + + _is_abstract: bool + + def __new__(cls, name, bases, attrs, is_abstract=False, **kwargs): + new_cls = super().__new__(cls, name, bases, attrs) + + new_cls._is_abstract = is_abstract + + return new_cls + + @property + def is_abstract(cls): + return cls._is_abstract + + +class NumericMeta(DslType): + """Metaclass for numeric types providing width and numpy dtype information. + + :param width: Bit width of the numeric type, defaults to 8 + :type width: int + :param np_dtype: Corresponding NumPy dtype + :type np_dtype: numpy.dtype, optional + :param mlir_type: Corresponding MLIR type + :type mlir_type: Any, optional + :param is_abstract: Whether the type is abstract, defaults to False + :type is_abstract: bool, optional + + :ivar width: Bit width of the numeric type + :type width: int + :ivar _np_dtype: Corresponding NumPy dtype + :type _np_dtype: Union[numpy.dtype, None] + + :property numpy_dtype: Returns the corresponding NumPy dtype + :rtype numpy_dtype: numpy.dtype + """ + + width: int + + # Placeholder type + _mlir_type = Any + _np_dtype: Union[np.dtype, None] + + def __new__( + cls, + name, + bases, + attrs, + width=8, + np_dtype=None, + mlir_type=None, + is_abstract=False, + **kwargs, + ): + def _extract_mlir_values(self): + return [self.ir_value()] + + def _new_from_mlir_values(self, values: list) -> "Numeric": + res_ty = type(self) + return res_ty(values[0]) + + new_attrs = { + "__extract_mlir_values__": _extract_mlir_values, + "__new_from_mlir_values__": _new_from_mlir_values, + } + new_cls = super().__new__( + cls, + name, + bases, + new_attrs | attrs, + is_abstract=is_abstract, + **kwargs, + ) + + if mlir_type is not None: + new_cls._mlir_type = staticmethod(mlir_type) + + new_cls.width = width + new_cls._np_dtype = np_dtype + return new_cls + + @property + def numpy_dtype(cls): + return cls._np_dtype + + @property + def is_integer(cls) -> bool: ... + + @property + def is_float(cls) -> bool: ... 
+ + def is_same_kind(cls, other: Type) -> bool: + return cls.is_integer == other.is_integer or cls.is_float == other.is_float + + @staticmethod + def from_python(value: Any) -> Type["Numeric"]: + """ + Deduce the DSL type from a Python value. + """ + if isinstance(value, int): + return Int32 + elif isinstance(value, float): + return Float32 + elif isinstance(value, bool): + return Boolean + raise DSLRuntimeError( + f"Could not deduce Type[Numeric] from python value: {value} :{type(value)}" + ) + + @property + def mlir_type(cls): + return cls._mlir_type() # type: ignore + + +Value = TypeVar("Value") + + +def cast(obj: Union[bool, int, float, Value], type_: Type["Numeric"]) -> "Numeric": + """Cast an object to the specified numeric type. + + :param obj: Object to be cast + :type obj: Union[bool, int, float, Value] + :param type_: Target numeric type + :type type_: Type[Numeric] + :raises TypeError: If casting to an abstract type or unsupported type conversion + :return: Object cast to the target numeric type + :rtype: Numeric + + Example:: + >>> x = cast(5, Int32) # Cast integer to Int32 + >>> y = cast(3.14, Float32) # Cast float to Float32 + """ + if type_.is_abstract: + if not isinstance(obj, type_): + raise TypeError( + f"can't cast {obj} to {type_}. Pass in concrete type instead, " + "e.g. Int32, Float32, etc." + ) + # If target_type is abstract, and value is instance of target_type, + # then we can return value as is + else: + # Implicit cast based on using annotation type + obj = type_(obj) + return obj + + +# Option 1: use ir.Value as base +# class IntegerMeta(DslType, type(ir.Value)): +class IntegerMeta(NumericMeta): + """Metaclass for integer types providing signedness information. 
+ + :param width: Bit width of the integer type, defaults to 32 + :type width: int + :param signed: Whether the integer type is signed, defaults to True + :type signed: bool + :param mlir_type: Corresponding MLIR type, defaults to None + :type mlir_type: Any, optional + + :ivar signed: Whether the integer type is signed + :vartype signed: bool + :ivar arith: Arithmetic operations interface + :vartype arith: Any + """ + + signed: bool + + def __new__( + cls, + name, + bases, + attrs, + width=32, + signed=True, + mlir_type=None, + is_abstract=False, + ): + if width == 1: + np_dtype = np.bool_ + elif width == 128: + np_dtype = None + elif signed: + np_dtype = getattr(np, f"int{width}") + else: + np_dtype = getattr(np, f"uint{width}") + + def _c_pointers(self): + if width == 1: + c_value = ctypes.c_bool(self.value) + elif signed: + c_value = getattr(ctypes, f"c_int{width}")(self.value) + else: + c_value = getattr(ctypes, f"c_uint{width}")(self.value) + + return [ctypes.cast(ctypes.pointer(c_value), ctypes.c_void_p)] + + new_attrs = { + "__c_pointers__": _c_pointers, + } + new_cls = super().__new__( + cls, name, bases, attrs | new_attrs, width, np_dtype, mlir_type, is_abstract + ) + new_cls.signed = signed + return new_cls + + def __str__(cls): + return f"{cls.__name__}" + + @property + def is_integer(cls) -> bool: + return True + + @property + def is_float(cls) -> bool: + return False + + @property + def zero(cls) -> int: + return 0 + + @property + def min(cls) -> int: + if cls.signed: + return -(2 ** (cls.width - 1)) + else: + return 0 + + @property + def max(cls) -> int: + if cls.signed: + return 2 ** (cls.width - 1) - 1 + else: + return 2**cls.width - 1 + + def recast_width(cls, width): + return eval(f"Int{width}") + + +class FloatMeta(NumericMeta): + """Metaclass for floating-point types. + + This metaclass provides type system infrastructure for floating-point types in the DSL, + handling MLIR type mappings and NumPy type conversions. 
+ + :param width: Bit width of the float type, defaults to 32 + :type width: int + :param mlir_type: Corresponding MLIR type, defaults to None + :type mlir_type: Any, optional + :param is_abstract: Whether this is an abstract base class, defaults to False + :type is_abstract: bool, optional + + :ivar _arith: Arithmetic operations interface + :vartype _arith: Any + """ + + _exponent_width: int + _mantissa_width: int + + def __new__(cls, name, bases, attrs, width=32, mlir_type=None, is_abstract=False): + np_dtype = getattr(np, name.lower(), None) + new_cls = super().__new__( + cls, name, bases, attrs, width, np_dtype, mlir_type, is_abstract + ) + # Extract exponent and mantissa bits from class name if it follows Float pattern + # For example: Float8E4M3 -> exponent_width=4, mantissa_width=3 + import re + + if not is_abstract: + match = re.match(r"Float(\d+)E(\d+)M(\d+)(?:.*)", name) + if match: + exp_bits = int(match.group(2)) + mant_bits = int(match.group(3)) + + # Store extracted values as class attributes + new_cls._exponent_width = exp_bits + new_cls._mantissa_width = mant_bits + # Don't have 1-to-1 mapping of narrow precision types like bfloat16, tfloat32, etc. 
+ return new_cls + + def __str__(cls): + return f"{cls.__name__}" + + @property + def is_integer(cls) -> bool: + return False + + @property + def is_float(cls) -> bool: + return True + + @property + def zero(cls) -> float: + return 0.0 + + @property + def inf(cls) -> float: + return float("inf") + + @property + def nan(cls) -> float: + return float("nan") + + @property + def exponent_width(cls) -> int: + return cls._exponent_width + + @property + def mantissa_width(cls) -> int: + return cls._mantissa_width + + def recast_width(cls, width): + return eval(f"Float{width}") + + +def _arith_signless_to_int(a, target_type): + # is_signed: sign of result type + if target_type.width > a.type.width: + # arith dialect consider `1` in `i1` as `-1`, treat it as unsigned for DSL + if target_type.signed and a.type.width > 1: + return arith.extsi(target_type.mlir_type, a) + else: + return arith.extui(target_type.mlir_type, a) + elif target_type.width < a.type.width: + return arith.trunci(target_type.mlir_type, a) + else: + return a + + +def _binary_op_type_promote(a, b, promote_bool: bool = False): + """Promote two numeric operands following type promotion rules. + + :param a: First numeric operand + :type a: Numeric + :param b: Second numeric operand + :type b: Numeric + :param promote_bool: Whether to promote boolean types to Int32 for arithmetic operations, defaults to False + :type promote_bool: bool, optional + :raises ValueError: If implicit float promotion is not supported between the given types + :return: Tuple containing promoted operands and their resulting type + :rtype: tuple[Numeric, Numeric, Type[Numeric]] + + Type promotion rules: + 1. If operands are same type and not bools needing promotion: + - No promotion needed, return original types + 2. If either operand is float: + a. If one is float and one is int: + - Convert int to the float type + b. 
If both are float: + - Promote to higher precision float if width >= 16 + - For same width, promote to more general type (Float32 over TFloat32) + - Otherwise raise ValueError for unsupported promotion + 3. Otherwise, both operands are integers. Integer promotion rules: + a. If promote_bool is True and either operand is bool: + - Promote bool to Int32 for arithmetic operations + + Exceptions for numpy dtype casting: + - array(dtype=np.bool_) + array(dtype=np.bool_) -> array(dtype=np.bool_) + + What is not supported: + - promotion with narrow precision float types which requires explicit cast by user + """ + a_type = a.dtype + b_type = b.dtype + + # Early return for same types (except when they're bools that need promotion) + if a_type == b_type and not (promote_bool and a_type.width == 1): + return a, b, a_type + + # Handle floating point promotions + if a_type.is_float or b_type.is_float: + # Get highest precision float type based on bitwidth + a_width = getattr(a_type, "width", 0) + b_width = getattr(b_type, "width", 0) + + # If one type is integer, convert it to the float type + if a_type.is_float and not b_type.is_float: + b_type = a_type.recast_width(max(a_width, b_width)) + elif b_type.is_float and not a_type.is_float: + a_type = b_type.recast_width(max(a_width, b_width)) + + # Both are float types - handle precision promotion + if a_width > b_width and a_width >= 16: + res_type = a_type + elif b_width > a_width and b_width >= 16: + res_type = b_type + elif a_width == b_width: + # Same bitwidth - handle special cases like TFloat32 -> Float32 and BFloat16 -> Float16 + if a_type is Float64 or b_type is Float64: + res_type = Float64 + elif a_type is Float32 or b_type is Float32: + res_type = Float32 + elif a_type is Float16 or b_type is Float16: + res_type = Float16 + else: + raise ValueError( + f"implicit float promotion of {a_type} or {b_type} is not supported, cast explicitly" + ) + else: + raise ValueError( + f"implicit float promotion of {a_type} or 
{b_type} is not supported, cast explicitly" + ) + + # Only convert if type is different + new_a = a.to(res_type) if a.dtype != res_type else a + new_b = b.to(res_type) if b.dtype != res_type else b + return new_a, new_b, res_type + + # Handle bool promotion for arithmetic operations + if promote_bool: + if a_type is Boolean and b_type is Boolean: + # Only promote to Int32 when both are bool + a = a.to(Int32) + b = b.to(Int32) + a_type = b_type = a.dtype + + # If both were bools, they're now same type (Int32) + if a_type == b_type: + return a, b, a_type + + # Same type, no promotion needed + if a_type == b_type: + return a, b, a_type + + a_signed = a_type.signed + b_signed = b_type.signed + a_width = a_type.width + b_width = b_type.width + + # Mixed signedness case + if a_signed != b_signed: + unsigned_type = a_type if not a_signed else b_type + signed_type = a_type if a_signed else b_type + unsigned_width = a_width if not a_signed else b_width + + if unsigned_width >= signed_type.width: + # Promote both to unsigned of larger width + res_type = unsigned_type + else: + # Promote both to signed of larger width + res_type = signed_type + + new_a = a.to(res_type) if a.dtype != res_type else a + new_b = b.to(res_type) if b.dtype != res_type else b + return new_a, new_b, res_type + + # Same signedness, different width - promote to larger width + if a_width >= b_width: + return a, b.to(a.dtype), a.dtype + else: + return a.to(b.dtype), b, b.dtype + + +def _binary_op(op, promote_operand=True, promote_bool=False, flip=False): + """Wrapper for binary operations on Numeric types. + + This wrapper handles type promotion, operation execution, and result type determination + for binary operations between Numeric types. 
+ + :param op: The binary operation to perform (e.g., operator.add, operator.sub) + :type op: callable + :param emitter: Function that emits the MLIR operation for dynamic values + :type emitter: callable + :param promote_operand: Whether to promote operands to the same type, defaults to True + :type promote_operand: bool, optional + :param promote_bool: Whether to promote boolean results to Boolean type, defaults to False + :type promote_bool: bool, optional + :param flip: Whether to flip the operands when calling the operation, defaults to False + :type flip: bool, optional + + :raises TypeError: When an unsupported operation is attempted on specific numeric types + + .. note:: + Not all operations are supported for all numeric types. In particular: + + - Subtraction is not fully supported for Integer types + - Multiplication, floor division, and modulo operations may have limited support + - Division (truediv) with integer types is not fully supported and converts to Float32 + """ + + def wrapper(lhs, rhs, *, loc=None, ip=None): + orig_lhs_type = type(lhs) + orig_rhs_type = type(rhs) + + # When called directly with self and other + ty = type(lhs) + # Canonicalize to Numeric type for promotion + if not isinstance(rhs, Numeric): + if not isinstance(rhs, (ArithValue, int, float, bool)): + # This allows rhs class to implement __rmul__ + return NotImplemented + + if isinstance(rhs, ArithValue): + if isinstance(rhs.type, ir.VectorType): + return NotImplemented + + rhs = as_numeric(rhs) + + # default result type to left-hand-side + res_type = ty + + if promote_operand: + lhs, rhs, res_type = _binary_op_type_promote(lhs, rhs, promote_bool) + else: + rhs = ty(rhs) + + if op in ( + operator.lt, + operator.le, + operator.gt, + operator.ge, + operator.eq, + operator.ne, + ): + res_type = Boolean + elif op == operator.truediv and isinstance(lhs, Integer): + res_type = Float32 + elif promote_bool and orig_lhs_type == Boolean and orig_rhs_type == Boolean: + res_type = Boolean 
+ + if isinstance(lhs.value, ArithValue) and isinstance(lhs, Integer): + lhs_val = lhs.value.with_signedness(lhs.signed) + else: + lhs_val = lhs.value + + if isinstance(rhs.value, ArithValue) and isinstance(rhs, Integer): + rhs_val = rhs.value.with_signedness(rhs.signed) + else: + rhs_val = rhs.value + + if flip: + lhs_val, rhs_val = rhs_val, lhs_val + + # Check if the operation is supported by the operands + res_val = op(lhs_val, rhs_val) + return res_type(res_val, loc=loc, ip=ip) + + return wrapper + + +class Numeric(metaclass=NumericMeta, is_abstract=True): + """Base class for all numeric types in the DSL. + + This class provides the foundation for both Integer and Float types, + implementing basic arithmetic operations. + + :param value: The value to store in the numeric type + :type value: Union[bool, int, float, Value] + + :ivar value: The stored numeric value + :vartype value: Union[bool, int, float, Value] + """ + + def __init__(self, value: Union[bool, int, float, Value], *, loc=None, ip=None): + self.value = value + + def __str__(self) -> str: + # Use member's pretty-str method if member object has method. + # This can be extended in future to have better support for IDE, jupyter notebook, etc. + pretty_str = getattr(self.value, "pretty_str", None) + if pretty_str is not None: + return pretty_str() + else: + return "?" + + def __repr__(self) -> str: + return f"{self.__class__.__name__}({repr(self.value)})" + + def __hash__(self): + return hash(type(self).__class__) ^ hash(self.value) + + @property + def dtype(self) -> Type["Numeric"]: + return type(self) + + @overload + def to(self, dtype: Type["Numeric"], *, loc=None, ip=None) -> "Numeric": ... + + @overload + def to(self, dtype: Type[int], *, loc=None, ip=None) -> int: ... + + @overload + def to(self, dtype: Type[float], *, loc=None, ip=None) -> float: ... + + @overload + def to(self, dtype: Type[bool], *, loc=None, ip=None) -> bool: ... 
+ + @overload + def to(self, dtype: Type[ir.Value], *, loc=None, ip=None) -> ir.Value: ... + + def to(self, dtype: Type, *, loc=None, ip=None): + """Convert this numeric value to another numeric type. + + If the target type is the same as the current type, returns self. + Otherwise, creates a new instance of the target type with the same value. + + :param dtype: The target numeric type to convert to + :type dtype: Union[Type["Numeric"], Type[int], Type[float], Type[bool]] + :return: A new instance of the target type, or self if types match + :rtype: Numeric + :raises TypeError: If trying to convert an MLIR value to a static Python type + :raises TypeError: If trying to convert to unsupported float types like Float8E4M3, + Float8E4M3B11FNUZ, Float4E2M1FN, Float6E3M2FN, or Float6E2M3FN + + .. note:: + + Unsupported destination float types: + - Float8E4M3 + - Float8E4M3B11FNUZ + - Float4E2M1FN + - Float6E3M2FN + - Float6E2M3FN + + Example:: + + .. code-block:: python + + # Convert between DSL numeric types + x = Int32(5) + y = x.to(Float32) # Converts to Float32(5.0) + + # Convert to Python primitive types + # They are considered as static values at JIT time + z = x.to(int) # Returns Python int 5 + w = y.to(float) # Returns Python float 5.0 + + # This will raise a ValueError + mlir_val = arith.constant(T.i32(), 42) + num = Int32(mlir_val) + num.to(int) # ValueError: unable to convert MLIR value to static type: + """ + if dtype in _unsupported_dst_float_types: + raise TypeError(f"Unsupported destination float type: {dtype}") + + if isinstance(dtype, type(self)): + return self + elif isinstance(dtype, NumericMeta): + return dtype(self) + elif dtype is ir.Value: + if isinstance(self.value, (int, float, bool)): + res = arith_helper.const( + self.value, self.dtype.mlir_type, loc=loc, ip=ip + ) + elif isinstance(self.value, ir.Value): + res = self.value + else: + raise ValueError( + f"cannot convert {type(self)} to {dtype}, " + f"self.value is {self.value.type}" + ) + + if 
not isinstance(res, ArithValue): + raise ValueError(f"Expected ArithValue, got {type(res)} as {res.type}") + + return res.with_signedness(getattr(type(self), "signed", None)) + elif dtype in (int, float, bool): + if isinstance(self.value, ir.Value): + raise ValueError( + f"unable to convert {self.value} to static type: {dtype}" + ) + return dtype(self.value) + else: + raise ValueError(f"unable to convert {type(self)} to {dtype}") + + def ir_value(self, *, loc=None, ip=None) -> ir.Value: + return self.to(ir.Value, loc=loc, ip=ip) + + @property + def zero(self) -> "Numeric": ... + + def __dsl_not__(self, *, loc=None, ip=None): + """DSL implementation of Python's `not` operator. + + Returns True if the value is equal to zero, False otherwise. + This matches Python's behavior where any non-zero number is considered True. + + :param loc: The source location information, defaults to None + :type loc: Optional[Location] + :param ip: The insertion point for the operation, defaults to None + :type ip: Optional[InsertionPoint] + :return: The result of the logical not operation + :rtype: Boolean + """ + ty = type(self) + zero_val = arith.constant(ty.mlir_type, ty.zero) + return self.__eq__(ty(zero_val), loc=loc, ip=ip) + + def __dsl_and__(self, other, *, loc=None, ip=None): + """DSL implementation of Python's `and` operator. + + Returns the second operand if the first is truthy, otherwise returns the first operand. + A numeric value is considered truthy if it is non-zero. + + :param other: The right-hand operand + :type other: Numeric + :param loc: The source location information, defaults to None + :type loc: Optional[Location] + :param ip: The insertion point for the operation, defaults to None + :type ip: Optional[InsertionPoint] + :return: The result of the logical and operation + :rtype: Boolean + + Example:: + + 5 and 3 -> 3 + 0 and 3 -> 0 + 3 and 0 and ... 
-> 0 + """ + is_true = self.__dsl_bool__(loc=loc, ip=ip) + + def and_op(lhs, rhs): + if isinstance(lhs, (int, float, bool)): + if isinstance(rhs, (int, float, bool)): + return lhs and rhs + else: + lhs = arith.constant(rhs.type, lhs) + return arith.select(is_true.ir_value(), rhs, lhs, loc=loc, ip=ip) + else: + if isinstance(rhs, (int, float, bool)): + rhs = arith.constant(lhs.type, rhs) + return arith.select(is_true.ir_value(), rhs, lhs, loc=loc, ip=ip) + else: + return arith.select(is_true.ir_value(), rhs, lhs, loc=loc, ip=ip) + + return _binary_op(and_op, promote_bool=True)(self, other, loc=loc, ip=ip) + + def __dsl_or__(self, other, *, loc=None, ip=None): + """DSL implementation of Python's `or` operator. + + Returns the first operand if it is truthy, otherwise returns the second operand. + A numeric value is considered truthy if it is non-zero. + + :param other: The right-hand operand + :type other: Numeric + :param loc: The source location information, defaults to None + :type loc: Optional[Location] + :param ip: The insertion point for the operation, defaults to None + :type ip: Optional[InsertionPoint] + :return: The result of the logical or operation + :rtype: Boolean + + Example:: + + 5 or 3 -> 5 + 0 or 3 -> 3 + 3 or 0 -> 3 + """ + is_true = self.__dsl_bool__(loc=loc, ip=ip) + + def or_op(lhs, rhs): + if isinstance(lhs, (int, float, bool)): + if isinstance(rhs, (int, float, bool)): + return lhs or rhs + else: + lhs = arith.constant(rhs.type, lhs) + return arith.select(is_true.ir_value(), lhs, rhs, loc=loc, ip=ip) + else: + if isinstance(rhs, (int, float, bool)): + rhs = arith.constant(lhs.type, rhs) + return arith.select(is_true.ir_value(), lhs, rhs, loc=loc, ip=ip) + else: + return arith.select(is_true.ir_value(), lhs, rhs, loc=loc, ip=ip) + + return _binary_op(or_op, promote_bool=True)(self, other, loc=loc, ip=ip) + + def __dsl_bool__(self, *, loc=None, ip=None) -> "Boolean": + """DSL implementation of Python's __bool__ method. 
+ + Returns a Boolean indicating whether this value is considered truthy. + For numeric types, returns True if the value is non-zero. + + :param loc: The source location information, defaults to None + :type loc: Optional[Location] + :param ip: The insertion point for the operation, defaults to None + :type ip: Optional[InsertionPoint] + :return: True if this value is truthy (non-zero), False otherwise + :rtype: Boolean + """ + zero = type(self).zero + return self.__ne__(zero, loc=loc, ip=ip) + + def __bool__(self): + if isinstance(self.value, (int, float, bool)): + return bool(self.value) + else: + raise DSLRuntimeError( + f"Unable to convert dynamic `{type(self).__name__}` value to bool at compile time.", + suggestion=[ + "Decorate the parent function with `jit` decorator and with `preprocess` enabled.", + "Ensure not using patterns that DSL does not support.", + "Otherwise, please file a bug report.", + ], + ) + + def __neg__(self, *, loc=None, ip=None): + if isinstance(self, (bool, int, float)): + return type(self)(-self.value) # type: ignore + else: + return type(self)(-self.value, loc=loc, ip=ip) # type: ignore + + @staticmethod + def _from_python_value(value): + if isinstance(value, Numeric): + return value + + if isinstance(value, bool): + res_type = Boolean + elif isinstance(value, int): + res_type = Int32 + elif isinstance(value, float): + res_type = Float32 + elif isinstance(value, ArithValue): + res_type = Numeric.from_mlir_type(value.type) + else: + raise ValueError( + f"unable to convert {value} in type {type(value)} to Numeric" + ) + return res_type(value) + + def __add__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.add, promote_bool=True)(self, other, loc=loc, ip=ip) + + def __sub__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.sub, promote_bool=True)(self, other, loc=loc, ip=ip) + + def __mul__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.mul, 
promote_bool=True)(self, other, loc=loc, ip=ip) + + def __floordiv__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.floordiv, promote_bool=True)( + self, other, loc=loc, ip=ip + ) + + def __truediv__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.truediv, promote_bool=True)( + self, other, loc=loc, ip=ip + ) + + def __mod__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.mod, promote_bool=True)(self, other, loc=loc, ip=ip) + + def __radd__(self, other, *, loc=None, ip=None) -> "Numeric": + return self.__add__(other, loc=loc, ip=ip) + + def __rsub__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.sub, promote_bool=True, flip=True)( + self, other, loc=loc, ip=ip + ) + + def __rmul__(self, other, *, loc=None, ip=None) -> "Numeric": + return self.__mul__(other, loc=loc, ip=ip) + + def __rfloordiv__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.floordiv, promote_bool=True, flip=True)( + self, other, loc=loc, ip=ip + ) + + def __rtruediv__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.truediv, promote_bool=True, flip=True)( + self, other, loc=loc, ip=ip + ) + + def __rmod__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.mod, promote_bool=True, flip=True)( + self, other, loc=loc, ip=ip + ) + + def __eq__(self, other, *, loc=None, ip=None) -> "Boolean": + return _binary_op(operator.eq)(self, other, loc=loc, ip=ip) # type: ignore + + def __ne__(self, other, *, loc=None, ip=None) -> "Boolean": + return _binary_op(operator.ne)(self, other, loc=loc, ip=ip) # type: ignore + + def __lt__(self, other, *, loc=None, ip=None) -> "Boolean": + return _binary_op(operator.lt)(self, other, loc=loc, ip=ip) # type: ignore + + def __le__(self, other, *, loc=None, ip=None) -> "Boolean": + return _binary_op(operator.le)(self, other, loc=loc, ip=ip) # type: ignore + + def 
__gt__(self, other, *, loc=None, ip=None) -> "Boolean": + return _binary_op(operator.gt)(self, other, loc=loc, ip=ip) # type: ignore + + def __ge__(self, other, *, loc=None, ip=None) -> "Boolean": + return _binary_op(operator.ge)(self, other, loc=loc, ip=ip) # type: ignore + + def __pow__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.pow)(self, other, loc=loc, ip=ip) # type: ignore + + def __c_pointers__(self): + raise ValueError( + f"only support built-in types: bool, (u)int{8, 16, 32, 64}, float{32, 64}, but got {type(self)}" + ) + + def __get_mlir_types__(self): + return [type(self).mlir_type] + + @staticmethod + def from_mlir_type(mlir_type): + type_map = { + T.bool(): Boolean, + T.f64(): Float64, + T.f32(): Float32, + T.tf32(): TFloat32, + T.f16(): Float16, + T.bf16(): BFloat16, + T.i(128): Int128, + T.i64(): Int64, + T.i32(): Int32, + T.i16(): Int16, + T.i8(): Int8, + T.si(128): Int128, + T.si64(): Int64, + T.si32(): Int32, + T.si16(): Int16, + T.si8(): Int8, + T.ui(128): Uint128, + T.ui64(): Uint64, + T.ui32(): Uint32, + T.ui16(): Uint16, + T.ui8(): Uint8, + T.f8E5M2(): Float8E5M2, + T.f8E4M3(): Float8E4M3, + T.f8E4M3FN(): Float8E4M3FN, + T.f8E4M3B11FNUZ(): Float8E4M3B11FNUZ, + T.f4E2M1FN(): Float4E2M1FN, + T.f6E2M3FN(): Float6E2M3FN, + T.f6E3M2FN(): Float6E3M2FN, + T.f8E8M0FNU(): Float8E8M0FNU, + } + if mlir_type not in type_map: + raise DSLRuntimeError(f"Unsupported DSL type: {mlir_type}") + return type_map[mlir_type] + + +def as_numeric(obj: Union[bool, int, float, ir.Value, Numeric]) -> Numeric: + """Convert a Python primitive value to a Numeric type. + + :param obj: Python primitive value to convert + :type obj: Union[bool, int, float] + :return: The converted Numeric object + :rtype: Numeric + + Example:: + + .. 
code-block:: python + + x = as_numeric(5) # Converts to Int32 + y = as_numeric(3.14) # Converts to Float32 + z = as_numeric(True) # Converts to Boolean + """ + if isinstance(obj, Numeric): + return obj + return Numeric._from_python_value(obj) + + +class Integer(Numeric, metaclass=IntegerMeta, mlir_type=T.i32, is_abstract=True): + """A class representing integer values with specific width and signedness. + + This class provides functionality to create and manipulate integer values with + configurable width and signedness. It supports conversion from various input types + including Python scalars, MLIR Values, and other numeric types. + + :param x: The input value to convert to this integer type + :type x: Union[bool, int, float, ir.Value, Integer, Float] + + :return: A new Integer instance with the converted value + :rtype: Integer + + :raises AssertionError: If the type's numpy_dtype is None + :raises NotImplementedError: If converting between different Integer types + :raises ValueError: If the input type is not supported for conversion + :raises OverflowError: If converting float infinity to integer + + Type conversion behavior: + + * Python scalars (bool, int, float): + * Converted through numpy dtype casting + * NaN and infinity values are rejected + * Example: Int8(256) -> -256 (overflow behavior) + + * MLIR Value with IntegerType: + * Width differences handled by signless to signed/unsigned conversion + * Example: i8 -> i8/ui8 depending on target type + + * MLIR Value with FloatType: + * Uses MLIR float-to-int conversion + * NaN and infinity values is undefined behavior + * Example: f32 -> i32/ui32 depending on target type + + * Integer: + * Uses MLIR float-to-int conversion or numpy dtype casting + * Example: Int32(Int32(5)) => 5 + + * Float: + * Uses MLIR float-to-int conversion + * Example: Int32(Float(5.7)) -> 5 + + Example usage: + + .. 
code-block:: python + + x = Int32(5) # From integer + y = Int32(True) # From boolean + z = Int32(3.7) # From float (truncates) + w = Int32(x) # From same Integer type + c5 = arith.constant(5, T.i32()) + a = Int32(c5) # Treat c5 as int32 bitwise + """ + + def __init__(self, x, *, loc=None, ip=None): + ty = type(self) + + if isinstance(x, (bool, int, float)): + # Add check for NaN before numpy conversion + if isinstance(x, float): + if np.isnan(x): + raise ValueError("Cannot convert float NaN to integer") + elif np.isinf(x): + raise OverflowError("Cannot convert float infinity to integer") + + np_dtype = ty.numpy_dtype + assert np_dtype is not None, f"expects numpy.dtype, but got {np_dtype}" + x_val = int(np.array(x).astype(np_dtype)) + elif type(x) == ty: + x_val = x.value + elif isinstance(x, ir.Value): # type: ignore + x_val = x + if isinstance(x.type, ir.IntegerType): # type: ignore + if x.type.width != ty.width: + # signless -> (u)int + x_val = _arith_signless_to_int(x, ty) + elif isinstance(x.type, ir.FloatType): # type: ignore + # float -> (u)int + x_val = arith_helper.fptoi(x, ty.signed, ty.mlir_type, loc=loc, ip=ip) + elif isinstance(x, Integer): + if isinstance(x.value, ir.Value): + x_val = arith_helper.int_to_int(x.ir_value(), ty) + else: + # For non-MLIR values, use numpy casting + src_val = np.array(x.value, dtype=type(x).numpy_dtype) + x_val = int(src_val.astype(ty.numpy_dtype)) + elif isinstance(x, Float): + # float -> int is handled by Integer.__init__ recursively + Integer.__init__(self, x.value) + return + else: + raise DSLRuntimeError(f"{x} to integer conversion is not supported") + + super().__init__(x_val) + + def __invert__(self, *, loc=None, ip=None): + res_type = type(self) + # Create a constant of -1 (all bits set to 1) of the same type as value + all_ones = arith.constant(res_type.mlir_type, -1) + # XOR with -1 gives us bitwise NOT + return res_type(arith.xori(self.ir_value(), all_ones, loc=loc, ip=ip)) + + def __lshift__(self, other, *, 
loc=None, ip=None): + return _binary_op(operator.lshift)(self, other, loc=loc, ip=ip) + + def __rlshift__(self, other, *, loc=None, ip=None): + other_ = as_numeric(other) + if not isinstance(other_, Integer): + raise ValueError(f"Cannot left shift {other_} with {self}") + return other_.__lshift__(self, loc=loc, ip=ip) + + def __rshift__(self, other, *, loc=None, ip=None): + return _binary_op(operator.rshift)(self, other, loc=loc, ip=ip) + + def __rrshift__(self, other, *, loc=None, ip=None): + other_ = as_numeric(other) + if not isinstance(other_, Integer): + raise ValueError(f"Cannot right shift {other_} with {self}") + return other_.__rshift__(self, loc=loc, ip=ip) + + def __and__(self, other, *, loc=None, ip=None): + return _binary_op(operator.and_)(self, other, loc=loc, ip=ip) + + def __rand__(self, other, *, loc=None, ip=None): + return self.__and__(other, loc=loc, ip=ip) + + def __or__(self, other, *, loc=None, ip=None): + return _binary_op(operator.or_)(self, other, loc=loc, ip=ip) + + def __ror__(self, other, *, loc=None, ip=None): + return self.__or__(other, loc=loc, ip=ip) + + def __xor__(self, other, *, loc=None, ip=None): + return _binary_op(operator.xor)(self, other, loc=loc, ip=ip) + + def __rxor__(self, other, *, loc=None, ip=None): + return self.__xor__(other, loc=loc, ip=ip) + + +class Float(Numeric, metaclass=FloatMeta, mlir_type=T.f32, is_abstract=True): + """A class representing floating-point values. + + :param x: The input value to convert to this float type. + :type x: Union[bool, int, float, ir.Value, Integer, Float] + + Type conversion behavior: + + 1. Python scalars (bool, int, float): + - Converted through numpy dtype casting + - Example: Float32(1.7) -> 1.7 + + 2. MLIR Value with FloatType: + - If width differs: converts between float types + - Example: f16 -> f32 + + 3. MLIR Value with IntegerType: + - Not supported, raises ValueError + + 4. 
Integer: + - Converts using MLIR int-to-float operation + - Example: Float32(Int32(5)) -> 5.0 + + 5. Float: + - Direct conversion between float types + - Example: Float32(Float32(1.5)) -> 1.5 + + .. note:: + The following narrow precision types are only supported in device code: + + 8-bit float types: + - Float8E5M2 + - Float8E4M3 + - Float8E4M3FN + - Float8E8M0FNU + - Float8E4M3B11FNUZ + + 6-bit float types: + - Float6E3M2FN + - Float6E2M3FN + + 4-bit float types: + - Float4E2M1FN + + Narrow precision types and special floating-point formats support matrix on device: + + :raises AssertionError: If the type's numpy_dtype is None + :raises ValueError: If conversion from the input type is not supported + """ + + def __init__(self, x, *, loc=None, ip=None): + ty = type(self) + + if isinstance(x, (bool, int, float)): # type: ignore + # Why we need to convert x to with numpy? + # np_dtype = ty.numpy_dtype + # assert np_dtype is not None, f"expects numpy.dtype, but got {np_dtype}" + # x = float(np.array(x).astype(np_dtype)) + super().__init__(float(x)) + elif isinstance(x, ir.Value): # type: ignore + if isinstance(x.type, ir.IntegerType): # type: ignore + raise DSLRuntimeError("signless to float conversion is not implemented") + elif isinstance(x.type, ir.FloatType): # type: ignore + if x.type != ty.mlir_type: + x = arith_helper.cvtf(x, ty.mlir_type, loc=loc, ip=ip) + super().__init__(x) + elif isinstance(x, Integer): + if isinstance(x.value, ir.Value): # type: ignore + x = arith_helper.itofp( + x.value, type(x).signed, ty.mlir_type, loc=loc, ip=ip + ) + else: + x = float(x.value) + super().__init__(x) + elif isinstance(x, Float): + Float.__init__(self, x.value) + else: + raise DSLRuntimeError(f"{x} to Float conversion is not supported") + + +class Boolean(Integer, metaclass=IntegerMeta, width=1, signed=True, mlir_type=T.bool): + """Boolean type representation in the DSL. + + This class represents boolean values in the DSL, with a width of 1 bit. 
+ It supports conversion from various types to boolean values. + + :param a: Value to convert to Boolean + :type a: Union[bool, int, float, "Value", Numeric] + :param loc: Source location information, defaults to None + :type loc: Optional[Location], optional + :param ip: Insertion point for MLIR operations, defaults to None + :type ip: Optional[InsertionPoint], optional + :raises DSLRuntimeError: If the input value cannot be converted to Boolean + + Conversion rules: + + 1. Python bool/int/float: + - Converted using Python's bool() function + - Example: Boolean(1) -> True, Boolean(0) -> False + + 2. Boolean: + - Direct value assignment + - Example: Boolean(Boolean(True)) -> True + + 3. Numeric: + - Uses the __dsl_bool__ method of the Numeric type + + 4. MLIR Value with IntegerType: + - If width is 1: Direct assignment + - Otherwise: Compares with 0 using arith.cmpi + + 5. MLIR Value with FloatType: + - Compares with 0.0 using arith.cmpf + - Uses unordered comparison to handle NaN values + """ + + def __init__( + self, a: Union[bool, int, float, ir.Value, Numeric], *, loc=None, ip=None + ): + value = None + if isinstance(a, (bool, int, float)): + value = bool(a) + elif isinstance(a, Boolean): + value = a.value + elif isinstance(a, Numeric): + value = a.__dsl_bool__(loc=loc, ip=ip) + elif isinstance(a, ArithValue): + if a.type == T.bool(): + value = a + else: + value = a != arith_helper.const(0, a.type) + + if value is None: + raise DSLRuntimeError(f"Cannot convert {a} to Boolean") + super().__init__(value, loc=loc, ip=ip) + + def __neg__(self, *, loc=None, ip=None): + """Negation operator is not supported for boolean type. 
+ + :param loc: Source location information, defaults to None + :type loc: Optional[Location], optional + :param ip: Insertion point for MLIR operations, defaults to None + :type ip: Optional[InsertionPoint], optional + :raises TypeError: Always raises this error as negation is not supported + """ + raise TypeError("Negation, the operator `-` is not supported for boolean type") + + +class Int8(Integer, metaclass=IntegerMeta, width=8, signed=True, mlir_type=T.i8): ... + + +class Int16(Integer, metaclass=IntegerMeta, width=16, signed=True, mlir_type=T.i16): ... + + +class Int32(Integer, metaclass=IntegerMeta, width=32, signed=True, mlir_type=T.i32): ... + + +class Int64(Integer, metaclass=IntegerMeta, width=64, signed=True, mlir_type=T.i64): ... + + +class Int128( + Integer, metaclass=IntegerMeta, width=128, signed=True, mlir_type=lambda: T.i(128) +): ... + + +class Uint8(Integer, metaclass=IntegerMeta, width=8, signed=False, mlir_type=T.i8): ... + + +class Uint16( + Integer, metaclass=IntegerMeta, width=16, signed=False, mlir_type=T.i16 +): ... + + +class Uint32( + Integer, metaclass=IntegerMeta, width=32, signed=False, mlir_type=T.i32 +): ... + + +class Uint64( + Integer, metaclass=IntegerMeta, width=64, signed=False, mlir_type=T.i64 +): ... + + +class Uint128( + Integer, metaclass=IntegerMeta, width=128, signed=False, mlir_type=lambda: T.i(128) +): ... 
class Float64(Float, metaclass=FloatMeta, width=64, mlir_type=T.f64):
    """64-bit IEEE-754 floating point DSL type."""

    def __c_pointers__(self):
        # Only a host-side Python float can be marshalled through ctypes.
        if not isinstance(self.value, float):
            raise ValueError("only float is supported")

        boxed = ctypes.c_double(self.value)
        return [ctypes.cast(ctypes.pointer(boxed), ctypes.c_void_p)]


class Float32(Float, metaclass=FloatMeta, width=32, mlir_type=T.f32):
    """32-bit IEEE-754 floating point DSL type."""

    @staticmethod
    def _get_c_pointer(value: float):
        # Box the value as a C float and hand back an opaque pointer to it.
        boxed = ctypes.c_float(value)
        return ctypes.cast(ctypes.pointer(boxed), ctypes.c_void_p)

    def __c_pointers__(self):
        if not isinstance(self.value, float):
            raise ValueError("only float is supported")

        return [Float32._get_c_pointer(self.value)]


class TFloat32(Float, metaclass=FloatMeta, width=32, mlir_type=T.tf32):
    """TensorFloat-32 DSL type (32-bit storage; marshalled like Float32 on host)."""

    def __c_pointers__(self):
        if not isinstance(self.value, float):
            raise ValueError("only float is supported")

        # TF32 occupies 32 bits on the host side, so Float32 marshalling applies.
        return [Float32._get_c_pointer(self.value)]


class Float16(Float, metaclass=FloatMeta, width=16, mlir_type=T.f16):
    """16-bit IEEE-754 (binary16) floating point DSL type."""

    @staticmethod
    def _get_c_pointer(value: float):
        # Round to half precision via numpy, then reinterpret the result as
        # its raw 16-bit pattern so the exact bits cross the FFI boundary.
        raw_bits = np.float16(value).view(np.uint16)
        # A 16-bit ctypes integer carries the bit pattern (sign bit included;
        # ctypes masks the value, so patterns >= 0x8000 are preserved).
        holder = ctypes.c_short(raw_bits)
        return ctypes.cast(ctypes.pointer(holder), ctypes.c_void_p)

    def __c_pointers__(self):
        if not isinstance(self.value, float):
            raise ValueError("only float is supported")

        return [Float16._get_c_pointer(self.value)]


class BFloat16(Float, metaclass=FloatMeta, width=16, mlir_type=T.bf16):
    """bfloat16 DSL type. Host-side ctypes marshalling is not implemented;
    it defers to the base class, which rejects the request."""

    def __c_pointers__(self):
        if not isinstance(self.value, float):
            raise ValueError("only float is supported")

        # Delegates to the Numeric base implementation (raises ValueError).
        return Float.__c_pointers__(self)


class Float8E5M2(Float, metaclass=FloatMeta, width=8, mlir_type=T.f8E5M2): ...


class Float8E4M3FN(Float, metaclass=FloatMeta, width=8, mlir_type=T.f8E4M3FN): ...
class Float8E4M3B11FNUZ(
    Float, metaclass=FloatMeta, width=8, mlir_type=T.f8E4M3B11FNUZ
): ...


# Added missing float types
class Float8E4M3(Float, metaclass=FloatMeta, width=8, mlir_type=T.f8E4M3): ...


class Float8E8M0FNU(Float, metaclass=FloatMeta, width=8, mlir_type=T.f8E8M0FNU): ...


class Float4E2M1FN(Float, metaclass=FloatMeta, width=4, mlir_type=T.f4E2M1FN): ...


class Float6E3M2FN(Float, metaclass=FloatMeta, width=6, mlir_type=T.f6E3M2FN): ...


class Float6E2M3FN(Float, metaclass=FloatMeta, width=6, mlir_type=T.f6E2M3FN): ...


# Float types that are not supported as conversion destinations.
_unsupported_dst_float_types = [
    Float8E4M3,
    Float8E4M3B11FNUZ,
    Float4E2M1FN,
    Float6E3M2FN,
    Float6E2M3FN,
]


ALL_DTYPES = {
    Int8,
    Int16,
    Int32,
    Int64,
    Int128,
    Uint8,
    Uint16,
    Uint32,
    Uint64,
    Uint128,
    BFloat16,
    Float16,
    Float32,
    TFloat32,
    Float64,
    Float8E5M2,
    Float8E4M3,
    Float8E4M3FN,
    Float8E8M0FNU,
    Float8E4M3B11FNUZ,
    Float4E2M1FN,
    Float6E2M3FN,
    Float6E3M2FN,
}
# Maps the class name (e.g. "Float32") to the Numeric subclass itself.
__STR_TO_DTYPE__ = {dt.__name__: dt for dt in ALL_DTYPES}


def dtype(dtype_) -> Type[Numeric]:
    """Interpret ``dtype_`` as a DSL data type.

    :param dtype_: The name of a DSL data type, e.g. ``"Float32"``.
    :raises TypeError: If ``dtype_`` is not the name of a known data type.
    :return: The matching ``Numeric`` subclass.
    """
    t = None
    if const_expr(isinstance(dtype_, str) and dtype_ in __STR_TO_DTYPE__):
        t = __STR_TO_DTYPE__[dtype_]
    else:
        raise TypeError(f"can't interpret {dtype_} as data type")

    return t


##############################################################
# Tensor
##############################################################


class TensorMeta(DslType):
    """
    Examples:
    >>> Tensor[Int32, (3,)]
    >>> Tensor[Float32, (3, 4)]
    >>> T = TypeVar("T")
    >>> Tensor[T, (3, 4, 5)]
    """

    # Class-level defaults; overwritten per-specialization in __new__.
    _element_type = Any
    _shape = Any

    def __new__(cls, name, bases, attrs, element_type=Any, shape=Any):
        new_cls = super().__new__(cls, name, bases, attrs)
        new_cls._element_type = element_type
        new_cls._shape = shape
        return new_cls


# Generic type
TY = TypeVar("TY")


class Constexpr(Generic[TY]):
    """Value is passed and computed by python interpreter"""

    pass


class align:
    """Power-of-two alignment marker, used as the second ``Pointer`` type parameter."""

    def __init__(self, value: int):
        # value & (value - 1) clears the lowest set bit: zero iff power of two.
        if value <= 0 or (value & (value - 1)) != 0:
            raise DSLRuntimeError("expects align be power of 2 as positive value")
        self._value = value

    def __str__(self):
        return f"align({self._value})"


class PointerMeta(DslType):
    def __new__(cls, name, bases, attrs, value_type=Int32, align_=align(1)):
        new_cls = super().__new__(
            cls,
            name,
            bases,
            attrs,
            mlir_type=lambda: getattr(ir, "UnrankedMemRefType").get(
                value_type.mlir_type, getattr(ir, "Attribute").parse("0")
            ),
        )
        new_cls._value_type = value_type
        new_cls._align = align_
        return new_cls

    def __eq__(cls, other):
        if not isinstance(other, PointerMeta):
            return False
        return (
            cls._value_type == other._value_type
            and cls._align._value == other._align._value
        )  # Compare alignment values

    def __hash__(cls):
        return hash((cls._value_type, cls._align._value))  # Hash alignment value

    def __getitem__(cls, params) -> Type["Pointer"]:
        value_type, align_ = params

        if not isinstance(align_, align):
            raise DSLRuntimeError(f"expects align but got {align_}")

        # Create new class with proper name and parameters
        new_cls = type(
            f"Pointer[{value_type.__name__}, {align_}]",
            (Pointer,),
            {},
            value_type=value_type,
            align_=align_,  # Pass alignment to __new__
        )
        return new_cls

    def __str__(cls):
        return f"ptr<{cls._value_type}, {cls._align}>"


class Pointer(metaclass=PointerMeta):
    """
    A pointer to a memory location.

    Examples:

        def foo(a: Pointer[Int32, align(8)]):
            ...
    """

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return f"{self.value} : {type(self)}"


class IRConst(Generic[TY]):
    """Value is passed as MLIR constant value for (arith.constant)."""

    def __init__(self, ty: TY):
        self.ty = ty


class IRValue(Generic[TY]):
    """Value is passed as MLIR dynamic value."""

    def __init__(self, ty: TY):
        self.ty = ty


class IRVariadic:
    """
    A helper class to pass a variadic number of arguments to a function.
    """

    def __init__(self, operands):
        """
        Create a list of variadic operands. `operands` must be SSA values.
        """
        self.operands = operands

    def block_arg_types(self):
        """
        Return the list of block args types.
        """
        return [operand.type for operand in self.operands]

    def set_func_args(self, block_args):
        """
        This function is called after entering a function. `block_args` are the
        block arguments that correspond to the passed operands. Derived classes
        may implement this function to provide convenience getters for block
        arguments.
        """
        pass

    def __len__(self):
        """
        Return the length of variadic operands.
        """
        return len(self.operands)


class FuncArgWithAttr(IRValue):
    """
    This derived class is specifically for func op arg with attr
    """

    def __init__(self, ty, attr_name, attr_ty, attr_value=None):
        super().__init__(ty)
        assert attr_name is not None and (
            attr_ty is not None or attr_value is not None
        ), "Invalid attr_name and/or attr_ty and/or attr_value for FuncArgWithAttr"
        self.attr_name = attr_name
        self.attr_ty = attr_ty
        self.attr_value = attr_value


def implicitDowncastNumericType(value):
    """Unwrap a ``Numeric`` into its underlying MLIR value; pass anything else through."""
    if isinstance(value, Numeric):
        return value.ir_value()
    return value


# NOTE(review): the original list contained "Float8E4M3" twice; deduplicated.
__all__ = [
    "DslType",
    "Numeric",
    "NumericMeta",
    "IntegerMeta",
    "FloatMeta",
    "Boolean",
    "Integer",
    "Int16",
    "Int32",
    "Int64",
    "Int128",
    "Int8",
    "Uint8",
    "Uint16",
    "Uint32",
    "Uint64",
    "Uint128",
    "Float",
    "Float16",
    "BFloat16",
    "TFloat32",
    "Float32",
    "Float64",
    "Float8E5M2",
    "Float8E4M3",
    "Float8E4M3FN",
    "Float8E4M3B11FNUZ",
    "Float8E8M0FNU",
    "Float4E2M1FN",
    "Float6E2M3FN",
    "Float6E3M2FN",
    "as_numeric",
    "align",
    "Pointer",
    "dtype",
    "Constexpr",
    "IRConst",
    "IRValue",
    "IRVariadic",
    "implicitDowncastNumericType",
]

# ===========================================================================
# diff --git a/python/CuTeDSL/base_dsl/utils/__init__.py (new file, index c4bfb2b7)
# ===========================================================================
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.

from . import stacktrace
from . import logger
from . import timer

__all__ = [
    "logger",
    "timer",
    "stacktrace",
]

# ===========================================================================
# diff --git a/python/CuTeDSL/base_dsl/utils/logger.py (new file, index b239f346)
# ===========================================================================
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.

"""
This module provides logging helper functions
"""

import logging

# Module-level logger instance; replaced by setup_log() and read via log().
logger = None


def log():
    """Return the module-level logger configured by :func:`setup_log`."""
    return logger


def setup_log(
    name, log_to_console=False, log_to_file=False, log_file_path=None, log_level=1
):
    """Set up and configure a logger with console and/or file handlers.

    :param name: Name of the logger to create
    :type name: str
    :param log_to_console: Whether to enable logging to console, defaults to False
    :type log_to_console: bool, optional
    :param log_to_file: Whether to enable logging to file, defaults to False
    :type log_to_file: bool, optional
    :param log_file_path: Path to the log file, required if log_to_file is True
    :type log_file_path: str, optional
    :param log_level: Logging level to set, defaults to 1
    :type log_level: int, optional
    :raises ValueError: If log_to_file is True but log_file_path is not provided
    :return: Configured logger instance
    :rtype: logging.Logger
    """
    # Create a custom logger and publish it as the module-level instance.
    global logger
    logger = logging.getLogger(name)
    if log_to_console or log_to_file:
        logger.setLevel(log_level)
    else:
        # No sink requested: leave the logger effectively disabled.
        logger.setLevel(logging.NOTSET)

    # Clear existing handlers to prevent duplicate logs
    if logger.hasHandlers():
        logger.handlers.clear()

    # Define formatter (plain string: there is nothing to interpolate here).
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - [%(funcName)s] - %(message)s"
    )

    # Add console handler if enabled
    if log_to_console:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(log_level)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    # Add file handler if enabled
    if log_to_file:
        if not log_file_path:
            # Fixed message: the flag is named log_to_file, not enable_file.
            raise ValueError("log_file_path must be provided when log_to_file is True")
        file_handler = logging.FileHandler(log_file_path)
        file_handler.setLevel(log_level)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger


# Default module logger; disabled (NOTSET, no handlers) until reconfigured.
logger = setup_log("generic")

# ===========================================================================
# diff --git a/python/CuTeDSL/base_dsl/utils/stacktrace.py (new file, index d2091098)
# ===========================================================================
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.

"""
    This module provides stacktrace helper functions
"""

import os
import re


def walk_to_top_module(start_path):
    """
    Walk up from the start_path to find the top-level Python module.

    :param start_path: The path to start from.
    :return: The path of the top-level module, or None if start_path is not
        inside a module.
    """
    current_path = start_path

    while True:
        # Check if we are at the root directory
        if os.path.dirname(current_path) == current_path:
            break

        # Check for __init__.py
        init_file_path = os.path.join(current_path, "__init__.py")
        if os.path.isfile(init_file_path):
            # If __init__.py exists, move up one level
            current_path = os.path.dirname(current_path)
        else:
            # If no __init__.py, we are not in a module; stop
            break

    # If we reached the root without finding a module, return None
    if os.path.dirname(current_path) == current_path and not os.path.isfile(
        os.path.join(current_path, "__init__.py")
    ):
        return None

    # Return the path of the top-level module
    return current_path


def _filter_internal_frames(traceback, internal_path):
    """
    Unlink, in place, every traceback frame whose source file lives under
    ``internal_path``, hiding internal implementation frames from users.

    :param traceback: The traceback object to filter (mutated in place).
    :param internal_path: Absolute path prefix identifying internal frames.
    :return: The (possibly new) head of the filtered traceback chain.
    """
    iter_prev = None
    iter_tb = traceback
    while iter_tb is not None:
        if os.path.abspath(iter_tb.tb_frame.f_code.co_filename).startswith(
            internal_path
        ):
            # Internal frame: unlink it unless it is the last frame in the
            # chain (the raising frame is always kept).
            if iter_tb.tb_next:
                if iter_prev:
                    iter_prev.tb_next = iter_tb.tb_next
                else:
                    traceback = iter_tb.tb_next
        else:
            # User frame: it becomes the predecessor for future unlinks.
            iter_prev = iter_tb
        iter_tb = iter_tb.tb_next
    return traceback


# Names of functions synthesized by the DSL's AST pre-processor; frames with
# these names are duplicates of the user's source line.
_generated_function_names = re.compile(
    r"^(loop_body|while_region|while_before_block|while_after_block|if_region|then_block|else_block|elif_region)_\d+$"
)


def _filter_duplicated_frames(traceback):
    """
    Filter out duplicated stack frames from the traceback.
    The function filters out consecutive frames that are in the same file and have the same line number.
    In a sequence of consecutive frames, the logic prefers to keep the non-generated frame or the last frame.
    """
    iter_prev = None
    iter_tb = traceback
    while iter_tb is not None:
        skip_current = False
        skip_next = False
        if iter_tb.tb_next:
            current_filename = os.path.abspath(iter_tb.tb_frame.f_code.co_filename)
            next_filename = os.path.abspath(iter_tb.tb_next.tb_frame.f_code.co_filename)
            # if in the same file, check if the line number is the same
            if current_filename == next_filename:
                current_lineno = iter_tb.tb_lineno
                next_lineno = iter_tb.tb_next.tb_lineno
                if current_lineno == next_lineno:
                    # Same file and line number, check name, if current is
                    # generated, skip current, otherwise skip next
                    name = iter_tb.tb_frame.f_code.co_name
                    is_generated = bool(_generated_function_names.match(name))
                    if is_generated:
                        # Skip current
                        skip_current = True
                    else:
                        # Skip next if it's generated, otherwise keep both
                        next_name = iter_tb.tb_next.tb_frame.f_code.co_name
                        skip_next = bool(_generated_function_names.match(next_name))
        if skip_current:
            if iter_prev:
                iter_prev.tb_next = iter_tb.tb_next
            else:
                traceback = iter_tb.tb_next
        elif skip_next:
            # if next is last frame, don't skip
            if iter_tb.tb_next.tb_next:
                iter_tb.tb_next = iter_tb.tb_next.tb_next
            iter_prev = iter_tb
        else:
            iter_prev = iter_tb
        iter_tb = iter_tb.tb_next

    return traceback


def filter_stackframe(traceback, prefix_path):
    """
    Filter out stack frames from the traceback that belong to the specified module path.

    This function removes stack frames from the traceback whose file paths start with
    the given prefix_path, effectively hiding internal implementation details from
    the error traceback shown to users.

    :param traceback: The traceback object to filter.
    :param prefix_path: The path prefix to filter out from the traceback.
    :return: The filtered traceback with internal frames removed.
    """
    # Step 1: filter internal frames
    traceback = _filter_internal_frames(traceback, prefix_path)

    # Step 2: consolidate duplicated frames
    return _filter_duplicated_frames(traceback)


def filter_exception(value, module_dir):
    """
    Filter out internal implementation details from exception traceback.

    This function recursively processes an exception and its cause chain,
    removing stack frames that belong to the specified module directory.
    This helps to present cleaner error messages to users by hiding
    implementation details.

    :param value: The exception object to filter (mutated in place).
    :param module_dir: The module directory path to filter out from tracebacks.
    :return: The filtered exception with internal frames removed.
    """
    if hasattr(value, "__cause__") and value.__cause__:
        filter_exception(value.__cause__, module_dir)

    if hasattr(value, "__traceback__"):
        filter_stackframe(value.__traceback__, module_dir)

    # Return the exception so the documented contract holds; filtering itself
    # happens in place on the traceback chain.
    return value

# ===========================================================================
# diff --git a/python/CuTeDSL/base_dsl/utils/timer.py (new file, index f41d3f74)
# ===========================================================================
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides a timing helper functions +""" +from functools import wraps + +from .logger import log + + +# TODO: revisit this part when mlir timing manager is ready for pybind. +def timer(*dargs, **kwargs): + enable = kwargs.get("enable", True) + + def decorator(func): + @wraps(func) + def func_wrapper(*args, **kwargs): + if not enable: + return func(*args, **kwargs) + from time import time + + start = time() + result = func(*args, **kwargs) + end = time() + + # Convert time from seconds to us + spend_us = (end - start) * 1e6 + + # Determine the function type and format the log message + if hasattr(func, "__name__"): + func_name = func.__name__ + log_message = f"[JIT-TIMER] Function: {func_name} | Execution Time: {spend_us:.2f} µs" + elif "CFunctionType" in str(type(func)): + log_message = f"[JIT-TIMER] C API Function: {str(func)} | Execution Time: {spend_us:.2f} µs" + else: + log_message = f"[JIT-TIMER] Anonymous Function | Execution Time: {spend_us:.2f} µs" + + log().info(log_message) + + return result + + return func_wrapper + + if len(dargs) == 1 and callable(dargs[0]): + return decorator(dargs[0]) + else: + return decorator diff --git a/python/CuTeDSL/cutlass/__init__.py b/python/CuTeDSL/cutlass/__init__.py new file mode 100644 index 00000000..d0e7c93b --- /dev/null +++ b/python/CuTeDSL/cutlass/__init__.py @@ -0,0 +1,57 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from .cutlass_dsl import ( + Constexpr, + as_numeric, + min, + max, + and_, + or_, + all_, + any_, + not_, + all_, + any_, + select_, + # Control-flow without AST pre-processor + if_generate, + for_generate, + LoopUnroll, + while_generate, + yield_out, + # Control-flow with AST pre-processor + range_constexpr, + range_dynamic, + const_expr, + dynamic_expr, + # Data types + dtype, # Provides conversions to types inheriting from NumericType + DSLRuntimeError, + JitArgAdapterRegistry, + # Construction utilities for user-defined classes + extract_mlir_values, + new_from_mlir_values, +) + +from .cute.typing import * + +# Utilities not belonging to CuTe +from . import utils as utils + +# Used as internal symbol +from . import cutlass_dsl as _dsl + +# Aliases +LaunchConfig = _dsl.BaseDSL.LaunchConfig +register_jit_arg_adapter = _dsl.JitArgAdapterRegistry.register_jit_arg_adapter +gpu = _dsl.cutlass_gpu +cuda = _dsl.cuda_helpers diff --git a/python/CuTeDSL/cutlass/cute/__init__.py b/python/CuTeDSL/cutlass/cute/__init__.py new file mode 100644 index 00000000..11496402 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/__init__.py @@ -0,0 +1,310 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.

# Use the auto-generated enum AddressSpace
from cutlass._mlir.dialects.cute import AddressSpace

# The types below are imported by name (in addition to the wildcard import
# further down) so that Sphinx can resolve them: Sphinx processes each module
# in isolation and cannot follow symbols re-exported via `from .typing import *`
# the way the Python runtime does.
from .typing import (
    Shape,
    Stride,
    IntTuple,
    Coord,
    Tile,
    XTuple,
    Tiler,
    Layout,
    Pointer,
    Tensor,
)

# Everything else from the typing module.
from .typing import *

from .core import (
    assume,
    is_integer,
    is_int_tuple,
    is_static,
    size,
    has_underscore,
    slice_,
    make_ptr,
    make_layout,
    recast_layout,
    make_fragment_like,
    depth,
    rank,
    flatten_to_tuple,
    flatten,
    unflatten,
    product,
    product_like,
    shape,
    size_in_bytes,
    make_identity_layout,
    make_ordered_layout,
    make_composed_layout,
    make_layout_tv,
    make_swizzle,
    recast_ptr,
    make_tensor,
    make_identity_tensor,
    make_fragment,
    recast_tensor,
    get,
    select,
    front,
    is_major,
    find,
    coalesce,
    group_modes,
    cosize,
    dice,
    product_each,
    prepend,
    append,
    prepend_ones,
    append_ones,
    ceil_div,
    slice_and_offset,
    crd2idx,
    domain_offset,
    elem_less,
    transform_leaf,
    filter_zeros,
    filter,
    tile_to_shape,
    shape_div,
    composition,
    complement,
    right_inverse,
    left_inverse,
    max_common_layout,
    max_common_vector,
    logical_product,
    zipped_product,
    tiled_product,
    flat_product,
    raked_product,
    blocked_product,
    flat_divide,
    logical_divide,
    zipped_divide,
    tiled_divide,
    local_partition,
    local_tile,
    printf,
    print_tensor,
    # tiled mma/tiled copy
    make_mma_atom,
    make_tiled_mma,
    make_copy_atom,
    make_tiled_copy_tv,
    make_tiled_copy,
    make_tiled_copy_S,
    make_tiled_copy_D,
    make_tiled_copy_C_atom,
    basic_copy,
    basic_copy_if,
    autovec_copy,
    copy,
    gemm,
    # Wrapper classes
    ComposedLayout,
    Swizzle,
    E,
    Atom,
    MmaAtom,
    CopyAtom,
    TiledCopy,
    TiledMma,
    TensorSSA,
    ReductionOp,
    full,
    full_like,
    empty_like,
    ones_like,
    zeros_like,
    where,
    any_,
    all_,
    # User defined struct
    struct,
    pretty_str,
    make_layout_image_mask,
    repeat_like,
    round_up,
    is_congruent,
    is_weakly_congruent,
    ScaledBasis,
    get_divisibility,
    Ratio,
)

# Submodules re-exported as part of the cute namespace.
from . import arch
from . import nvgpu
from . import testing
from . import runtime

# Re-export every math op at top level (usable without the "math." prefix).
from .math import *

# Internal-only symbol.
from .. import cutlass_dsl as _dsl
from .. import cutlass_dsl as _dsl

# Aliases
jit = _dsl.CuTeDSL.jit
kernel = _dsl.CuTeDSL.kernel
register_jit_arg_adapter = _dsl.JitArgAdapterRegistry.register_jit_arg_adapter
compile = _dsl.compile  # NOTE: shadows the builtin `compile` inside this package

# Explicitly export all symbols for documentation generation.
# NOTE(review): added previously missing-but-imported names so the list matches
# the `from .core import (...)` block above: unflatten, make_swizzle,
# transform_leaf, make_tiled_copy_S/D, ReductionOp, get_divisibility, Ratio.
__all__ = [
    # Core types
    "AddressSpace",
    "Tensor",
    "Layout",
    "ComposedLayout",
    "Swizzle",
    "E",
    "Atom",
    "MmaAtom",
    "CopyAtom",
    "TiledCopy",
    "TiledMma",
    "TensorSSA",
    "ReductionOp",
    # Basic utility functions
    "assume",
    "is_integer",
    "is_int_tuple",
    "is_static",
    "size",
    "has_underscore",
    "slice_",
    "depth",
    "rank",
    "shape",
    "printf",
    "print_tensor",
    "pretty_str",
    # Layout functions
    "make_layout",
    "recast_layout",
    "make_identity_layout",
    "make_ordered_layout",
    "make_composed_layout",
    "make_layout_tv",
    "make_layout_image_mask",
    "make_swizzle",
    # Tensor functions
    "make_ptr",
    "make_tensor",
    "make_identity_tensor",
    "make_fragment",
    "make_fragment_like",
    "recast_ptr",
    "recast_tensor",
    # Tensor manipulation
    "get",
    "select",
    "front",
    "is_major",
    "find",
    "coalesce",
    "group_modes",
    "cosize",
    "size_in_bytes",
    # Tuple operations
    "flatten_to_tuple",
    "flatten",
    "unflatten",
    "product",
    "product_like",
    "product_each",
    "prepend",
    "append",
    "prepend_ones",
    "append_ones",
    # Math operations
    "ceil_div",
    "round_up",
    # Layout operations
    "slice_and_offset",
    "crd2idx",
    "domain_offset",
    "elem_less",
    "transform_leaf",
    "filter_zeros",
    "filter",
    "tile_to_shape",
    "shape_div",
    "dice",
    # Layout algebra
    "composition",
    "complement",
    "right_inverse",
    "left_inverse",
    "max_common_layout",
    "max_common_vector",
    "is_congruent",
    "is_weakly_congruent",
    # Product operations
    "logical_product",
    "zipped_product",
    "tiled_product",
    "flat_product",
    "raked_product",
    "blocked_product",
    # Division operations
    "flat_divide",
    "logical_divide",
    "zipped_divide",
    "tiled_divide",
    "local_partition",
    "local_tile",
    # MMA and Copy operations
    "make_mma_atom",
    "make_tiled_mma",
    "make_copy_atom",
    "make_tiled_copy_tv",
    "make_tiled_copy",
    "make_tiled_copy_S",
    "make_tiled_copy_D",
    "make_tiled_copy_C_atom",
    "basic_copy",
    "basic_copy_if",
    "autovec_copy",
    "copy",
    "gemm",
    # Tensor creation
    "full",
    "full_like",
    "empty_like",
    "ones_like",
    "zeros_like",
    "where",
    "any_",
    "all_",
    "repeat_like",
    "ScaledBasis",
    "get_divisibility",
    "Ratio",
    # User defined struct
    "struct",
    # Modules
    "arch",
    "nvgpu",
    "testing",
    "runtime",
    # Decorators and code generation
    "jit",
    "kernel",
    "register_jit_arg_adapter",
    "compile",
]

# ===========================================================================
# diff --git a/python/CuTeDSL/cutlass/cute/arch/__init__.py (new file, index 5114b97f)
# ===========================================================================
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.
+ +from .elect import * +from .mbar import * +from .nvvm_wrappers import * +from .smem import * +from .tmem import * + +# __all__ is required here for documentation generation +__all__ = [ + # + # elect.py + # + "make_warp_uniform", + "elect_one", + # + # mbar.py + # + "mbarrier_init_arrive_cnt", + "mbarrier_init_fence", + "mbarrier_init_tx_bytes", + "mbarrier_wait", + "mbarrier_try_wait", + "conditional_mbarrier_try_wait", + "mbarrier_arrive", + # + # nvvm_wrappers.py + # + "lane_idx", + "warp_idx", + "thread_idx", + "block_dim", + "block_idx", + "grid_dim", + "cluster_idx", + "cluster_dim", + "block_in_cluster_idx", + "block_in_cluster_dim", + "block_idx_in_cluster", + "shuffle_sync", + "shuffle_sync_up", + "shuffle_sync_down", + "shuffle_sync_bfly", + "barrier", + "sync_threads", + "sync_warp", + "fence_acq_rel_cta", + "fence_acq_rel_cluster", + "fence_acq_rel_gpu", + "fence_acq_rel_sys", + "cp_async_commit_group", + "cp_async_wait_group", + "cp_async_bulk_commit_group", + "cp_async_bulk_wait_group", + "cluster_wait", + "cluster_arrive", + "cluster_arrive_relaxed", + "fence_proxy", + "vote_ballot_sync", + "popc", + "fence_view_async_tmem_load", + "fence_view_async_tmem_store", + "warpgroup_reg_alloc", + "warpgroup_reg_dealloc", + "fma_packed_f32x2", + "mul_packed_f32x2", + "add_packed_f32x2", + "fmax", + "rcp_approx", + "exp2", + # Constants + "WARP_SIZE", + # Forward from auto-generated nvvm python + "ProxyKind", + "SharedSpace", + "RoundingModeKind", + # + # smem.py + # + "alloc_smem", + "get_dyn_smem", + # + # tmem.py + # + "retrieve_tmem_ptr", + "alloc_tmem", + "relinquish_tmem_alloc_permit", + "dealloc_tmem", +] diff --git a/python/CuTeDSL/cutlass/cute/arch/elect.py b/python/CuTeDSL/cutlass/cute/arch/elect.py new file mode 100644 index 00000000..fce82b13 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/arch/elect.py @@ -0,0 +1,75 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from cutlass.cutlass_dsl import CuTeDSL, T, dsl_user_op + +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir.dialects import nvvm, scf +from cutlass._mlir import ir + +from ..typing import Int, Int32 +from ...impl_utils import check_value_in + + +@dsl_user_op +def make_warp_uniform(value: Int, *, loc=None, ip=None) -> Int32: + """ + Creates a warp-uniform value from the given integer input. + + :param value: The integer to make warp uniform. + :type value: Int + :return: The warp-uniform value equal to the input. + :rtype: Int32 + """ + return Int32( + _cute_nvgpu_ir.arch_make_warp_uniform( + Int32(value).ir_value(loc=loc, ip=ip), loc=loc, ip=ip + ) + ) + + +class IfOpRegion: + """ + A context manager for if Op. + Automatically inserts `scf.yield([])` when exiting the context. + """ + + def __init__(self, block, *, loc=None, ip=None): + self.block = block + self.insert_point = ir.InsertionPoint(self.block) + self.loc = loc + self.ip = ip + + def __enter__(self): + self.insert_point.__enter__() + return self.block.arguments + + def __exit__(self, exc_type, exc_value, traceback): + scf.yield_([], loc=self.loc, ip=self.ip) + self.insert_point.__exit__(exc_type, exc_value, traceback) + + +@dsl_user_op +def elect_one(*, loc=None, ip=None) -> IfOpRegion: + """ + Elects one thread within a warp. + + .. 
code-block:: python + + with elect_one(): + # Only one thread in the warp executes the code in this context + pass + """ + arch = CuTeDSL._get_dsl().envar.arch + check_value_in(arch, ["sm_90", "sm_90a", "sm_100a"], "arch") + is_thread_leader = nvvm.elect_sync(T.bool()) + if_op = scf.IfOp(is_thread_leader, loc=loc, ip=ip) + return IfOpRegion(if_op.then_block, loc=loc, ip=ip) diff --git a/python/CuTeDSL/cutlass/cute/arch/mbar.py b/python/CuTeDSL/cutlass/cute/arch/mbar.py new file mode 100644 index 00000000..b4dc3725 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/arch/mbar.py @@ -0,0 +1,208 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from cutlass.cutlass_dsl import CuTeDSL, T, if_generate, dsl_user_op + +from cutlass._mlir.dialects import nvvm +from cutlass._mlir import ir + +from ..typing import Pointer, Int, Boolean, Int32 +from ...impl_utils import check_value_in + + +#################################################################################################### +# +# Mbarrier management utilities +# +#################################################################################################### + + +@dsl_user_op +def mbarrier_init_arrive_cnt(mbar_ptr: Pointer, cnt: Int, *, loc=None, ip=None) -> None: + """ + Initializes a mbarrier with the specified thread arrival count. 
+ + :param mbar_ptr: A pointer to the mbarrier in SMEM + :type mbar_ptr: Pointer + :param cnt: The arrival count of the mbarrier + :type cnt: Int + """ + nvvm.mbarrier_init_shared( + mbar_ptr.llvm_ptr, Int32(cnt).ir_value(loc=loc, ip=ip), loc=loc, ip=ip + ) + + +@dsl_user_op +def mbarrier_init_fence(*, loc=None, ip=None) -> None: + """ + A fence operation that applies to the mbarrier initializations. + """ + arch = CuTeDSL._get_dsl().envar.arch + check_value_in(arch, ["sm_90", "sm_90a", "sm_100a"], "arch") + nvvm.fence_mbarrier_init(loc=loc, ip=ip) + + +@dsl_user_op +def mbarrier_init_tx_bytes( + mbar_ptr: Pointer, bytes: Int, peer_cta_rank_in_cluster=None, *, loc=None, ip=None +) -> None: + """ + Initializes a mbarrier with the specified number of transaction bytes. + + :param mbar_ptr: A pointer to the mbarrier in SMEM + :type mbar_ptr: Pointer + :param bytes: The number of transaction bytes + :type bytes: Int + :param peer_cta_rank_in_cluster: An optional CTA rank in cluster. If provided, the pointer to + the mbarrier is converted to a remote address in the peer CTA's + SMEM. + """ + arch = CuTeDSL._get_dsl().envar.arch + check_value_in(arch, ["sm_90", "sm_90a", "sm_100a"], "arch") + + mbar_llvm_ptr = mbar_ptr.llvm_ptr + if peer_cta_rank_in_cluster is not None: + mbar_llvm_ptr = nvvm.mapa_shared_cluster( + mbar_llvm_ptr.type, + mbar_llvm_ptr, + Int32(peer_cta_rank_in_cluster).ir_value(loc=loc, ip=ip), + loc=loc, + ip=ip, + ) + space = nvvm.MBarrierSpaceKind.CLUSTER + else: + space = nvvm.MBarrierSpaceKind.CTA + + nvvm.mbarrier_txn( + mbar_llvm_ptr, + Int32(bytes).ir_value(loc=loc, ip=ip), + kind=nvvm.MBarrierTxnKind.ARRIVE_EXPECT_TX, + space=space, + loc=loc, + ip=ip, + ) + + +@dsl_user_op +def mbarrier_wait(mbar_ptr: Pointer, phase: Int, *, loc=None, ip=None) -> None: + """ + Waits on a mbarrier with a specified phase. 
+ + :param mbar_ptr: A pointer to the mbarrier in SMEM + :type mbar_ptr: Pointer + :param phase: The phase to wait for (either 0 or 1) + :type phase: Int + """ + arch = CuTeDSL._get_dsl().envar.arch + check_value_in(arch, ["sm_90", "sm_90a", "sm_100a"], "arch") + + timeout_ns = 10000000 + # This NVVM Op is a spin-loop wrapping the mbarrier.try_wait.parity.shared.b64 PTX + # The timeout in ns only applies to the latter and this call is truly blocking + nvvm.mbarrier_try_wait_parity_shared( + mbar_ptr.llvm_ptr, + Int32(phase).ir_value(loc=loc, ip=ip), + Int32(timeout_ns).ir_value(loc=loc, ip=ip), + loc=loc, + ip=ip, + ) + + +@dsl_user_op +def mbarrier_try_wait(mbar_ptr: Pointer, phase: Int, *, loc=None, ip=None) -> Boolean: + """ + Attempts to wait on a mbarrier with a specified phase in a non-blocking fashion. + + :param mbar_ptr: A pointer to the mbarrier in SMEM + :type mbar_ptr: Pointer + :param phase: The phase to wait for (either 0 or 1) + :type phase: Int + :return: A boolean value indicating whether the wait operation was successful + :rtype: Boolean + """ + arch = CuTeDSL._get_dsl().envar.arch + check_value_in(arch, ["sm_90", "sm_90a", "sm_100a"], "arch") + + return Boolean( + nvvm.mbarrier_wait_parity( + T.bool(), + mbar_ptr.llvm_ptr, + Int32(phase).ir_value(loc=loc, ip=ip), + nvvm.MBarrierWaitKind.TRY, + loc=loc, + ip=ip, + ) + ) + + +@dsl_user_op +def conditional_mbarrier_try_wait( + cond, mbar_ptr: Pointer, phase: Int, *, loc=None, ip=None +) -> Boolean: + """ + Conditionally attempts to wait on a mbarrier with a specified phase in a non-blocking fashion. 
+ + :param cond: A boolean predicate + :param mbar_ptr: A pointer to the mbarrier in SMEM + :type mbar_ptr: Pointer + :param phase: The phase to wait for (either 0 or 1) + :type phase: Int + :return: A boolean value indicating whether the wait operation was successful + :rtype: Boolean + """ + arch = CuTeDSL._get_dsl().envar.arch + check_value_in(arch, ["sm_90", "sm_90a", "sm_100a"], "arch") + return if_generate( + cond, + lambda: mbarrier_try_wait(mbar_ptr, phase, loc=loc, ip=ip), + lambda: Boolean(True).ir_value(loc=loc, ip=ip), + None, + [Boolean], + ) + + +@dsl_user_op +def mbarrier_arrive( + mbar_ptr: Pointer, peer_cta_rank_in_cluster: Int = None, *, loc=None, ip=None +) -> None: + """ + Arrives on an mbarrier. + + :param mbar_ptr: A pointer to the mbarrier in SMEM + :type mbar_ptr: Pointer + :param peer_cta_rank_in_cluster: An optional CTA rank in cluster. If provided, the pointer to + the mbarrier is converted to a remote address in the peer CTA's + SMEM. + """ + mbar_llvm_ptr = mbar_ptr.llvm_ptr + if peer_cta_rank_in_cluster is not None: + arch = CuTeDSL._get_dsl().envar.arch + check_value_in(arch, ["sm_90", "sm_90a", "sm_100a"], "arch") + + mbar_llvm_ptr = nvvm.mapa_shared_cluster( + mbar_llvm_ptr.type, + mbar_llvm_ptr, + Int32(peer_cta_rank_in_cluster).ir_value(loc=loc, ip=ip), + loc=loc, + ip=ip, + ) + space = nvvm.MBarrierSpaceKind.CLUSTER + else: + space = nvvm.MBarrierSpaceKind.CTA + + nvvm.mbarrier_txn( + mbar_llvm_ptr, + Int32(1).ir_value(loc=loc, ip=ip), + kind=nvvm.MBarrierTxnKind.ARRIVE, + space=space, + loc=loc, + ip=ip, + ) diff --git a/python/CuTeDSL/cutlass/cute/arch/nvvm_wrappers.py b/python/CuTeDSL/cutlass/cute/arch/nvvm_wrappers.py new file mode 100644 index 00000000..03d83c26 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/arch/nvvm_wrappers.py @@ -0,0 +1,547 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from functools import partial +from typing import Optional, Tuple, Union, Callable + +from cutlass.cutlass_dsl import T, dsl_user_op + +from cutlass._mlir import ir +from cutlass._mlir.dialects import llvm, nvvm, vector + +# Forward nvvm enums +from cutlass._mlir.dialects.nvvm import ( + ProxyKind, + SharedSpace, + Tcgen05WaitKind, + SetMaxRegisterAction, + RoundingModeKind, +) + +from ..typing import Int, Boolean, Int32, Float32, Numeric, as_numeric + +WARP_SIZE = 32 +FULL_MASK = 0xFFFFFFFF + + +@dsl_user_op +def lane_idx(*, loc=None, ip=None) -> Int32: + """ + Returns the lane index of the current thread within the warp. + """ + return Int32(nvvm.read_ptx_sreg_laneid(T.i32(), loc=loc, ip=ip)) + + +@dsl_user_op +def warp_idx(*, loc=None, ip=None) -> Int32: + """ + Returns the warp index within a CTA. + """ + warp_size = 32 + tid_x = Int32(nvvm.read_ptx_sreg_tid_x(T.i32(), loc=loc, ip=ip)) + tid_y = Int32(nvvm.read_ptx_sreg_tid_y(T.i32(), loc=loc, ip=ip)) + tid_z = Int32(nvvm.read_ptx_sreg_tid_z(T.i32(), loc=loc, ip=ip)) + ntid_x = Int32(nvvm.read_ptx_sreg_ntid_x(T.i32(), loc=loc, ip=ip)) + ntid_y = Int32(nvvm.read_ptx_sreg_ntid_y(T.i32(), loc=loc, ip=ip)) + tid = tid_x + tid_y * ntid_x + tid_z * ntid_x * ntid_y + return tid // warp_size + + +@dsl_user_op +def thread_idx(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]: + """ + Returns the thread index within a CTA. 
+ """ + return ( + Int32(nvvm.read_ptx_sreg_tid_x(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_tid_y(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_tid_z(T.i32(), loc=loc, ip=ip)), + ) + + +@dsl_user_op +def block_dim(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]: + """ + Returns the number of threads in each dimension of the CTA. + """ + return ( + Int32(nvvm.read_ptx_sreg_ntid_x(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_ntid_y(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_ntid_z(T.i32(), loc=loc, ip=ip)), + ) + + +@dsl_user_op +def block_idx(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]: + """ + Returns the CTA identifier within a grid. + """ + return ( + Int32(nvvm.read_ptx_sreg_ctaid_x(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_ctaid_y(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_ctaid_z(T.i32(), loc=loc, ip=ip)), + ) + + +@dsl_user_op +def grid_dim(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]: + """ + Returns the number of CTAs in each dimension of the grid. + """ + return ( + Int32(nvvm.read_ptx_sreg_nctaid_x(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_nctaid_y(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_nctaid_z(T.i32(), loc=loc, ip=ip)), + ) + + +@dsl_user_op +def cluster_idx(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]: + """ + Returns the cluster identifier within a grid. + """ + return ( + Int32(nvvm.read_ptx_sreg_clusterid_x(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_clusterid_y(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_clusterid_z(T.i32(), loc=loc, ip=ip)), + ) + + +@dsl_user_op +def cluster_dim(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]: + """ + Returns the number of clusters in each dimension of the grid. 
+ """ + return ( + Int32(nvvm.read_ptx_sreg_nclusterid_x(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_nclusterid_y(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_nclusterid_z(T.i32(), loc=loc, ip=ip)), + ) + + +@dsl_user_op +def block_in_cluster_idx(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]: + """ + Returns the CTA index within a cluster across all dimensions. + """ + return ( + Int32(nvvm.read_ptx_sreg_cluster_ctaid_x(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_cluster_ctaid_y(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_cluster_ctaid_z(T.i32(), loc=loc, ip=ip)), + ) + + +@dsl_user_op +def block_in_cluster_dim(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]: + """ + Returns the dimensions of the cluster. + """ + return ( + Int32(nvvm.read_ptx_sreg_cluster_nctaid_x(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_cluster_nctaid_y(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_cluster_nctaid_z(T.i32(), loc=loc, ip=ip)), + ) + + +@dsl_user_op +def block_idx_in_cluster(*, loc=None, ip=None) -> Int32: + """ + Returns the linearized identifier of the CTA within the cluster. + """ + return Int32(nvvm.read_ptx_sreg_cluster_ctarank(T.i32(), loc=loc, ip=ip)) + + +@dsl_user_op +def shuffle_sync_op( + value: Numeric, + offset: Int, + mask: Int = FULL_MASK, + mask_and_clamp: Int = WARP_SIZE - 1, + kind: nvvm.ShflKind = nvvm.ShflKind.idx, + *, + loc=None, + ip=None, +) -> Numeric: + """ + Shuffles a value within the threads of a warp. + + :param value: The value to shuffle + :type value: Numeric + :param mask: A mask describing the threads participating in this operation + :type mask: Int + :param offset: A source lane or a source lane offset depending on kind + :type offset: Int + :param mask_and_clamp: An integer containing two packed values specifying a mask for logically + splitting warps into sub-segments and an upper bound for clamping the + source lane index. 
+ :type mask_and_clamp: Int + :param kind: The kind of shuffle, can be idx, up, down, or bfly + :type kind: ShflKind + :return: The shuffled value + :rtype: Numeric + """ + if not isinstance(value, Numeric): + value = as_numeric(value) + return type(value)( + nvvm.shfl_sync( + type(value).mlir_type, + Int32(mask).ir_value(loc=loc, ip=ip), + value.ir_value(loc=loc, ip=ip), + Int32(offset).ir_value(loc=loc, ip=ip), + Int32(mask_and_clamp).ir_value(loc=loc, ip=ip), + kind, + loc=loc, + ip=ip, + ) + ) + + +shuffle_sync = partial(shuffle_sync_op, kind=nvvm.ShflKind.idx) +shuffle_sync_up = partial(shuffle_sync_op, kind=nvvm.ShflKind.up) +shuffle_sync_down = partial(shuffle_sync_op, kind=nvvm.ShflKind.down) +shuffle_sync_bfly = partial(shuffle_sync_op, kind=nvvm.ShflKind.bfly) + + +@dsl_user_op +def barrier(*, barrier_id=None, number_of_threads=None, loc=None, ip=None) -> None: + """ + Creates a barrier, optionally named. + """ + if barrier_id is not None: + barrier_id = Int32(barrier_id).ir_value(loc=loc, ip=ip) + + if number_of_threads is not None: + number_of_threads = Int32(number_of_threads).ir_value(loc=loc, ip=ip) + + nvvm.barrier( + barrier_id=barrier_id, number_of_threads=number_of_threads, loc=loc, ip=ip + ) + +@dsl_user_op +def sync_threads(*, loc=None, ip=None) -> None: + """ + Synchronizes all threads within a CTA. + """ + nvvm.barrier(loc=loc, ip=ip) + + +@dsl_user_op +def sync_warp(mask: Int = FULL_MASK, *, loc=None, ip=None) -> None: + """ + Performs a warp-wide sync with an optional mask. + """ + nvvm.bar_warp_sync(Int32(mask).ir_value(loc=loc, ip=ip), loc=loc, ip=ip) + + +@dsl_user_op +def fence_acq_rel_cta(*, loc=None, ip=None) -> None: + """ + Fence operation with acquire-release semantics. + + See the `PTX documentation `__. + """ + nvvm.fence_acq_rel_cta(loc=loc, ip=ip) + + +@dsl_user_op +def fence_acq_rel_cluster(*, loc=None, ip=None) -> None: + """ + Fence operation with acquire-release semantics. + + See the `PTX documentation `__. 
+ """ + nvvm.fence_acq_rel_cluster(loc=loc, ip=ip) + + +@dsl_user_op +def fence_acq_rel_gpu(*, loc=None, ip=None) -> None: + """ + Fence operation with acquire-release semantics. + + See the `PTX documentation `__. + """ + nvvm.fence_acq_rel_gpu(loc=loc, ip=ip) + + +@dsl_user_op +def fence_acq_rel_sys(*, loc=None, ip=None) -> None: + """ + Fence operation with acquire-release semantics. + + See the `PTX documentation `__. + """ + nvvm.fence_acq_rel_sys(loc=loc, ip=ip) + + +@dsl_user_op +def cp_async_commit_group(*, loc=None, ip=None) -> None: + """ + Commits all prior initiated but uncommitted cp.async instructions. + + See the `PTX documentation `__. + """ + nvvm.cp_async_commit_group(loc=loc, ip=ip) + + +@dsl_user_op +def cp_async_wait_group(n, *, loc=None, ip=None) -> None: + """ + Waits till only a specified numbers of cp.async groups are pending. + + See the `PTX documentation `__. + """ + nvvm.cp_async_wait_group(n, loc=loc, ip=ip) + + +@dsl_user_op +def cp_async_bulk_commit_group(*, loc=None, ip=None) -> None: + """ + Commits all prior initiated but uncommitted cp.async.bulk instructions. + + See the `PTX documentation `__. + """ + nvvm.cp_async_bulk_commit_group(loc=loc, ip=ip) + + +@dsl_user_op +def cp_async_bulk_wait_group(group, *, read=None, loc=None, ip=None) -> None: + """ + Waits till only a specified numbers of cp.async.bulk groups are pending. + + See the `PTX documentation `__. + """ + nvvm.cp_async_bulk_wait_group(group, read=read, loc=loc, ip=ip) + + +@dsl_user_op +def cluster_wait(*, loc=None, ip=None) -> None: + """ + A cluster-wide wait operation. + """ + nvvm.cluster_wait(loc=loc, ip=ip) + + +@dsl_user_op +def cluster_arrive(*, aligned=None, loc=None, ip=None) -> None: + """ + A cluster-wide arrive operation. + """ + nvvm.cluster_arrive(aligned=aligned, loc=loc, ip=ip) + + +@dsl_user_op +def cluster_arrive_relaxed(*, aligned=None, loc=None, ip=None) -> None: + """ + A cluster-wide arrive operation with relaxed semantics. 
+ """ + nvvm.cluster_arrive_relaxed(aligned=aligned, loc=loc, ip=ip) + + +@dsl_user_op +def fence_proxy( + kind: ProxyKind, + *, + space: Optional[SharedSpace] = None, + use_intrinsic=None, + loc=None, + ip=None, +) -> None: + nvvm.fence_proxy( + kind=kind, space=space, use_intrinsic=use_intrinsic, loc=loc, ip=ip + ) + + +@dsl_user_op +def vote_ballot_sync( + pred: Boolean, mask: Int = FULL_MASK, *, loc=None, ip=None +) -> Int32: + """ + Performs a ballot operation across the warp. + """ + return Int32( + nvvm.vote_ballot_sync( + T.i32(), + Int32(mask).ir_value(loc=loc, ip=ip), + Boolean(pred).ir_value(loc=loc, ip=ip), + loc=loc, + ip=ip, + ) + ) + + +@dsl_user_op +def popc(value: Numeric, *, loc=None, ip=None) -> Numeric: + """ + Performs a population count operation. + """ + if not isinstance(value, Numeric): + value = as_numeric(value) + return type(value)(llvm.intr_ctpop(value.ir_value(), loc=loc, ip=ip)) + + +@dsl_user_op +def fence_view_async_tmem_op( + kind: Tcgen05WaitKind, + *, + loc=None, + ip=None, +) -> None: + """ + Perform a fence operation on the async TMEM load or store. + + .. note:: + This function is only available on sm_100a and above. + The fence is required to synchronize the TMEM load/store + and let the pipeline release or commit the buffer. + + Take a mma2acc pipeline as an example of LOAD fence, the ACC tensor is from TMEM. + ``` + # Start to copy ACC from TMEM to register + cute.copy(tmem_load, tACC, rACC) + fence_view_async_tmem_load() + # After fence, we can ensure the TMEM buffer is consumed totally. + # Release the buffer to let the MMA know it can overwrite the buffer. + mma2accum_pipeline.consumer_release(curr_consumer_state) + ``` + Take a TS GEMM kernel as an example of STORE fence, the A tensor is from TMEM. + ``` + # Start to copy A from register to TMEM + cute.copy(tmem_store, rA, tA) + fence_view_async_tmem_store() + # After fence, we can ensure the TMEM buffer is ready. 
+ # Commit the buffer to let the MMA know it can start to load A. + tmem_mma_pipeline.producer_commit(curr_producer_state) + ``` + + + :param kind: The kind of fence operation to perform including LOAD and STORE. + :type kind: Tcgen05WaitKind + """ + nvvm.tcgen05_wait(kind, loc=loc, ip=ip) + + +fence_view_async_tmem_load = partial( + fence_view_async_tmem_op, kind=Tcgen05WaitKind.LOAD +) +fence_view_async_tmem_store = partial( + fence_view_async_tmem_op, kind=Tcgen05WaitKind.STORE +) + + +@dsl_user_op +def warpgroup_reg_realloc_op( + reg_count: int, + kind: SetMaxRegisterAction, + *, + loc=None, + ip=None, +) -> None: + nvvm.setmaxregister(reg_count, kind, loc=loc, ip=ip) + + +warpgroup_reg_alloc = partial( + warpgroup_reg_realloc_op, kind=SetMaxRegisterAction.increase +) +warpgroup_reg_dealloc = partial( + warpgroup_reg_realloc_op, kind=SetMaxRegisterAction.decrease +) + + +@dsl_user_op +def calc_packed_f32x2_op( + src_a: Tuple[Float32, Float32], + src_b: Tuple[Float32, Float32], + src_c: Tuple[Float32, Float32] | None, + calc_func: Callable, + *, + rnd=RoundingModeKind.RZ, + ftz=True, + loc=None, + ip=None, +) -> Tuple[Float32, Float32]: + vec_type = ir.VectorType.get([2], Float32.mlir_type, loc=loc) + vec_src_a = vector.from_elements( + vec_type, tuple(as_numeric(a).ir_value() for a in src_a), loc=loc, ip=ip + ) + vec_src_b = vector.from_elements( + vec_type, tuple(as_numeric(b).ir_value() for b in src_b), loc=loc, ip=ip + ) + if src_c is not None: + vec_src_c = vector.from_elements( + vec_type, tuple(as_numeric(c).ir_value() for c in src_c), loc=loc, ip=ip + ) + vec_res = calc_func( + vec_type, vec_src_a, vec_src_b, vec_src_c, rnd=rnd, ftz=ftz, loc=loc, ip=ip + ) + else: + vec_res = calc_func( + vec_type, vec_src_a, vec_src_b, rnd=rnd, ftz=ftz, loc=loc, ip=ip + ) + + res0 = Float32( + vector.extract( + vec_res, dynamic_position=[], static_position=[0], loc=loc, ip=ip + ) + ) + res1 = Float32( + vector.extract( + vec_res, dynamic_position=[], 
static_position=[1], loc=loc, ip=ip + ) + ) + return res0, res1 + + +fma_packed_f32x2 = partial(calc_packed_f32x2_op, calc_func=nvvm.fma_packed_f32x2) +mul_packed_f32x2 = partial( + calc_packed_f32x2_op, src_c=None, calc_func=nvvm.mul_packed_f32x2 +) +add_packed_f32x2 = partial( + calc_packed_f32x2_op, src_c=None, calc_func=nvvm.add_packed_f32x2 +) + + +@dsl_user_op +def fmax( + a: Union[float, Float32], b: Union[float, Float32], *, loc=None, ip=None +) -> Float32: + return Float32( + nvvm.fmax( + T.f32(), + Float32(a).ir_value(loc=loc, ip=ip), + Float32(b).ir_value(loc=loc, ip=ip), + loc=loc, + ip=ip, + ) + ) + + +@dsl_user_op +def rcp_approx(a: Union[float, Float32], *, loc=None, ip=None): + return Float32( + nvvm.rcp_approx_ftz_f( + T.f32(), Float32(a).ir_value(loc=loc, ip=ip), loc=loc, ip=ip + ) + ) + + +@dsl_user_op +def exp2(a: Union[float, Float32], *, loc=None, ip=None) -> Float32: + return Float32( + llvm.inline_asm( + T.f32(), + [Float32(a).ir_value(loc=loc, ip=ip)], + "ex2.approx.ftz.f32 $0, $1;", + "=f,f", + has_side_effects=True, + is_align_stack=False, + asm_dialect=llvm.AsmDialect.AD_ATT, + ) + ) diff --git a/python/CuTeDSL/cutlass/cute/arch/smem.py b/python/CuTeDSL/cutlass/cute/arch/smem.py new file mode 100644 index 00000000..4e5dee7b --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/arch/smem.py @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +from typing import Optional, Type + +from cutlass.cutlass_dsl import T, dsl_user_op + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir import ir + +from ..typing import Pointer, Numeric, NumericMeta + + +@dsl_user_op +def alloc_smem( + element_type: Type[Numeric], + size_in_elems: int, + alignment: Optional[int] = None, + *, + loc=None, + ip=None, +) -> Pointer: + """ + Statically allocates SMEM. + + :param element_type: The pointee type of the pointer. + :type element_type: Type[Numeric] + :param size_in_elems: The size of the allocation in terms of number of elements of the + pointee type + :type size_in_elems: int + :param alignment: An optional pointer alignment for the allocation + :type alignment: int + :return: A pointer to the start of the allocation + :rtype: Pointer + """ + if not isinstance(element_type, NumericMeta): + raise TypeError( + f"element_type must be a type of Numeric, but got {element_type}" + ) + + if alignment is None: + # Default alignment based on the element type's width + alignment = element_type.width // 8 + ptr_ty = _cute_ir.PtrType.get( + element_type.mlir_type, _cute_ir.AddressSpace.smem, alignment + ) + return _cute_nvgpu_ir.arch_alloc_smem( + ptr=ptr_ty, + input=ir.IntegerAttr.get(T.i32(), size_in_elems), + loc=loc, + ip=ip, + ) + + +@dsl_user_op +def get_dyn_smem( + element_type: Type[Numeric], + alignment: Optional[int] = None, + *, + loc=None, + ip=None, +) -> Pointer: + """ + Retrieves a pointer to a dynamic SMEM allocation. + + :param element_type: The pointee type of the pointer. 
+ :type element_type: Type[Numeric] + :param alignment: An optional pointer alignment, the result pointer is offset appropriately + :type alignment: int + :return: A pointer to the start of the dynamic SMEM allocation with a correct + alignement + :rtype: Pointer + """ + if not isinstance(element_type, NumericMeta): + raise TypeError( + f"element_type must be a type of Numeric, but got {element_type}" + ) + + if alignment is None: + # Default alignment based on the element type's width + alignment = element_type.width // 8 + ptr_ty = _cute_ir.PtrType.get( + element_type.mlir_type, + _cute_ir.AddressSpace.smem, + alignment, + ) + return _cute_nvgpu_ir.arch_get_dyn_smem(ptr=ptr_ty, loc=loc, ip=ip) diff --git a/python/CuTeDSL/cutlass/cute/arch/tmem.py b/python/CuTeDSL/cutlass/cute/arch/tmem.py new file mode 100644 index 00000000..302616d2 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/arch/tmem.py @@ -0,0 +1,142 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +from typing import Type + +from cutlass.cutlass_dsl import dsl_user_op + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir + +from ..typing import Pointer, Int, Int32, Numeric, NumericMeta + + +SM100_TMEM_CAPACITY_COLUMNS = 512 +SM100_TMEM_MIN_ALLOC_COLUMNS = 32 + + +@dsl_user_op +def retrieve_tmem_ptr( + element_type: Type[Numeric], + alignment: int, + ptr_to_buffer_holding_addr: Pointer, + *, + loc=None, + ip=None, +) -> Pointer: + """ + Retrieves a pointer to TMEM with the provided element type and alignment. + + :param element_type: The pointee type of the pointer. + :type element_type: Type[Numeric] + :param alignment: The alignment of the result pointer + :type alignment: int + :param ptr_to_buffer_holding_addr: A pointer to a SMEM buffer holding the TMEM address of the + start of the allocation allocation + :type ptr_to_buffer_holding_addr: Pointer + :return: A pointer to TMEM + :rtype: Pointer + """ + if not isinstance(element_type, NumericMeta): + raise TypeError( + f"element_type must be a type of Numeric, but got {element_type}" + ) + + res_ty = _cute_ir.PtrType.get( + element_type.mlir_type, _cute_ir.AddressSpace.tmem, alignment + ) + return _cute_nvgpu_ir.arch_sm100_retrieve_tmem_ptr( + res_ty, ptr_to_buffer_holding_addr.value, loc=loc, ip=ip + ) + + +@dsl_user_op +def alloc_tmem( + num_columns: Int, + smem_ptr_to_write_address: Pointer, + is_two_cta=None, + *, + loc=None, + ip=None, +) -> None: + """ + Allocates TMEM. 
+ + :param num_columns: The number of TMEM columns to allocate + :type num_columns: Int + :param smem_ptr_to_write_address: A pointer to a SMEM buffer where the TMEM address is written + to + :type smem_ptr_to_write_address: Pointer + :param is_two_cta: Optional boolean parameter for 2-CTA MMAs + """ + if isinstance(num_columns, int): + if ( + num_columns < SM100_TMEM_MIN_ALLOC_COLUMNS + or num_columns > SM100_TMEM_CAPACITY_COLUMNS + or not (num_columns & (num_columns - 1) == 0) + ): + raise ValueError( + f"num_columns must be between 32 and 512, and must be pow of 2, but got {num_columns}" + ) + _cute_nvgpu_ir.arch_sm100_alloc_tmem( + Int32(num_columns).ir_value(loc=loc, ip=ip), + smem_ptr_to_write_address.value, + is_two_cta=is_two_cta, + loc=loc, + ip=ip, + ) + + +@dsl_user_op +def relinquish_tmem_alloc_permit(is_two_cta=None, *, loc=None, ip=None) -> None: + """ + Relinquishes the right to allocate TMEM so that other CTAs potentially in a different grid can + allocate. + """ + _cute_nvgpu_ir.arch_sm100_relinquish_tmem_alloc_permit( + is_two_cta=is_two_cta, loc=loc, ip=ip + ) + + +@dsl_user_op +def dealloc_tmem( + tmem_ptr: Pointer, + num_columns: Int, + is_two_cta=None, + *, + loc=None, + ip=None, +) -> None: + """ + Deallocates TMEM using the provided pointer and number of columns. 
+ + :param tmem_ptr: A pointer to the TMEM allocation to de-allocate + :type tmem_ptr: Pointer + :param num_columns: The number of columns in the TMEM allocation + :type num_columns: Int + :param is_two_cta: Optional boolean parameter for 2-CTA MMAs + """ + if isinstance(num_columns, int): + if ( + num_columns < SM100_TMEM_MIN_ALLOC_COLUMNS + or num_columns > SM100_TMEM_CAPACITY_COLUMNS + or not (num_columns & (num_columns - 1) == 0) + ): + raise ValueError( + f"num_columns must be between 32 and 512, and must be pow of 2, but got {num_columns}" + ) + _cute_nvgpu_ir.arch_sm100_dealloc_tmem( + tmem_ptr.value, + Int32(num_columns).ir_value(loc=loc, ip=ip), + is_two_cta=is_two_cta, + loc=loc, + ip=ip, + ) diff --git a/python/CuTeDSL/cutlass/cute/core.py b/python/CuTeDSL/cutlass/cute/core.py new file mode 100644 index 00000000..6af262cd --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/core.py @@ -0,0 +1,6417 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +import copy as py_copy +from dataclasses import dataclass +import math +import operator +from abc import ABC, abstractmethod +from functools import lru_cache, partial, reduce +from inspect import isclass +from itertools import chain +from typing import Iterable, overload, List, Tuple, Union, Type, Any, Dict, Optional +from enum import Enum, auto + +from cutlass.cutlass_dsl import ( + const, + T, + lru_cache_ir, + is_dynamic_expression, + for_generate, + yield_out, + if_generate, + extract_mlir_values, + new_from_mlir_values, + _binary_op_type_promote, + not_, + cutlass_arith, + dsl_user_op, +) + +from cutlass._mlir import ir +from cutlass._mlir.dialects import cute as _cute_ir +from cutlass._mlir.dialects.cute import ( + ScaledBasis as _ScaledBasis, + Ratio as _Ratio, +) + +from cutlass._mlir.dialects import cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir.dialects import llvm, builtin, vector, arith + +from .typing import ( + Numeric, + Integer, + NumericMeta, + Boolean, + Int32, + Int8, + Int16, + Int32, + Int64, + Float32, + TFloat32, + Int, + IntTuple, + Shape, + Stride, + Coord, + Layout, + Tile, + Tiler, + XTuple, + Tensor, + Pointer, + AddressSpace, + as_numeric, +) + + +#################################################################################################### +# +# Internal IntTuple helpers +# +#################################################################################################### + + +def _get_typed_value(x): + if isinstance(x, Integer): + return ( + x.value.get_typed_value() if isinstance(x.value, IntValue) else x.ir_value() + ) + else: + return x + + +def _pack_x(x, packer, op, *, loc=None, ip=None) -> ir.Value: + x = transform_leaf(_get_typed_value, x) + res_ty, dyn_elems = packer(x) + # <"0"> is deduced from type inference which should be removed for make_... 
operations + dyn_elems = [t for t in dyn_elems if not is_static(t)] + return op(res_ty, dyn_elems, loc=loc, ip=ip).result + + +def _pack_shape(shape: Shape, *, loc=None, ip=None) -> ir.Value: + return _pack_x(shape, _cute_ir.pack_shape, _cute_ir.MakeShapeOp, loc=loc, ip=ip) + + +def _pack_stride(stride: Stride, *, loc=None, ip=None) -> ir.Value: + # Convert basis elements to the base class before _pack_x + stride = transform_leaf( + lambda x: x.to(_cute_ir.ScaledBasis) if isinstance(x, ScaledBasis) else x, + stride, + ) + return _pack_x(stride, _cute_ir.pack_stride, _cute_ir.MakeStrideOp, loc=loc, ip=ip) + + +def _pack_coord(coord: Coord, *, loc=None, ip=None) -> ir.Value: + return _pack_x(coord, _cute_ir.pack_coord, _cute_ir.MakeCoordOp, loc=loc, ip=ip) + + +def _pack_int_tuple(int_tuple: IntTuple, *, loc=None, ip=None) -> ir.Value: + return _pack_x( + int_tuple, _cute_ir.pack_int_tuple, _cute_ir.MakeIntTupleOp, loc=loc, ip=ip + ) + + +def _pack_tile(tile: Tile, *, loc=None, ip=None) -> ir.Value: + def expand_leaves(tile) -> list: + leaves = [] + for e in tile: + if isinstance(e, _Layout): + leaves.extend(list(flatten_to_tuple(e.shape))) + leaves.extend(list(flatten_to_tuple(e.stride))) + else: + leaves.append(e) + return leaves + + layout_leaves = flatten_to_tuple(tile) + dyn_elems = expand_leaves(layout_leaves) + dyn_elems = [ + _get_typed_value(x) for x in dyn_elems if isinstance(x, (Integer, ir.Value)) + ] + + res_ty = _cute_ir.pack_tile(tile) + return _cute_ir.make_tile(res_ty, dyn_elems, loc=loc, ip=ip) + + +def _unpack_x_tuple(t: Union[ir.Type, ir.Value], *, loc=None, ip=None) -> XTuple: + # If t is an MLIR type, make sure it's static and make a Value + if isinstance(t, ir.Type): + if not _cute_ir.is_static(t): + raise ValueError() + t = _cute_ir.static(t) + + if isinstance(t, ir.Value): + input_ty = t.type + if t.type.rank == 0: + # Handle this case separately, _cute_ir.get_leaves will return an Op in this case + vals = [] + else: + vals = 
_cute_ir.get_leaves(t, loc=loc, ip=ip) + if not isinstance(vals, list): + vals = [vals] + else: + raise TypeError(f"expects static type or value, but got {t}") + + # CuTe IR only supports Int32 for now. Need to support detection of other types + res = _cute_ir.unpack_x_tuple(input_ty, vals) + + def post_process(x): + if isinstance(x, _cute_ir.ScaledBasis): + return ScaledBasis(post_process(x.get_value()), x.get_mode()) + elif isinstance(x, _cute_ir.Ratio): + return Ratio(x.numerator, x.denominator) + else: + return x + + return transform_leaf(post_process, res) + + +#################################################################################################### +# +# Core types +# +#################################################################################################### + + +class IntValue(cutlass_arith.ArithValue): + """Internal representation of constrained integer types with divisibility information. + + IntValue serves as a proxy for constrained integer types in the CuTe IR. Rather than + directly storing values of IntTupleType with depth=0, it stores the result of the + `cute.get_scalars` operation applied to such values. + + This class represents the following sequence of operations in the IR: + %0 = ... : (...) -> !cute.int_tuple<"?"> + %1 = cute.get_scalars(%0) : (!cute.int_tuple<"?">) -> i32 + + where the first operation produces a `cute.int_tuple<"?">` with depth=0 and rank=1. It + automatically emit `cute.get_scalars` and track it. 
+ + IntValue inherits behavior from ArithValue with the following extensions: + * Overloaded operations that accept IntTupleType values to propagate divisibility information + * Support for CuTe operations that utilize divisibility constraints + + API for interacting with IntValue: + * get_typed_value() - Returns the value as an IntTupleType + * get_divisibility() - Returns the divisibility constraint of the value + """ + + def __init__(self, v, signed=True): + # Cute Constrained Int Type is always signed + if isinstance(v, int): + v = _pack_int_tuple(v) + + if isinstance(v.type, _cute_ir.IntTupleType): + scalar_val = _cute_ir.get_scalars(v) + super().__init__(scalar_val, True) + else: + super().__init__(v, True) + + def get_typed_value(self): + if isinstance(self.type, ir.IntegerType): + def_op = self.owner.operation + if def_op.name == "cute.get_scalars": + return def_op.operands[0] + + assert not isinstance(self.type, _cute_ir.IntTupleType) + + return _pack_int_tuple(self) + + @property + def divisibility(self): + if isinstance(self.get_typed_value().type, _cute_ir.IntTupleType): + return self.get_typed_value().type.get_divisibility([0]) + else: + return 1 + + def __str__(self): + if self.divisibility == 1: + return f"?" 
+ else: + return f"?{{div={self.divisibility}}}" + + def __repr__(self): + parent_name = cutlass_arith.ArithValue.__name__ + return super().__str__().replace(parent_name, IntValue.__name__) + + def pretty_str(self): + return self.__str__() + + @staticmethod + def _binary_op(op): + def wrapper(self, other, **kwargs): + if isinstance(other, IntValue): + other_val = other.get_typed_value() + elif isinstance(other, ir.Value) and isinstance( + other.type, _cute_ir.IntTupleType + ): + other_val = other + elif isinstance(other, ir.Value) and isinstance(other.type, ir.IntegerType): + other = cutlass_arith.int_to_int(other, Int32, **kwargs) + other_val = _pack_int_tuple(other) + elif isinstance(other, (int, bool)): + other_val = _pack_int_tuple(int(other)) + else: + # Dispatch to `__rmul__` of `other` + return NotImplemented + + return IntValue(op(self, other_val, **kwargs)) + + return wrapper + + @dsl_user_op + @_binary_op + def __add__(self, other, *, loc=None, ip=None): + return _cute_ir.add_offset(self.get_typed_value(), other, loc=loc, ip=ip) + + @dsl_user_op + @_binary_op + def __sub__(self, other, *, loc=None, ip=None): + return _cute_ir.tuple_sub(self.get_typed_value(), other, loc=loc, ip=ip) + + @dsl_user_op + @_binary_op + def __mul__(self, other, *, loc=None, ip=None): + return _cute_ir.tuple_mul(self.get_typed_value(), other, loc=loc, ip=ip) + + @dsl_user_op + @_binary_op + def __floordiv__(self, other, *, loc=None, ip=None) -> "IntValue": + return _cute_ir.tuple_div(self.get_typed_value(), other, loc=loc, ip=ip) + + @dsl_user_op + @_binary_op + def __mod__(self, other, *, loc=None, ip=None) -> cutlass_arith.ArithValue: + return _cute_ir.tuple_mod(self.get_typed_value(), other, loc=loc, ip=ip) + + @dsl_user_op + @_binary_op + def __radd__(self, other, *, loc=None, ip=None) -> "IntValue": + return _cute_ir.add_offset(other, self.get_typed_value(), loc=loc, ip=ip) + + @dsl_user_op + @_binary_op + def __rsub__(self, other, *, loc=None, ip=None) -> "IntValue": + 
        return _cute_ir.tuple_sub(other, self.get_typed_value(), loc=loc, ip=ip)

    @dsl_user_op
    @_binary_op
    def __rmul__(self, other, *, loc=None, ip=None):
        return _cute_ir.tuple_mul(other, self.get_typed_value(), loc=loc, ip=ip)

    @dsl_user_op
    @_binary_op
    def __rfloordiv__(self, other, *, loc=None, ip=None) -> "IntValue":
        return _cute_ir.tuple_div(other, self.get_typed_value(), loc=loc, ip=ip)

    @dsl_user_op
    @_binary_op
    def __rmod__(self, other, *, loc=None, ip=None) -> "IntValue":
        return _cute_ir.tuple_mod(other, self.get_typed_value(), loc=loc, ip=ip)


class Ratio(_Ratio):
    """A class representing a rational number as a ratio of two integers.

    Ratio is used in CuTe to represent exact fractional values that arise in
    tensor layout operations, particularly in composition operations where
    divisibility conditions may not be satisfied.

    :param numerator: The numerator of the ratio
    :type numerator: int
    :param denominator: The denominator of the ratio
    :type denominator: int
    :raises TypeError: If numerator or denominator are not integers
    """

    def __init__(self, numerator: int, denominator: int):
        if not isinstance(numerator, int) or not isinstance(denominator, int):
            raise TypeError(
                f"numerator and denominator must be integers, but got {numerator} and {denominator}"
            )
        super().__init__(numerator, denominator)

    def is_integral(self) -> bool:
        """Check if the ratio represents an integer value.

        :return: True if the numerator is divisible by the denominator
        :rtype: bool
        """
        return super().is_integral()

    def reduced(self) -> "Ratio":
        """Return a new Ratio with the numerator and denominator reduced to lowest terms.

        :return: A new Ratio in reduced form
        :rtype: Ratio
        """
        # Re-wrap the base-class result so the reduced value is this subclass.
        res = super().reduced()
        return Ratio(res.numerator, res.denominator)

    def __mul__(self, other):
        """Multiply this ratio by another ratio or an integer.

        :param other: The value to multiply by
        :type other: Union[Ratio, int]
        :return: A new ratio representing the product
        :rtype: Ratio
        :raises TypeError: If other is not a Ratio or int
        """
        if isinstance(other, Ratio):
            return Ratio(
                self.numerator * other.numerator,
                self.denominator * other.denominator,
            )
        elif isinstance(other, int):
            return Ratio(self.numerator * other, self.denominator)
        else:
            raise TypeError(f"Cannot multiply Ratio with {type(other)}")

    def __rmul__(self, other):
        """Right multiplication operation.

        :param other: The value to multiply by
        :type other: Union[Ratio, int]
        :return: A new ratio representing the product
        :rtype: Ratio
        """
        # Multiplication is commutative here, so delegate to __mul__.
        return self.__mul__(other)

    def __str__(self):
        """String representation of the ratio.

        :return: String in the format "numerator/denominator"
        :rtype: str
        """
        return super().__str__()

    def to(self, dtype):
        """Convert the ratio to another type.

        :param dtype: The target type for conversion
        :type dtype: type
        :return: The ratio converted to the specified type
        :raises TypeError: If conversion to the specified type is not supported
        """
        if dtype is Ratio:
            return self
        elif dtype is float:
            # True division: may be inexact for non-integral ratios.
            return self.numerator / self.denominator
        elif dtype is int:
            # Floor division: truncates toward negative infinity.
            return self.numerator // self.denominator
        elif issubclass(dtype, _Ratio):
            return self
        else:
            raise TypeError(f"Cannot convert Ratio to {dtype}")


class ScaledBasis:
    """A class representing a scaled basis element in CuTe's layout algebra.

    ScaledBasis is used to represent elements in the layout algebra, particularly
    in the context of composition operations. It consists of a value (scale) and
    a mode that identifies the mode of the basis element.
    :param value: The scale value
    :type value: Union[int, Integer, Ratio, ir.Value]
    :param mode: The mode identifying the basis element
    :type mode: Union[int, List[int]]
    :raises TypeError: If mode is not an integer or list of integers

    **Examples**::

        # Create a scaled basis with integer scale and mode
        sb1 = ScaledBasis(2, 0)  # 2 * E(0)

        # Create a scaled basis with a Ratio scale
        sb2 = ScaledBasis(Ratio(1, 2), 1)  # (1/2) * E(1)

        # Create a scaled basis with a list of modes
        sb3 = ScaledBasis(4, [0, 1])  # 4 * E([0, 1])

        # Scaled basis elements are commonly used in layout strides
        layout = make_layout((4, 8), stride=(ScaledBasis(1, 0), ScaledBasis(1, 1)))

        # This creates a layout with strides (1@0, 1@1) representing
        # a coordinate system where each dimension has its own basis
    """

    def __init__(self, value, mode) -> None:
        # Normalize a scalar mode to the canonical list-of-ints form.
        if isinstance(mode, int):
            self._mode = [mode]
        else:
            if any(not isinstance(x, int) for x in mode):
                raise TypeError("Mode must be a list of integers")
            self._mode = mode

        self._value = value

    def is_static(self) -> bool:
        """Check if the value is statically known.

        :return: True if the value is not a dynamic expression
        :rtype: bool
        """
        return not is_dynamic_expression(self._value)

    def to(self, dtype):
        """Convert to another type.

        :param dtype: The target type for conversion
        :type dtype: type
        :return: The ScaledBasis converted to the specified type
        :raises TypeError: If conversion to the specified type is not supported
        """
        if dtype is ScaledBasis:
            return self
        elif dtype is _ScaledBasis:
            # Lower the scale for the underlying _ScaledBasis: Integer wrappers
            # become their IR value; Ratio and raw values pass through.
            if isinstance(self._value, Ratio):
                scale = self._value
            elif isinstance(self._value, Integer):
                scale = self._value.ir_value()
            else:
                scale = self._value

            # IntValue carries divisibility info; unwrap it to its typed value.
            if isinstance(scale, IntValue):
                return _ScaledBasis(scale.get_typed_value(), self._mode)
            else:
                return _ScaledBasis(scale, self._mode)
        else:
            raise TypeError(f"Cannot convert ScaledBasis to {dtype}")

    def __str__(self):
        return f"{self.to(_ScaledBasis).__str__()}"

    def __hash__(self):
        # Lists are unhashable, so a list-valued mode is hashed as a tuple.
        if isinstance(self.mode, list):
            return hash((self.value, tuple(self.mode)))
        else:
            return hash((self.value, self.mode))

    @property
    def value(self):
        """Get the scale value.

        :return: The scale value
        """
        return self._value

    @property
    def mode(self) -> List[int]:
        """Get the mode identifying the basis element.

        :return: The mode as a list of integers
        :rtype: List[int]
        """
        return self._mode

    def __eq__(self, other):
        # Equal iff both the scale value and the mode match; any non-ScaledBasis
        # compares unequal.
        if isinstance(other, ScaledBasis):
            return self.value == other.value and self.mode == other.mode
        else:
            return False

    def __rmul__(self, scale: Union[Int, ir.Value, Ratio]) -> "ScaledBasis":
        """Right multiplication by a scale factor.

        This operation is used in layout algebra to scale basis elements,
        which is essential for operations like composition and partitioning.
        :param scale: The scale factor
        :type scale: Union[Int, ir.Value, Ratio]
        :return: A new scaled basis element
        :rtype: ScaledBasis
        :raises TypeError: If scale is not of a supported type
        :raises NotImplementedError: If scaling a basis element with a ratio value
        """
        if not isinstance(scale, (int, Integer, Ratio, ir.Value)):
            raise TypeError(
                f"scale must be an integer or a ratio, but got {type(scale)}"
            )
        if isinstance(self.value, Ratio):
            raise NotImplementedError(
                "scaling a basis element having a ratio is not supported"
            )

        value = self.value

        if not isinstance(value, (Integer, Ratio, int, cutlass_arith.ArithValue)):
            raise TypeError(f"Don't support {type(value)} for ScaledBasis")

        # Lift to IntValue type to preserve type info as much as possible
        if isinstance(scale, cutlass_arith.ArithValue):
            scale = IntValue(_pack_int_tuple(cutlass_arith.int_to_int(scale, Int32)))

        if isinstance(value, cutlass_arith.ArithValue):
            value = IntValue(_pack_int_tuple(cutlass_arith.int_to_int(value, Int32)))
        elif isinstance(value, Integer):
            value = value.ir_value()

        return ScaledBasis(scale * value, self.mode)  # type: ignore


def E(mode: Union[int, List[int]]) -> ScaledBasis:
    """Create a unit ScaledBasis element with the specified mode.

    This function creates a ScaledBasis with value 1 and the given mode.
    The mode represents the coordinate axis or dimension in the layout.

    :param mode: The mode (dimension) for the basis element, either a single integer or a list of integers
    :type mode: Union[int, List[int]]
    :return: A ScaledBasis with value 1 and the specified mode
    :rtype: ScaledBasis
    :raises TypeError: If mode is not an integer or a list

    **Examples**::

        # Create a basis element for the first dimension (mode 0)
        e0 = E(0)

        # Create a basis element for the second dimension (mode 1)
        e1 = E(1)

        # Create a basis element for a hierarchical dimension
        e_hier = E([0, 1])
    """
    if isinstance(mode, int):
        mode = [mode]

    if not isinstance(mode, list):
        raise TypeError(f"expects a list, got {type(mode)}")

    # NOTE: an empty mode denotes the identity basis element and is returned as
    # the plain integer 1 rather than a ScaledBasis.
    if not mode:
        return 1

    return ScaledBasis(1, mode)


def get_divisibility(x: Union[int, Integer]) -> int:
    """Return the known divisibility constraint of ``x``.

    :param x: A Python int, an Integer wrapper, or a value carrying one
    :return: ``x`` itself for a static int, the tracked divisibility for an
        IntValue, and 1 (no constraint known) otherwise
    :rtype: int
    """
    if isinstance(x, int):
        return x

    if isinstance(x, Integer):
        x = x.value

    if isinstance(x, IntValue):
        return x.divisibility
    else:
        return 1


@ir.register_value_caster(_cute_ir.SwizzleType.get_static_typeid(), replace=True)
class Swizzle(ir.Value):
    """
    Swizzle is a transformation that permutes the elements of a layout.

    Swizzles are used to rearrange data elements to improve memory access patterns
    and computational efficiency.

    Swizzle is defined by three parameters:
    - MBase: The number of least-significant bits to keep constant
    - BBits: The number of bits in the mask
    - SShift: The distance to shift the mask

    The mask is applied to the least-significant bits of the layout.

    .. code-block::

        0bxxxxxxxxxxxxxxxYYYxxxxxxxZZZxxxx
                                      ^--^  MBase is the number of least-sig bits to keep constant
                       ^-^       ^-^        BBits is the number of bits in the mask
                       ^---------^          SShift is the distance to shift the YYY mask
                                            (pos shifts YYY to the right, neg shifts YYY to the left)

    e.g.
    Given
        0bxxxxxxxxxxxxxxxxYYxxxxxxxxxZZxxx
    the result is
        0bxxxxxxxxxxxxxxxxYYxxxxxxxxxAAxxx where AA = ZZ xor YY

    """

    def __str__(self):
        # Cut off the MLIR type's string for making pretty_str more concise
        return self.type.__str__()[15 : 15 + 8]


@ir.register_value_caster(_cute_ir.LayoutType.get_static_typeid(), replace=True)
class _Layout(Layout):
    """Layout is CuTe's core abstraction for representing tensor layouts.

    A Layout maps from a logical coordinate space to an index space, defined by a
    pair of (Shape, Stride). The Shape defines the abstract dimensions of the Layout,
    while the Stride defines how coordinates within the Shape map to linear indices.

    Layouts present a common interface to multidimensional array access that abstracts
    away the details of how array elements are organized in memory. This allows algorithms
    to be written generically, so that layouts can change without requiring code changes.

    CuTe layouts are inherently hierarchical, constructed from smaller, nested layouts
    that can represent complex mappings required by GPU tensor instructions. They support
    a rich algebra of operations including concatenation, coalescence, composition,
    complement, and inversion.

    :ivar shape: An IntTuple representing the dimensions of the layout.
    :ivar stride: An IntTuple representing the strides of the layout.
    :ivar max_alignment: The maximum alignment of the layout.

    **Examples**::

    .. code-block:: python

        # Creating a layout with shape (4,8) and default stride (layout left / "column major")
        layout = cute.make_layout((4, 8))

        # Creating a layout with explicit shape and stride
        layout = cute.make_layout((4, 8), stride=(8, 1))

        # Accessing a specific coordinate: (2, 3) -> 2 * 8 + 3 * 1 = 19
        idx = cute.crd2idx((2, 3), layout)
    """

    def __init__(self, op_result) -> None:
        """Initialize a Layout object.

        :param op_result: The operation result value to wrap.
        """
        super().__init__(op_result)

    def __str__(self) -> str:
        """Return a string representation of the layout.

        :return: A string in the format "shape:stride".
        """
        return f"{pretty_str(self.shape)}:{pretty_str(self.stride)}"

    @property
    def shape(self, *, loc=None, ip=None) -> Shape:
        """Get the shape of the layout.

        The shape defines the dimensions and structure of the layout's
        coordinate space.

        :param loc: Optional location information for debugging.
        :param ip: Optional insertion point for IR generation.
        :return: The hierarchical shape of the layout.
        """
        return _unpack_x_tuple(_cute_ir.get_shape(self, loc=loc, ip=ip), loc=loc, ip=ip)

    @property
    def stride(self, *, loc=None, ip=None) -> Stride:
        """Get the stride of the layout.

        The stride defines how coordinates map to linear indices in memory.

        :param loc: Optional location information for debugging.
        :param ip: Optional insertion point for IR generation.
        :return: The hierarchical stride of the layout.
        """
        return _unpack_x_tuple(
            _cute_ir.get_stride(self, loc=loc, ip=ip), loc=loc, ip=ip
        )

    @property
    def max_alignment(self) -> int:
        """Get the maximum alignment of the layout.

        :return: The maximum alignment in bytes.
        """
        return self.type.max_alignment

    def __eq__(self, other) -> Union[bool, Boolean]:
        """Check if this layout is equal to another layout.

        Two layouts are equal if they have the same shape and stride.

        :param other: The layout to compare with.
        :return: True if layouts are equal, False otherwise.
            May return an IR value for dynamic layouts.
        """
        if isinstance(other, Layout):
            # Fully static layouts compare at the type level; otherwise emit a
            # runtime equality check.
            if is_static(self.type) and is_static(other.type):
                return self.type == other.type
            return Boolean(_cute_ir.equal(self, other))
        else:
            return False

    def __req__(self, other) -> Union[bool, Boolean]:
        """Reflected equality check.

        :param other: The layout to compare with.
        :return: Result of other.__eq__(self).
        """
        if isinstance(other, Layout):
            return other.__eq__(self)
        return False

    def __ne__(self, other) -> Union[bool, Boolean]:
        """Check if this layout is not equal to another layout.

        :param other: The layout to compare with.
        :return: True if layouts are not equal, False otherwise.
        """
        if isinstance(other, Layout):
            if is_static(self.type) and is_static(other.type):
                return self.type != other.type
            return Boolean(not_(_cute_ir.equal(self, other)))
        else:
            return True

    def __rne__(self, other) -> Union[bool, Boolean]:
        """Reflected inequality check.

        :param other: The layout to compare with.
        :return: Result of other.__ne__(self).
        """
        if isinstance(other, Layout):
            return other.__ne__(self)
        return False

    def __getitem__(self, idx: int) -> Layout:
        """
        Top-level `get` to provide a syntax similar to `tuple`.
        """
        return get(self, mode=[idx])

    @dsl_user_op
    def __call__(self, coord: Coord, loc=None, ip=None) -> IntTuple:
        # Evaluating a layout at a coordinate maps the coordinate to its index.
        return crd2idx(coord, self, loc=loc, ip=ip)

    @dsl_user_op
    def get_hier_coord(self, idx, *, loc=None, ip=None) -> Coord:
        """Get the hierarchical coordinate corresponding to a linear index.

        This method maps from a linear index back to the logical coordinate
        in the layout's coordinate space.

        :param idx: The linear index to convert.
        :return: The hierarchical coordinate corresponding to the index.

        **Examples**::

            layout = make_layout((4, 8), stride=(8, 1))

            # map linear index back to coordinate: 5 -> (1, 1)
            coord = get_hier_coord(5, layout)
        """
        idx_val = Int32(idx).ir_value()
        crd = _cute_ir.get_hier_coord(idx_val, self, loc=loc, ip=ip)
        return _unpack_x_tuple(crd)

    @dsl_user_op
    def get_flat_coord(self, idx, *, loc=None, ip=None) -> Coord:
        """Get the flat (non-hierarchical) coordinate for a linear index.

        :param idx: The linear index to convert.
        :return: The flat coordinate corresponding to the index.
        """
        idx_val = Int32(idx).ir_value()
        res = _cute_ir.get_flat_coord(idx_val, self, loc=loc, ip=ip)
        return _unpack_x_tuple(res, loc=loc, ip=ip)


@ir.register_value_caster(_cute_ir.ComposedLayoutType.get_static_typeid(), replace=True)
class ComposedLayout(ir.Value):
    """ComposedLayout represents the functional composition of layouts in CuTe.

    A ComposedLayout is formed by the composition of three components:
    inner o offset o outer, where:

    - inner: The inner layout or swizzle that is applied last
    - offset: An integer tuple representing a coordinate offset
    - outer: The outer layout that is applied first

    ComposedLayout implements the functional composition operation where:
    R(c) := (inner o offset o outer)(c) := inner(offset + outer(c))

    This composition allows for complex transformations of coordinates and indices,
    enabling operations like tiling, partitioning, and reshaping of data.
+ + :ivar inner: The inner layout or swizzle component + :ivar offset: The coordinate offset applied between inner and outer layouts + :ivar outer: The outer layout component + :ivar max_alignment: The maximum alignment of the composed layout + + **Examples**:: + + # Create a composed layout with inner layout, offset, and outer layout + + # inner layout: (4, 8):(1, 4) + inner_layout = make_layout((4, 8)) + + offset = (0, 0) + + # outer layout: (2, 2):(1@0, 1@1) + outer_layout = make_layout((2, 2), stride=(1 * E(0), 1 * E(1))) + + # composed layout: (inner o offset o outer) + composed = make_composed_layout(inner_layout, offset, outer_layout) + + # Accessing components of the composed layout + inner = composed.inner + offset = composed.offset + outer = composed.outer + + # map coordinate (1, 2) to linear index + # - outer(1, 2) = (1, 2) + # - offset + outer(1, 2) = (1, 2) + # - inner(1, 2) = 1 * 1 + 2 * 4 = 9 + idx = crd2idx((1, 2), composed) + + # Composition is used in many tiling operations + # For example, in logical_product, raked_product, and blocked_product + """ + + def __init__(self, value) -> None: + """Initialize a ComposedLayout object. + + :param value: The operation result value to wrap. 
+ """ + super().__init__(value) + + def __str__(self) -> str: + return f"{pretty_str(self.inner)} o {pretty_str(self.offset)} o {pretty_str(self.outer)}" + + @property + def inner(self, *, loc=None, ip=None) -> Union[Swizzle, Layout]: + return _cute_ir.composed_get_inner(self, loc=loc, ip=ip) + + @property + def offset(self, *, loc=None, ip=None) -> IntTuple: + return _unpack_x_tuple(_cute_ir.composed_get_offset(self, loc=loc, ip=ip)) + + @property + def outer(self, *, loc=None, ip=None) -> Layout: + return _cute_ir.composed_get_outer(self, loc=loc, ip=ip) + + @property + def shape(self, *, loc=None, ip=None) -> Shape: + return _unpack_x_tuple(_cute_ir.get_shape(self, loc=loc, ip=ip), loc=loc, ip=ip) + + @property + def max_alignment(self) -> int: + return self.type.max_alignment + + def __eq__(self, other) -> Union[bool, Boolean]: + if isinstance(other, ComposedLayout): + if is_static(self.type) and is_static(other.type): + return self.type == other.type + else: + raise NotImplementedError( + f"runtime comparison of composed layouts is not supported, got `{self}` and `{other}`" + ) + else: + return False + + def __req__(self, other) -> Union[bool, Boolean]: + if isinstance(other, ComposedLayout): + return Boolean(other.__eq__(self)) + return False + + def __ne__(self, other) -> Union[bool, Boolean]: + return not self.__eq__(other) + + def __rne__(self, other) -> Union[bool, Boolean]: + if isinstance(other, ComposedLayout): + return other.__ne__(self) + return False + + def __getitem__(self, idx: int) -> "ComposedLayout": + """ + Top-level `get` to provide a syntax similar to `tuple`. + """ + return get(self, mode=[idx]) + + @dsl_user_op + def __call__(self, coord: Coord, loc=None, ip=None) -> IntTuple: + return crd2idx(coord, self, loc=loc, ip=ip) + + +@ir.register_value_caster(_cute_ir.PtrType.get_static_typeid(), replace=True) +class _Pointer(Pointer): + """ + A pointer class representing a memory address with specific properties. 
+ + Pointers are a fundamental type of iterator/engine that support random-access operations. + They can be offset by elements of a layout's codomain and dereferenced to produce values. + + :param value: The MLIR operation result value to initialize the pointer with + :type value: ir.Value + + :ivar type: The MLIR type of the pointer + :vartype type: Type + :ivar value_type: The type of value this pointer points to + :vartype value_type: Type + :ivar memspace: The memory space where the pointer data resides (e.g., gmem, smem, rmem) + :vartype memspace: AddressSpace + + :note: When composed with a layout, a pointer forms a tensor: T = E ∘ L, where E is the pointer + and L is the layout. The tensor evaluates the layout by mapping a coordinate c to the + codomain, offsets the pointer accordingly, and dereferences the result: + T(c) = (E ∘ L)(c) = *(E + L(c)) + """ + + def __init__(self, value) -> None: + assert isinstance(value, ir.Value) + self.value = ir.Value(value) + + def __str__(self) -> str: + # Cut off the MLIR type's string for making pretty_str more concise + return self.type.__str__()[6:] + + def __extract_mlir_values__(self): + return [self.value] + + def __new_from_mlir_values__(self, values): + # Only expecting single value of _Pointer instance or ir.Value + # In this context, a _Pointer instance is an encapsulated ir.Value which is automatically created + # by value caster for cute.ptr typed values + assert len(values) == 1, f"Expected 1 value, but got {len(values)}" + assert isinstance( + values[0], (_Pointer, ir.Value) + ), f"Expected _Pointer or ir.Value, but got {type(values[0])}" + return _Pointer( + values[0] if isinstance(values[0], ir.Value) else values[0].value + ) + + @property + @lru_cache_ir() + def value_type(self) -> Type[Numeric]: + return Numeric.from_mlir_type(self.value.type.value_type) + + @property + def alignment(self) -> int: + return self.type.alignment + + @property + def max_alignment(self) -> int: + return 
self.type.max_alignment + + @property + @lru_cache_ir() + def memspace(self) -> AddressSpace: + return self.type.address_space + + # Make it behave as if it inherited from ir.Value + @property + @lru_cache_ir() + def type(self) -> ir.Type: + return self.value.type + + # Only use if you absolutely need to get the LLVM pointer Value + @property + @lru_cache_ir() + def llvm_ptr(self, *, loc=None, ip=None) -> ir.Value: + """ + Get the LLVM pointer representation of this pointer. + + :param loc: The source location for the operation, defaults to None + :type loc: Location, optional + :param ip: The insertion point for the operation, defaults to None + :type ip: InsertionPoint, optional + :return: The LLVM pointer representation + :rtype: ir.Value + """ + llvm_ptr_ty = llvm.PointerType.get(self.type.address_space) + return builtin.unrealized_conversion_cast( + [llvm_ptr_ty], [self.value], loc=loc, ip=ip + ) + + def __add__(self, offset: IntTuple) -> Pointer: + """ + Offset the pointer by elements of a layout's codomain. + + :param offset: The offset to add to the pointer + :type offset: IntTuple + :return: A new pointer offset by the specified amount + :rtype: ir.Value + """ + offset = _pack_int_tuple(offset) + return _cute_ir.add_offset(self.value, offset=offset) + + @dsl_user_op + def toint(self, *, loc=None, ip=None): + if self.type.address_space in ( + _cute_ir.AddressSpace.gmem, + _cute_ir.AddressSpace.generic, + ): + res_type = Int64 + else: + res_type = Int32 + + return res_type( + _cute_ir.ptrtoint(res_type.mlir_type, self.value, loc=loc, ip=ip) + ) + + @dsl_user_op + def align(self, min_align: int, *, loc=None, ip=None) -> Pointer: + """ + Align a pointer to a specified byte alignment. + + :param min_align: The minimum byte alignment requirement. Must be a power of 2. 
+ :type min_align: int + :param loc: The source location for the operation, defaults to None + :type loc: Location, optional + :param ip: The insertion point for the operation, defaults to None + :type ip: InsertionPoint, optional + :return: The aligned new pointer that satisfies alignment request. + :rtype: Pointer + :raises ValueError: If the alignment is not a power of 2. + :raises TypeError: If pointer is in tmem address space. + """ + + if (min_align & (min_align - 1)) != 0: + raise ValueError("Alignment must be a power of 2") + + assert isinstance(self.type, _cute_ir.PtrType) + if self.type.address_space is AddressSpace.tmem: + raise ValueError("aligning a TMEM pointer is not supported") + + if min_align <= self.alignment: + return self + else: + # Convert pointer to integer + address_int = self.toint(loc=loc, ip=ip) + # Align the address + aligned_address = (address_int + min_align - 1) & ~(min_align - 1) + # Create and return the aligned pointer + return make_ptr( + Numeric.from_mlir_type(self.type.value_type), + aligned_address, + self.type.address_space, + assumed_align=min_align, + loc=loc, + ip=ip, + ) + + +@ir.register_value_caster(_cute_ir.MemRefType.get_static_typeid(), replace=True) +@ir.register_value_caster(_cute_ir.CountingTensorType.get_static_typeid(), replace=True) +@ir.register_value_caster( + _cute_nvgpu_ir.SmemDescViewType.get_static_typeid(), replace=True +) +class _Tensor(Tensor): + """A tensor class representing the composition of an iterator (engine) with a layout. + + A tensor evaluates the layout by mapping a coordinate to the codomain, offsets the + iterator accordingly, and dereferences the result to obtain the tensor's value. + Formally: T(c) = (E ∘ L)(c) = *(E + L(c)), where E is the iterator/engine and L is the layout. + + :param value: The MLIR operation result value to initialize the tensor with + :type value: ir.Value + :param dtype: The user specified data type of the tensor elements. 
It could be \ + different from the underlying dtype in the iterator. The default is None. + :type dtype: Type[Numeric], optional + + Attributes: + iterator: The pointer or iterator (engine) component of the tensor + layout: The layout component defining the mapping from coordinates to offsets + shape: The shape of the tensor, inherited from the layout + stride: The stride of the tensor, inherited from the layout + element_type: The data type of the tensor elements + memspace: The memory space where the tensor data resides + + Notes: + - The tensor supports both direct element access via coordinates and slicing operations + - Load/store operations are only supported for specific memory spaces (rmem, smem, gmem, generic) + - For composed layouts, stride information is not directly accessible + - Dynamic layouts do not support vector load/store operations + + Examples: + + .. code-block:: python + + # Create a tensor with shape (4,8) in row-major layout + tensor = make_tensor(ptr, make_layout(shape=(4,8), stride=(8,1))) + + # Access individual element + val = tensor[0, 0] # or val = tensor[(0, 0)] + + # Slice operation - get first column + subtensor = tensor[None, 0] # or subtensor = tensor[(None, 0)] + """ + + def __init__(self, value, dtype: Optional[Type[Numeric]] = None): + self._dtype = dtype + if isinstance(value, ir.Value): + self.value = value + else: + raise TypeError(f"Expected ir.Value, got {type(value)}") + + def __str__(self): + return f"tensor<{pretty_str(self.iterator)} o {pretty_str(self.layout)}>" + + def __extract_mlir_values__(self): + return [self.value] + + def __new_from_mlir_values__(self, values): + # Only expecting single value of _Tensor or ir.Value + # In this context, a _Tensor instance is an encapsulated ir.Value which is automatically created + # by value caster for MemRef/CountingTensor/SmemDescView typed values + assert len(values) == 1, f"Expected 1 value, but got {len(values)}" + assert isinstance( + values[0], (_Tensor, ir.Value) + 
), f"Expected _Tensor or ir.Value, but got {type(values[0])}" + return _Tensor( + values[0] if isinstance(values[0], ir.Value) else values[0].value, + self._dtype, + ) + + # Cheat to let `Type(_Tensor())` to return cute.Tensor + @property + def __class__(self) -> Type[Tensor]: + return Tensor + + # Make it behave as if it inherited from ir.Value + @property + @lru_cache_ir() + def type(self) -> ir.Type: + return self.value.type + + @dsl_user_op + def __getitem__( + self, crd: Coord, *, loc=None, ip=None + ) -> Union[Tensor, Numeric, IntTuple]: + """Access or slice tensor elements using coordinates. + + This method implements + * tensor evaluation T(c) = *(E + L(c)) when `c` is a coordinate without slicing, or + * tensor slicing operations T(c) = make_tensor(E + L(c), slice(L, c)) + where E is the iterator/engine and L is the layout + + :param crd: Coordinate or slice specification for accessing tensor elements + :type crd: Coord + :param loc: Source location for MLIR operation tracking, defaults to None + :type loc: Optional[Location] + :param ip: Insertion point for MLIR operation, defaults to None + :type ip: Optional[InsertionPoint] + :return: Tensor element value or sliced subtensor + :rtype: Union[Tensor, ir.Value, IntTuple] + + :raises ValueError: If coordinate access is invalid for the tensor layout + + Examples: + + .. 
code-block:: python + + # Create a tensor with pointer iterator + ptr = make_ptr(cutlass.Float32, 0, cutlass.AddressSpace.gmem) + layout = make_layout((64, 128)) # leftmost mode is major + tensor = make_tensor(ptr, layout) # Tensor using pointer iterator + + # Direct element access loads from memory + val = tensor[0] # Loads element at offset 0 + val = tensor[1] # Loads element at offset 4 (4bytes per Float32) + val = tensor[(0, 1)] # Loads element at offset 64 + + # Create a counting tensor + layout = make_layout((64, 128), stride=(1 * E(0), 1 * E(1))) + tensor = make_tensor((128, 128), layout) + + # Direct element access + val = tensor[0] # Returns (128, 128) + val = tensor[(0, 1)] # Returns (128, 129) + + # Slice access + sliced = view[(3, None)] # Returns tensor slice + + .. note:: + Sub-byte types like Float4E2M1FN and Float6E3M2FN are not supported for scalar + dereference operations. Attempting to set individual elements of tensors with + these element types will result in errors. + + Examples: + + .. 
code-block:: python + + # Unsupported operations with sub-byte types: + ptr = make_ptr(cutlass.Float4E2M1FN, 0, cutlass.AddressSpace.gmem) + tensor = make_tensor(ptr, layout) + # The following will raise an error: + val = tensor[0] # Error: sub-byte scalar dereference not supported + + # Similarly for other sub-byte types: + ptr = make_ptr(cutlass.Float6E3M2FN, 0, cutlass.AddressSpace.gmem) + tensor = make_tensor(ptr, layout) + val = tensor[0] # Error: sub-byte scalar dereference not supported + """ + if has_underscore(crd): + return slice_(self.value, crd) + elif isinstance(self.type, _cute_ir.CountingTensorType): + res = _cute_ir.get_iter(slice_(self, crd).value, loc=loc, ip=ip) + return _unpack_x_tuple(res) + else: + self._check_can_load_store() + self._check_can_dereference() + + crd_val = _pack_coord(crd, loc=loc, ip=ip) + data_val = _cute_ir.memref_load(self.value, crd_val, loc=loc, ip=ip) + return self.element_type(data_val) + + def _cvt_to_dest(self, data: Union["TensorSSA", Numeric], *, loc=None, ip=None): + if data.dtype is self.element_type: + return data.ir_value(loc=loc, ip=ip) + + orig_dtype = data.dtype + # Implicit upcast to wider type + if ( + data.dtype.is_same_kind(self.element_type) + and self.element_type.width >= data.dtype.width + ): + data = data.to(self.element_type, loc=loc, ip=ip) # type: ignore + + if data.dtype.width != self.element_type.width: + raise ValueError( + f"Type mismatch, store {orig_dtype} (-> {data.dtype}) " + f"to Tensor with element type {self.element_type}" + ) + + val = data.ir_value(loc=loc, ip=ip) + if isinstance(data.dtype, (Int8, Boolean)) and (self.element_type is Boolean): + zero = Int8(0).ir_value(loc=loc, ip=ip) + val = arith.cmpi(arith.CmpIPredicate.ne, val, zero, loc=loc, ip=ip) + + return val + + @dsl_user_op + def __setitem__( + self, + crd: Coord, + data: Union[int, float, ir.Value, Numeric, "TensorSSA"], + *, + loc=None, + ip=None, + ) -> None: + """Set tensor elements at specified coordinates. 
+ + Assigns values to tensor elements through direct coordinate access or slice assignment. + For slice assignment, the value must be a TensorSSA with matching shape. + + :param crd: Coordinate or slice specification for tensor element assignment + :type crd: Coord + :param value: Value to assign - can be scalar or TensorSSA for slice assignment + :type value: Union[int, float, ir.Value, TensorSSA] + :param loc: Source location for MLIR operation tracking, defaults to None + :type loc: Optional[Location] + :param ip: Insertion point for MLIR operation, defaults to None + :type ip: Optional[InsertionPoint] + + :raises ValueError: If tensor type doesn't support load/store operations + :raises ValueError: If slice assignment value is not a TensorSSA + :raises ValueError: If value type doesn't match tensor element type + :raises NotImplementedError: If value type is not supported + + .. note:: + Sub-byte types like Float4E2M1FN and Float6E3M2FN are not supported for scalar + dereference operations. Attempting to set individual elements of tensors with + these element types will result in errors. + + Examples: + + .. 
code-block:: python + + # Unsupported operations with sub-byte types: + ptr = make_ptr(cutlass.Float4E2M1FN, 0, cutlass.AddressSpace.gmem) + tensor = make_tensor(ptr, layout) + # The following will raise an error: + tensor[0] = 1.0 # Error: sub-byte scalar dereference not supported + + # Similarly for other sub-byte types: + ptr = make_ptr(cutlass.Float6E3M2FN, 0, cutlass.AddressSpace.gmem) + tensor = make_tensor(ptr, layout) + tensor[0] = 0.5 # Error: sub-byte scalar dereference not supported + """ + self._check_can_load_store() + + # convert scalar type + if not has_underscore(crd): + self._check_can_dereference() + # First, convert ir.Value to Numeric + if isinstance(data, ir.Value): + data = as_numeric(data) + elif isinstance(data, (int, float, bool)): + data = as_numeric(data) + + if not isinstance(data, Numeric): + raise ValueError(f"unsupported data type: {type(data)}") + + # Implicit upcast to wider type + val = self._cvt_to_dest(data, loc=loc, ip=ip) + if val.type != self.element_type.mlir_type: + raise ValueError( + f"type mismatch, store {val.type} to {self.element_type}" + ) + + crd_val = _pack_coord(crd, loc=loc, ip=ip) + _cute_ir.memref_store(self.value, crd_val, val, loc=loc, ip=ip) + else: + if not isinstance(data, TensorSSA): + raise ValueError(f"expects TensorSSA, but got {data}") + + self.__getitem__(crd).store(data, loc=loc, ip=ip) # type: ignore + + @property + def __class__(self) -> Type[Tensor]: + return Tensor + + # Make it behave as if it inherited from ir.Value + @property + @lru_cache_ir() + def type(self) -> ir.Type: + return self.value.type + + @property + def iterator(self) -> Union[Pointer, IntTuple]: + res = _cute_ir.get_iter(self.value) + if isinstance(res, Pointer): + return res + elif isinstance(res.type, _cute_ir.IntTupleType): + return _unpack_x_tuple(res) + elif isinstance(res, ir.Value): + # Example: SMEM descriptor iterator, not well supported today + return res + else: + raise TypeError(f"unsupported iterator type, got 
{type(res)}") + + @property + def layout(self) -> Layout: + return _cute_ir.get_layout(self.value) + + @property + def shape(self) -> Shape: + return self.layout.shape + + @property + def stride(self) -> Stride: + if isinstance(self.type, _cute_ir.ComposedLayoutType): + raise ValueError(f"can't get stride from composed layout") + return self.layout.stride + + @property + def leading_dim(self) -> Union[int, Tuple[int], None]: + """ + Get the leading dimension of this Tensor. + + Returns: + int: Single leading dimension index if found + Tuple[int, ...]: Tuple of indices for nested leading dimensions + None: If no leading dimension is found + """ + return find(1, self.stride, exclude_when=(1, self.shape)) + + @property + @lru_cache_ir() + def element_type(self) -> Union[Type[Numeric], Type[IntTuple]]: + if is_integer(self.iterator) or isinstance(self.iterator, tuple): + return IntTuple + elif isinstance(self.iterator, Pointer): + return self.iterator.value_type + else: + raise TypeError(f"unsupported iterator type, got {type(self.iterator)}") + + @property + @lru_cache_ir() + def memspace(self) -> AddressSpace: + if isinstance(self.iterator, Pointer): + return self.iterator.memspace + + raise ValueError(f"{self} doesn't have memspace") + + @dsl_user_op + def load(self, *, loc=None, ip=None) -> "TensorSSA": + """Load tensor elements as a vector. + + Loads all elements of the tensor into a vector representation, assuming the tensor + has a static shape and is in a memory space that supports load operations. 
+ + :param loc: Source location for MLIR operation tracking, defaults to None + :type loc: Optional[Location] + :param ip: Insertion point for MLIR operation, defaults to None + :type ip: Optional[InsertionPoint] + :return: Vector representation of tensor elements + :rtype: TensorSSA + + :raises ValueError: If tensor has dynamic layout + :raises ValueError: If tensor memory space doesn't support load operations + """ + if not is_static(self.shape): + raise ValueError("dynamic layout doesn't support load") + + self._check_can_load_store() + + res_vect = _cute_ir.memref_load_vec(self.value, row_major=True, loc=loc, ip=ip) + + return TensorSSA(res_vect, self.shape, self.element_type) + + @dsl_user_op + def store(self, data: "TensorSSA", *, loc=None, ip=None): + """Store vector data into tensor. + + Stores vector data into the tensor, assuming matching shapes and a memory space + that supports store operations. + + :param data: Vector data to store into tensor + :type data: TensorSSA + :param loc: Source location for MLIR operation tracking, defaults to None + :type loc: Optional[Location] + :param ip: Insertion point for MLIR operation, defaults to None + :type ip: Optional[InsertionPoint] + + :raises ValueError: If tensor has dynamic layout + :raises ValueError: If tensor memory space doesn't support store operations + :raises ValueError: If data shape doesn't match tensor shape + """ + if not isinstance(data, TensorSSA): + raise ValueError(f"Expects TensorSSA, but got {type(data)}") + + if not is_static(self.shape): + raise ValueError("Dynamic layout doesn't support vectorized store") + + self._check_can_load_store() + + n_elems = size(self.shape, loc=loc, ip=ip) + if n_elems != size(data.shape, loc=loc, ip=ip): + raise ValueError( + f"lhs and rhs must have the same shape, but got {self.shape} and {data.shape}" + ) + + elem_mlir_type = cutlass_arith.element_type(data.dtype.mlir_type) + if cutlass_arith.is_narrow_precision(elem_mlir_type): + if elem_mlir_type.width * 
n_elems % 32 != 0: + raise ValueError( + f"narrow precision type must be 32-bit aligned vector, but got {elem_mlir_type} with {n_elems} elements" + ) + + # Implicit upcast to wider type + new_data = self._cvt_to_dest(data, loc=loc, ip=ip) + + return _cute_ir.memref_store_vec( + new_data, self.value, row_major=True, loc=loc, ip=ip + ) + + @dsl_user_op + def fill(self, value: Numeric, *, loc=None, ip=None) -> None: + """Fill tensor with a constant value. + + Fills all elements of the tensor with the specified value, assuming static size + and supported memory space. + + :param value: Value to fill tensor with + :type value: Union[int, float] + :param loc: Source location for MLIR operation tracking, defaults to None + :type loc: Optional[Location] + :param ip: Insertion point for MLIR operation, defaults to None + :type ip: Optional[InsertionPoint] + + :raises NotImplementedError: If tensor has dynamic size + + Examples: + + .. code-block:: python + + # Create tensor from numpy array + b = np.random.randn(4, 8).astype(np.float32) + tensor = from_dlpack(b) + + # Fill tensor with constant value + tensor.fill(0.5) # All elements become 0.5 + """ + self._check_can_load_store() + + sz = size(self, loc=loc, ip=ip) + if type(sz) is not int: + raise NotImplementedError(f"dynamic size is not supported: {self.type}") + + # Should we cast to destination type even with narrow cast? 
+ dst_type = self.element_type + value = dst_type(value) + + self[None] = full(self.shape, fill_value=value, dtype=dst_type, loc=loc, ip=ip) + + def _check_can_load_store(self): + if not isinstance( + self.type, _cute_ir.MemRefType + ) or not self.type.address_space in ( + AddressSpace.rmem, + AddressSpace.smem, + AddressSpace.gmem, + AddressSpace.generic, + ): + raise ValueError(f"{self} doesn't support load and store") + + def _check_can_dereference(self): + # Check for sub-byte types and raise error if needed + if self.element_type.width % 8 != 0 and self.element_type is not Boolean: + raise ValueError( + f"Sub-byte scalar dereference not supported for type {self.element_type}" + ) + + +@dsl_user_op +def print_tensor(tensor: Tensor, *, verbose: bool = False, loc=None, ip=None): + """Print content of the tensor in human readable format. + + tensor(raw_ptr<@..., Float32, generic, align(4)> o (8,5):(5,1), data= + [[-0.4326, -0.5434, 0.1238, 0.7132, 0.8042], + [-0.8462, 0.9871, 0.4389, 0.7298, 0.6948], + [ 0.3426, 0.5856, 0.1541, 0.2923, 0.6976], + [-0.1649, 0.8811, 0.1788, 0.1404, 0.2568], + [-0.2944, 0.8593, 0.4171, 0.8998, 0.1766], + [ 0.8814, 0.7919, 0.7390, 0.4566, 0.1576], + [ 0.9159, 0.7577, 0.6918, 0.0754, 0.0591], + [ 0.6551, 0.1626, 0.1189, 0.0292, 0.8655]]) + """ + if not isinstance(tensor.type, _cute_ir.MemRefType): + raise NotImplementedError( + f"printing {tensor} is not supported because it doesn't support trivial dereferencing. " + f"Coordinate Tensor will be supported in the future." 
+ ) + + tensor._check_can_load_store() # type: ignore + + if tensor.element_type.is_integer: + signed = tensor.element_type.signed + else: + signed = False + + _cute_ir.print_view(tensor.value, verbose=verbose, is_signed=signed, loc=loc, ip=ip) + + +#################################################################################################### +# +# Core API +# +#################################################################################################### + + +# +# Utilties +# + + +@lru_cache_ir() +def is_integer(a) -> bool: + """Check if an object is static integer or dynamic integer""" + return ( + isinstance(a, int) + or isinstance(a, Integer) + or (isinstance(a, ir.Value) and isinstance(a.type, ir.IntegerType)) + ) + + +def is_valid_leaf(a) -> bool: + """ + Returns whether `a` has a type that is valid for a CuTe tuple's leaf. + """ + return ( + is_integer(a) + or (a is None) + or isinstance(a, (ScaledBasis, Layout, ComposedLayout)) + ) + + +def is_int_tuple(a) -> bool: + if isinstance(a, tuple): + return all([is_int_tuple(x) for x in a]) + else: + return is_integer(a) + + +def is_static(x: Union[ir.Type, ir.Value, XTuple]) -> bool: + """Check if a value is statically known at compile time. + + In CuTe, static values are those whose values are known at compile time, + as opposed to dynamic values which are only known at runtime. + + :param x: The value to check + :type x: Union[ir.Type, ir.Value, XTuple] + :return: True if the value is static, False otherwise + :rtype: bool + :raises TypeError: If an unsupported type is provided + """ + if isinstance(x, ir.Type): + return _cute_ir.is_static(x) + elif isinstance(x, tuple): + return all(is_static(a) for a in x) + # Can it be a static int? 
+ elif isinstance(x, Numeric): + return False + elif is_dynamic_expression(x): + return _cute_ir.is_static(x.type) + elif isinstance(x, int) or x is None: + return True + elif isinstance(x, ScaledBasis): + return x.is_static() + else: + raise TypeError(f"unsupported type {x}") + + +def has_underscore(a: XTuple) -> bool: + if type(a) is tuple: + return any([has_underscore(x) for x in a]) + else: + return a is None + + +def has_scaled_basis(a: XTuple) -> bool: + """Check if a tuple or its nested elements contain ScaledBasis objects. + + ScaledBasis objects are fundamental components in CuTe layouts, + representing the basis vectors of coordinate systems. + + :param a: The tuple to check + :type a: XTuple + :return: True if the tuple contains ScaledBasis objects, False otherwise + :rtype: bool + """ + if type(a) is tuple: + return any([has_scaled_basis(x) for x in a]) + else: + return isinstance(a, ScaledBasis) + + +def _tuple_str(t: tuple) -> str: + """ + Constructs a string representation of a python tuple without calling __repr__ on its elements. + """ + + def construct_inner_str(t) -> str: + if not isinstance(t, tuple): + return pretty_str(t) + res = "" + l = len(t) + for i in range(l): + res += pretty_str(t[i]) + if i < l - 1: + res += "," + return res + + res = "(" + construct_inner_str(t) + ")" + return res + + +def pretty_str(arg) -> str: + """ + Constructs a concise readable pretty string. + """ + if isinstance(arg, tuple): + # _tuple_str for tuples + return _tuple_str(arg) + elif arg is None: + # We interpret None as underscores for slicers + return "_" + else: + # Fallback to __str__ + return arg.__str__() + + +@dsl_user_op +def printf(*args, loc=None, ip=None) -> None: + """Print a value or a list of values. 
+ + :param args: List of values to print + :type args: list + :param loc: Source location where it's called, defaults to None + :type loc: source location, optional + :param ip: Insertion pointer, defaults to None + :type ip: insertion pointer, optional + :raises ValueError: If no arguments are provided or if an unsupported argument type is passed + """ + + if len(args) == 0: + raise ValueError("expects at least one argument to print") + + if isinstance(args[0], str): + fmt = args[0] + "\n" + args = args[1:] + else: + fmt = "{}" + ", {}" * (len(args) - 1) + "\n" + + def process_arg(arg): + arg0 = arg.value if isinstance(arg, Numeric) else arg + + if isinstance(arg0, ir.Value): + if isinstance(arg0.type, ir.FloatType) and (arg0.type != T.f32()): + raise TypeError( + f"cute.printf only supports 32-bit floating-point type, but got {arg0.type}" + ) + return arg0 + elif isinstance(arg0, bool): + return const(arg0, Boolean) + elif isinstance(arg0, int): + return const(arg0, Int32) + elif isinstance(arg0, float): + return const(arg0, Float32) + elif has_underscore(arg0): + # Assume it's a coordinate + return _pack_coord(arg0) + elif has_scaled_basis(arg0): + # Assume it's a stride + return _pack_stride(arg0) + elif isinstance(arg0, tuple): + # Assume it's an int_tuple + return _pack_int_tuple(arg0) + elif isinstance(arg0, (_Tensor, _Pointer)): + return arg0.value + else: + raise TypeError(f"unsupported argument type in printf, got {type(arg)}") + + args = [process_arg(a) for a in args] + _cute_ir.print_(args, fmt=fmt, loc=loc, ip=ip) + + +@dsl_user_op +def front(input, *, loc=None, ip=None): + """Recursively get the first element of input. + + This function traverses a hierarchical structure (like a layout or tensor) + and returns the first element at the deepest level. It's particularly useful + for accessing the first stride value in a layout to determine properties like + majorness. 
+ + :param input: The hierarchical structure to traverse + :type input: Union[Tensor, Layout, Stride] + :param loc: Source location where it's called, defaults to None + :type loc: source location, optional + :param ip: Insertion pointer for IR generation, defaults to None + :type ip: insertion pointer, optional + :return: The first element at the deepest level of the input structure + :rtype: Union[int, float, bool, ir.Value] + """ + if rank(input) == 1 and depth(input) == 0: + return input + else: + return front(get(input, mode=[0], loc=loc, ip=ip), loc=loc, ip=ip) + + +@dsl_user_op +def is_major(mode, stride: Stride, *, loc=None, ip=None) -> bool: + """ + Check whether a mode in stride is the major mode. + """ + first_stride = front(get(stride, mode=[mode], loc=loc, ip=ip), loc=loc, ip=ip) + if is_dynamic_expression(first_stride): + return False + return True if first_stride == 1 else False + + +@dsl_user_op +def find( + x: int, + t: Union[tuple, ir.Value, int], + *, + exclude_when: Optional[IntTuple] = None, + loc=None, + ip=None, +) -> Union[int, Tuple[int, ...], None]: + """Find the first position of a x in t. + If exclude_when is provided, the positions where comparison equals comparison_value will be excluded from the search results. 
+ + :param x: The static integer x to search for + :type x: int + :param t: The search space + :type t: Union[tuple, ir.Value, int] + :param exclude_when: A tuple of (comparison_value, comparison) - positions where comparison equals comparison_value will be excluded from the search results + :type exclude_when: Optional[Tuple[int, Union[tuple, ir.Value, int]]] + :return: Index if found at top level, tuple of indices showing nested position, or None if not found + :rtype: Union[int, Tuple[int, ...], None] + """ + if not isinstance(x, int): + raise TypeError(f"find() requires a static x to search for, but got {x}") + + # Extract comparison value and tuple from exclude_when if provided + comparison_value, comparison = None, None + if exclude_when is not None: + comparison_value, comparison = exclude_when + + # Iterate through t, checking both nested tuples and leaf values + for i in range(rank(t)): + # Get current elements from t and comparison + curr1 = get(t, mode=[i], loc=loc, ip=ip) + curr2 = ( + get(comparison, mode=[i], loc=loc, ip=ip) + if comparison is not None + else None + ) + + if isinstance(curr1, tuple): + # Recursively search nested tuple + sub_pos = find( + x, + curr1, + exclude_when=( + (comparison_value, curr2) if comparison is not None else None + ), + loc=loc, + ip=ip, + ) + if sub_pos is not None: + # Combine current index with recursive result + if isinstance(sub_pos, int): + return (i, sub_pos) + return (i,) + sub_pos + else: + # For leaf values, check if it matches x + # Skip dynamic expressions and Numeric types which can't be compared + if not (is_dynamic_expression(curr1) or isinstance(curr1, Numeric)): + if curr1 == x: + if ( + comparison is None + or is_dynamic_expression(curr2) + or isinstance(curr2, Numeric) + or curr2 != comparison_value + ): + return i + + return None + + +def transform_leaf(f, *args): + """ + Apply a function to the leaf nodes of nested tuple structures. 
+ + This function traverses nested tuple structures in parallel and applies the function f + to corresponding leaf nodes. All input tuples must have the same nested structure. + + :param f: Function to apply to leaf nodes + :type f: Callable + :param args: One or more nested tuple structures with matching profiles + :return: A new nested tuple with the same structure as the inputs, but with leaf values transformed by f + :raises TypeError: If the input tuples have different nested structures + + Example: + + .. code-block:: python + + >>> transform_leaf(lambda x: x + 1, (1, 2)) + (2, 3) + >>> transform_leaf(lambda x, y: x + y, (1, 2), (3, 4)) + (4, 6) + >>> transform_leaf(lambda x: x * 2, ((1, 2), (3, 4))) + ((2, 4), (6, 8)) + """ + if all(isinstance(t, tuple) for t in args): + return tuple(transform_leaf(f, *_args) for _args in zip(*args)) + elif all(not isinstance(t, tuple) for t in args): + return f(*args) + else: + raise TypeError(f"profile of input tuples doesn't match: {args}") + + +@dsl_user_op +def assume(src, divby=None, *, loc=None, ip=None): + if divby is None: + return src + + if isinstance(src, Integer): + width = type(src).width + src_val = src.ir_value() + else: + width = src.type.width + src_val = src + + res_ty = _cute_ir.ConstrainedIntType.get(divby, width) + assumed_val = _cute_ir.assume(res_ty, src_val, loc=loc, ip=ip) + return type(src)(IntValue(_pack_int_tuple(assumed_val, loc=loc, ip=ip))) + + +@dsl_user_op +def make_swizzle(b, m, s, *, loc=None, ip=None): + # canonicalize to <0, 4, 3> for identity swizzle (as compiler assumes <0, 4, 3>) + if b == 0: + m, s = 4, 3 + ty = ir.Type.parse(f'!cute.swizzle<"S<{b},{m},{s}>">') + return Swizzle(_cute_ir.static(ty, loc=loc, ip=ip)) + + +# +# Tuple API (also used by layouts and tensors) +# + + +def depth(a: Union[XTuple, Layout, "ComposedLayout"]) -> int: + """Returns the depth (nesting level) of a tuple, layout, or tensor. + + The depth of a tuple is the maximum depth of its elements plus 1. 
+ For an empty tuple, the depth is 1. For layouts and tensors, the depth + is determined by the depth of their shape. For non-tuple values (e.g., integers), + the depth is considered 0. + + :param a: The object whose depth is to be determined + :type a: Union[XTuple, Layout, ComposedLayout, Tensor, Any] + :return: The depth of the input object + :rtype: int + + Example: + + .. code-block:: python + + >>> depth(1) + 0 + >>> depth((1, 2)) + 1 + >>> depth(((1, 2), (3, 4))) + 2 + """ + if type(a) is tuple: + if not a: + return 1 + return max(depth(x) for x in a) + 1 + elif isinstance(a, (Layout, ComposedLayout, Tensor)): + return depth(a.shape) + else: + return 0 + + +@lru_cache_ir() +def rank(a: Union[XTuple, Layout, "ComposedLayout"]) -> int: + """Returns the rank (dimensionality) of a tuple, layout, or tensor. + + The rank of a tuple is its length. For layouts and tensors, the rank is + determined by the rank of their shape. For non-tuple values (e.g., integers), + the rank is considered 1 for convenience. + + :param a: The object whose rank is to be determined + :type a: Union[XTuple, Layout, ComposedLayout, Tensor, Any] + :return: The rank of the input object + :rtype: int + + This function is used in layout algebra to determine the dimensionality + of tensors and layouts for operations like slicing and evaluation. + """ + if isinstance(a, tuple): + return len(a) + elif isinstance(a, (Layout, ComposedLayout, Tensor)): + return rank(a.shape) + elif depth(a) == 0: + return 1 + else: + raise TypeError(f"unsupported type in rank, got {type(a)}") + + +def is_congruent( + a: Union[XTuple, Layout, ComposedLayout, Tensor], + b: Union[XTuple, Layout, ComposedLayout, Tensor], +) -> bool: + """ + Returns whether a is congruent to b. 
+ """ + if isinstance(a, (Layout, ComposedLayout, Tensor)): + a = a.shape + if isinstance(b, (Layout, ComposedLayout, Tensor)): + b = b.shape + if isinstance(a, tuple) and isinstance(b, tuple): + return (len(a) == len(b)) and all(is_congruent(x, y) for x, y in zip(a, b)) + if isinstance(a, tuple) or isinstance(b, tuple): + return False + return True + + +def is_weakly_congruent( + a: Union[XTuple, Layout, ComposedLayout, Tensor], + b: Union[XTuple, Layout, ComposedLayout, Tensor], +) -> bool: + """ + Returns whether a is weakly congruent to b. + """ + if isinstance(a, (Layout, ComposedLayout, Tensor)): + a = a.shape + if isinstance(b, (Layout, ComposedLayout, Tensor)): + b = b.shape + if not isinstance(a, tuple): + return True + if isinstance(a, tuple) and isinstance(b, tuple): + return (len(a) == len(b)) and all( + is_weakly_congruent(x, y) for x, y in zip(a, b) + ) + if isinstance(a, tuple) or isinstance(b, tuple): + return False + return True + + +@overload +def get(input: Shape, mode, *, loc=None, ip=None) -> Shape: ... +@overload +def get(input: Stride, mode, *, loc=None, ip=None) -> Stride: ... +@overload +def get(input: Coord, mode, *, loc=None, ip=None) -> Coord: ... +@overload +def get(input: IntTuple, mode, *, loc=None, ip=None) -> IntTuple: ... +@overload +def get(input: Tile, mode, *, loc=None, ip=None) -> Tile: ... +@overload +def get(input: Layout, mode, *, loc=None, ip=None) -> Layout: ... +@overload +def get(input: ComposedLayout, mode, *, loc=None, ip=None) -> ComposedLayout: ... + + +@dsl_user_op +def get(input, mode: List[int], *, loc=None, ip=None): + """Extract a specific element or sub-layout from a layout or tuple. + + This function recursively traverses the input according to the mode indices, + extracting the element at the specified path. For layouts, this operation + corresponds to extracting a specific sub-layout. 
+ + :param input: The input layout or tuple to extract from + :type input: Layout, ComposedLayout, tuple + :param mode: Indices specifying the path to traverse for extraction + :type mode: List[int] + :param loc: Source location for MLIR, defaults to None + :type loc: optional + :param ip: Insertion point, defaults to None + :type ip: optional + :return: The extracted element or sub-layout + :rtype: Layout, ComposedLayout, or element type + :raises ValueError: If any index in mode is out of range + :raises TypeError: If mode contains non-integer elements or if input has unsupported type + + **Examples**: + + For a layout like ((4,8),2):((16,1),8), get with mode=[0,1] would extract + the element 8 from the shape component. + """ + # Empty mode returns input and terminates the recursive call + if not mode: + return input + + if rank(input) <= mode[0]: + raise ValueError( + f"elements in mode must be less than rank({input}), got {mode}" + ) + + if depth(input) == 0: + return input + elif isinstance(input, tuple): + if not isinstance(mode[0], int): + raise TypeError( + f"invalid element in mode, expects int, got {type(mode[0])}" + ) + return get(input[mode[0]], mode=mode[1:]) + else: + if not isinstance(input, (Layout, ComposedLayout)): + raise TypeError(f"unsupported type of input, got {type(input)}") + return _cute_ir.get( + input.type.get_op_res_type(mode=mode), input, mode=mode, loc=loc, ip=ip + ) + + +@overload +def select(input: Shape, mode, *, loc=None, ip=None) -> Shape: ... +@overload +def select(input: Stride, mode, *, loc=None, ip=None) -> Stride: ... +@overload +def select(input: Coord, mode, *, loc=None, ip=None) -> Coord: ... +@overload +def select(input: IntTuple, mode, *, loc=None, ip=None) -> IntTuple: ... +@overload +def select(input: Tile, mode, *, loc=None, ip=None) -> Tile: ... +@overload +def select(input: Layout, mode, *, loc=None, ip=None) -> Layout: ... 
+@overload +def select(input: ComposedLayout, mode, *, loc=None, ip=None) -> ComposedLayout: ... + + +@dsl_user_op +def select(input, mode: List[int], *, loc=None, ip=None): + """Select modes from input. + + :param input: Input to select from + :type input: Layout, ComposedLayout, tuple + :param mode: Indices specifying which dimensions or elements to select + :type mode: List[int] + :param loc: Source location for MLIR, defaults to None + :type loc: optional + :param ip: Insertion point, defaults to None + :type ip: optional + :return: A new instance with selected dimensions/elements + :rtype: Layout, ComposedLayout, tuple + :raises ValueError: If any index in mode is out of range + :raises TypeError: If the input type is invalid + """ + if any((not isinstance(i, int)) or (i >= rank(input)) for i in mode): + raise ValueError( + f"invalid mode element for input of rank {rank(input)}, got {mode=}" + ) + + if isinstance(input, tuple): + return tuple(input[i] for i in mode) + + if not isinstance(input, (Layout, ComposedLayout)): + raise TypeError(f"unsupported type of input, got {type(input)}") + + return _cute_ir.select(input, mode=mode, loc=loc, ip=ip) + + +@overload +def group_modes(input: Shape, begin: int, end: int, *, loc=None, ip=None) -> Shape: ... +@overload +def group_modes( + input: Stride, begin: int, end: int, *, loc=None, ip=None +) -> Stride: ... +@overload +def group_modes(input: Coord, begin: int, end: int, *, loc=None, ip=None) -> Coord: ... +@overload +def group_modes( + input: IntTuple, begin: int, end: int, *, loc=None, ip=None +) -> IntTuple: ... +@overload +def group_modes(input: Tile, begin: int, end: int, *, loc=None, ip=None) -> Tile: ... +@overload +def group_modes( + input: Layout, begin: int, end: int, *, loc=None, ip=None +) -> Layout: ... +@overload +def group_modes( + input: ComposedLayout, begin: int, end: int, *, loc=None, ip=None +) -> ComposedLayout: ... 
+@overload +def group_modes( + input: Tensor, begin: int, end: int, *, loc=None, ip=None +) -> Tensor: ... + + +@dsl_user_op +def group_modes(input, begin: int, end: int = -1, *, loc=None, ip=None): + """Group modes of a hierarchical tuple or layout into a single mode. + + This function groups a range of modes from the input object into a single mode, + creating a hierarchical structure. For tuples, it creates a nested tuple containing + the specified range of elements. For layouts and other CuTe objects, it creates + a hierarchical representation where the specified modes are grouped together. + + :param input: Input object to group modes from (layout, tuple, etc.) + :type input: Layout, ComposedLayout, tuple, Shape, Stride, etc. + :param beg: Beginning index of the range to group (inclusive) + :type beg: int + :param end: Ending index of the range to group (exclusive) + :type end: int + :param loc: Source location for MLIR, defaults to None + :type loc: optional + :param ip: Insertion point, defaults to None + :type ip: optional + :return: A new object with the specified modes grouped + :rtype: Same type as input with modified structure + + Examples: + + .. code-block:: python + + # Group modes in a tuple + t = (2, 3, 4, 5) + grouped = group_modes(t, 1, 3) # (2, (3, 4), 5) + + # Group modes in a layout + layout = make_layout((2, 3, 4, 5)) + grouped_layout = group_modes(layout, 1, 3) # Layout with shape (2, (3, 4), 5) + + # Group modes in a shape + shape = make_shape(2, 3, 4, 5) + grouped_shape = group_modes(shape, 0, 2) # Shape ((2, 3), 4, 5) + """ + if depth(input) == 0: + return (input,) + if isinstance(input, tuple): + return (*input[:begin], (input[begin:end]), *input[end:]) + return _cute_ir.group_modes(input.value, begin, end, loc=loc, ip=ip) + + +@overload +def slice_(src: Shape, coord: Coord, *, loc=None, ip=None) -> Shape: ... +@overload +def slice_(src: Stride, coord: Coord, *, loc=None, ip=None) -> Stride: ... 
+@overload +def slice_(src: Coord, coord: Coord, *, loc=None, ip=None) -> Coord: ... +@overload +def slice_(src: IntTuple, coord: Coord, *, loc=None, ip=None) -> IntTuple: ... +@overload +def slice_(src: Tile, coord: Coord, *, loc=None, ip=None) -> Tile: ... +@overload +def slice_(src: Layout, coord: Coord, *, loc=None, ip=None) -> Layout: ... +@overload +def slice_( + src: ComposedLayout, coord: Coord, *, loc=None, ip=None +) -> ComposedLayout: ... +@overload +def slice_(src: Tensor, coord: Coord, *, loc=None, ip=None) -> Tensor: ... + + +@dsl_user_op +def slice_(src, coord: Coord, *, loc=None, ip=None): + """Perform a slice operation on a source object using the given coordinate. + + This function implements CuTe's slicing operation which extracts a subset of elements + from a source object (tensor, layout, etc.) based on a coordinate pattern. The slice + operation preserves the structure of the source while selecting specific elements. + + :param src: Source object to be sliced (tensor, layout, tuple, etc.) + :type src: Union[Tensor, Layout, IntTuple, Value] + :param coord: Coordinate pattern specifying which elements to select + :type coord: Coord + :param loc: Source location information, defaults to None + :type loc: Optional[Location] + :param ip: Insertion point for IR generation, defaults to None + :type ip: Optional[InsertionPoint] + :return: A new object containing the sliced elements + :rtype: Union[Tensor, Layout, IntTuple, tuple] + :raises ValueError: If the coordinate pattern is incompatible with source + + Examples: + + .. code-block:: python + + # Layout slicing + layout = make_layout((4,4)) + + # Select 1st index of first mode and keep all elements in second mode + sub_layout = slice_(layout, (1, None)) + + .. code-block:: python + + # Basic tensor slicing + tensor = make_tensor(...) # Create a 2D tensor + + # Select 1st index of first mode and keep all elements in second mode + sliced = slice_(tensor, (1, None)) + + .. 
code-block:: python + + # Select 2nd index of second mode and keep all elements in first mode + sliced = slice_(tensor, (None, 2)) + + Note: + - `None` represents keeping all elements in that mode + - Slicing preserves the layout/structure of the original object + - Can be used for: + * Extracting sub-tensors/sub-layouts + * Creating views into data + * Selecting specific patterns of elements + """ + + def lift_slice(a, b): + if isinstance(a, tuple): + if (not isinstance(b, tuple)) or (len(a) != len(b)): + raise ValueError("coord must be weakly congruent to src in slice_") + return reduce( + lambda p, q: p + q, (lift_slice(x, y) for x, y in zip(a, b)), () + ) + elif a is None: + return (b,) + else: + return () + + if is_integer(src) or isinstance(src, tuple): + if isinstance(coord, tuple): + if (not isinstance(src, tuple)) or (len(coord) != len(src)): + raise ValueError("coord must be weakly congruent to src in slice_") + return reduce( + lambda p, q: p + q, (lift_slice(x, y) for x, y in zip(coord, src)), () + ) + elif coord is None: + return src + else: + return () + + if isinstance(src, Tensor): + src = src.value + coord_val = _pack_coord(coord, loc=loc, ip=ip) + return _cute_ir.slice(input=src, coord=coord_val, loc=loc, ip=ip) + + +@overload +def dice(src: Shape, coord: Coord, *, loc=None, ip=None) -> Shape: ... +@overload +def dice(src: Stride, coord: Coord, *, loc=None, ip=None) -> Stride: ... +@overload +def dice(src: Coord, coord: Coord, *, loc=None, ip=None) -> Coord: ... +@overload +def dice(src: IntTuple, coord: Coord, *, loc=None, ip=None) -> IntTuple: ... +@overload +def dice(src: Tile, coord: Coord, *, loc=None, ip=None) -> Tile: ... +@overload +def dice(src: Layout, coord: Coord, *, loc=None, ip=None) -> Layout: ... +@overload +def dice(src: ComposedLayout, coord: Coord, *, loc=None, ip=None) -> ComposedLayout: ... 


@dsl_user_op
@lru_cache_ir()
def dice(src, dicer, *, loc=None, ip=None):
    """Keep modes in ``src`` when paired with an integer in ``dicer``.

    This function performs a dicing operation on the input based on the dicer
    coordinate. Dicing is a fundamental operation in CuTe that allows selecting
    specific modes from a tensor or layout based on a coordinate pattern. It is
    the complement of ``slice_``: modes paired with an integer are kept, modes
    paired with ``None`` are removed.

    :param src: The operand to be diced
    :type src: Union[IntTuple, Shape, Stride, Coord, Layout, ComposedLayout]
    :param dicer: A static coordinate indicating how to dice the input
    :type dicer: Coord
    :param loc: Source location information, defaults to None
    :type loc: Optional[Location]
    :param ip: Insertion point for IR generation, defaults to None
    :type ip: Optional[InsertionPoint]
    :return: The diced result with selected modes from the input
    :rtype: Union[IntTuple, Shape, Stride, Coord, Layout, ComposedLayout]
    :raises ValueError: If dicer is not static, or is not weakly congruent to src

    Examples:

    .. code-block:: python

        # Basic dicing of a layout
        layout = make_layout((32,16,8))

        # Keep only first and last modes
        diced = dice(layout, (1, None, 1))

    Note:
        - The dicer coordinate must be static
        - Modes paired with ``None`` are removed; integer leaves keep the mode
          (the DSL may also expose ``_`` as a placeholder equivalent to
          ``None`` — confirm against the package's exports)
    """
    if not is_static(dicer):
        raise ValueError(f"expects dicer to be static, but got {dicer}")

    # Recursive helper for the pure-tuple path: keeps src leaf `b` wherever the
    # matching dicer leaf `a` is an integer, drops it where `a` is None.
    def lift_dice(a, b):
        if isinstance(a, tuple):
            if (not isinstance(b, tuple)) or (len(a) != len(b)):
                raise ValueError("dicer must be weakly congruent to input in dice")
            return reduce(
                lambda p, q: p + q, (lift_dice(x, y) for x, y in zip(a, b)), ()
            )
        elif a is None:
            return ()
        else:
            return (b,)

    # Pure-Python path: src is a plain int or (nested) tuple — no IR involved.
    if is_integer(src) or isinstance(src, tuple):
        if isinstance(dicer, tuple):
            if (not isinstance(src, tuple)) or (len(dicer) != len(src)):
                raise ValueError("dicer must be weakly congruent to src in dice")
            return reduce(
                lambda p, q: p + q, (lift_dice(x, y) for x, y in zip(dicer, src)), ()
            )
        elif dicer is None:
            return ()
        else:
            return src

    # IR path for layouts/composed layouts.
    dicer_val = _pack_coord(dicer, loc=loc, ip=ip)
    return _cute_ir.dice(src, dicer_val.type.attribute, loc=loc, ip=ip)


def wrap(x) -> tuple:
    """
    Wraps the input into a tuple if not a tuple.
    """
    if isinstance(x, tuple):
        return x
    return (x,)


def _extend(func, input, elem, up_to_rank, loc, ip):
    # Shared implementation behind prepend/append. Extends `input` with `elem`
    # up to rank `up_to_rank` (or by exactly one mode when None), dispatching
    # to the IR op `func` for Layout/ComposedLayout and doing pure-tuple
    # arithmetic for leaves and tuples. `func` is one of
    # _cute_ir.prepend_to_rank / _cute_ir.append_to_rank, and also selects
    # which side of the tuple the fill elements go on.
    if input is None:
        raise ValueError(f"No input provided for input")

    if isinstance(input, (Layout, ComposedLayout)):
        # Layouts extend with a rank-1 identity layout by default.
        if elem is None:
            elem = make_layout(1)
        elif not isinstance(elem, Layout):
            raise TypeError(f"Input type of elem ({type(elem)}) is not accepted!")
        N = rank(input) + 1 if up_to_rank is None else up_to_rank
        return func(N, input, elem, loc=loc, ip=ip)

    if is_valid_leaf(input) or isinstance(input, tuple):
        # Tuples/leaves extend with the integer 1 by default.
        if elem is None:
            elem = 1
        if (not isinstance(elem, tuple)) and (not is_valid_leaf(elem)):
            raise TypeError(f"Input type of elem ({type(elem)}) is not accepted!")

        input = wrap(input)
        repeat_cnt = 1 if up_to_rank is None else up_to_rank - rank(input)
        if repeat_cnt == 0:
            return input
        elif repeat_cnt < 0:
            raise ValueError(f"up_to_rank must be >= rank(input)")
        else:
            # Identity comparison against the IR op decides prepend vs append.
            if func is _cute_ir.prepend_to_rank:
                return (elem,) * repeat_cnt + input
            else:
                return input + (elem,) * repeat_cnt

    raise TypeError(f"invalid type for input, got {type(input)}")


@overload
def prepend(
    input: Shape, elem: Shape, up_to_rank=None, *, loc=None, ip=None
) -> Shape: ...
@overload
def prepend(
    input: Stride, elem: Stride, up_to_rank=None, *, loc=None, ip=None
) -> Stride: ...
@overload
def prepend(
    input: Coord, elem: Coord, up_to_rank=None, *, loc=None, ip=None
) -> Coord: ...
@overload
def prepend(
    input: IntTuple, elem: IntTuple, up_to_rank=None, *, loc=None, ip=None
) -> IntTuple: ...
@overload
def prepend(input: Tile, elem: Tile, up_to_rank=None, *, loc=None, ip=None) -> Tile: ...
@overload
def prepend(
    input: Layout, elem: Layout, up_to_rank=None, *, loc=None, ip=None
) -> Layout: ...
@overload
def prepend(
    input: ComposedLayout, elem: Layout, up_to_rank=None, *, loc=None, ip=None
) -> ComposedLayout: ...


@dsl_user_op
def prepend(input, elem, up_to_rank: Union[None, int] = None, *, loc=None, ip=None):
    """Extend input to rank up_to_rank by prepending elem in front of input.

    This function extends the input object by prepending elements to reach a desired rank.
    It supports various CuTe types including shapes, layouts, tensors etc.

    :param input: Source to be prepended to
    :type input: Union[Shape, Stride, Coord, IntTuple, Tile, Layout, ComposedLayout, Tensor]
    :param elem: Element to prepend to input
    :type elem: Union[Shape, Stride, Coord, IntTuple, Tile, Layout]
    :param up_to_rank: The target rank after extension, defaults to None
    :type up_to_rank: Union[None, int], optional
    :param loc: Source location for MLIR, defaults to None
    :type loc: Optional[Location]
    :param ip: Insertion point, defaults to None
    :type ip: Optional[InsertionPoint]
    :return: The extended result with prepended elements
    :rtype: Union[Shape, Stride, Coord, IntTuple, Tile, Layout, ComposedLayout, Tensor]
    :raises ValueError: If up_to_rank is less than input's current rank
    :raises TypeError: If input or elem has unsupported type

    Examples:

    .. code-block:: python

        # Prepend to a Shape
        shape = (4,4)
        prepend(shape, 2)  # Returns (2,4,4)

        # Prepend to a Layout
        layout = make_layout((8,8))
        prepend(layout, make_layout((2,)))  # Returns (2,8,8):(1,1,8)

        # Prepend with target rank
        coord = (1,1)
        prepend(coord, 0, up_to_rank=4)  # Returns (0,0,1,1)
    """
    return _extend(_cute_ir.prepend_to_rank, input, elem, up_to_rank, loc=loc, ip=ip)


@overload
def append(
    input: Shape, elem: Shape, up_to_rank=None, *, loc=None, ip=None
) -> Shape: ...
@overload
def append(
    input: Stride, elem: Stride, up_to_rank=None, *, loc=None, ip=None
) -> Stride: ...
@overload
def append(
    input: Coord, elem: Coord, up_to_rank=None, *, loc=None, ip=None
) -> Coord: ...
@overload
def append(
    input: IntTuple, elem: IntTuple, up_to_rank=None, *, loc=None, ip=None
) -> IntTuple: ...
@overload
def append(input: Tile, elem: Tile, up_to_rank=None, *, loc=None, ip=None) -> Tile: ...
@overload
def append(
    input: Layout, elem: Layout, up_to_rank=None, *, loc=None, ip=None
) -> Layout: ...
@overload
def append(
    input: ComposedLayout, elem: Layout, up_to_rank=None, *, loc=None, ip=None
) -> ComposedLayout: ...


@dsl_user_op
def append(input, elem, up_to_rank: Union[None, int] = None, *, loc=None, ip=None):
    """Extend input to rank up_to_rank by appending elem to the end of input.

    This function extends the input object by appending elements to reach a desired rank.
    It supports various CuTe types including shapes, layouts, tensors etc.

    :param input: Source to be appended to
    :type input: Union[Shape, Stride, Coord, IntTuple, Tile, Layout, ComposedLayout, Tensor]
    :param elem: Element to append to input
    :type elem: Union[Shape, Stride, Coord, IntTuple, Tile, Layout]
    :param up_to_rank: The target rank after extension, defaults to None
    :type up_to_rank: Union[None, int], optional
    :param loc: Source location for MLIR, defaults to None
    :type loc: Optional[Location]
    :param ip: Insertion point, defaults to None
    :type ip: Optional[InsertionPoint]
    :return: The extended result with appended elements
    :rtype: Union[Shape, Stride, Coord, IntTuple, Tile, Layout, ComposedLayout, Tensor]
    :raises ValueError: If up_to_rank is less than input's current rank
    :raises TypeError: If input or elem has unsupported type

    Examples:

    .. code-block:: python

        # Append to a Shape
        shape = (4,4)
        append(shape, 2)  # Returns (4,4,2)

        # Append to a Layout
        layout = make_layout((8,8))
        append(layout, make_layout((2,)))  # Returns (8,8,2):(1,8,1)

        # Append with target rank
        coord = (1,1)
        append(coord, 0, up_to_rank=4)  # Returns (1,1,0,0)

    Note:
        - The function preserves the structure of the input while extending it
        - Can be used to extend tensors, layouts, shapes and other CuTe types
        - When up_to_rank is specified, fills remaining positions with elem
        - Useful for tensor reshaping and layout transformations
    """
    return _extend(_cute_ir.append_to_rank, input, elem, up_to_rank, loc=loc, ip=ip)


@dsl_user_op
def prepend_ones(
    t: Tensor, up_to_rank: Union[None, int] = None, *, loc=None, ip=None
) -> Tensor:
    """Return a view of ``t`` whose layout is extended to ``up_to_rank`` by
    prepending size-1 modes (rank-1 identity layouts). The iterator is reused,
    so no data is moved."""
    return make_tensor(
        t.iterator, prepend(t.layout, make_layout(1), up_to_rank), loc=loc, ip=ip
    )


@dsl_user_op
def append_ones(
    t: Tensor, up_to_rank: Union[None, int] = None, *, loc=None, ip=None
) -> Tensor:
    """Return a view of ``t`` whose layout is extended to ``up_to_rank`` by
    appending size-1 modes (rank-1 identity layouts). The iterator is reused,
    so no data is moved."""
    return make_tensor(
        t.iterator, append(t.layout, make_layout(1), up_to_rank), loc=loc, ip=ip
    )


def repeat_like(x, target):
    """Creates an object congruent to target and filled with x.

    This function recursively creates a nested tuple structure that matches the structure
    of the target, with each leaf node filled with the value x.

    :param x: The value to fill the resulting structure with
    :type x: Any
    :param target: The structure to mimic
    :type target: Union[tuple, Any]
    :return: A structure matching target but filled with x
    :rtype: Union[tuple, Any]

    Examples:

    .. code-block:: python

        repeat_like(0, (1, 2, 3))    # Returns (0, 0, 0)
        repeat_like(1, ((1, 2), 3))  # Returns ((1, 1), 1)
        repeat_like(2, 5)            # Returns 2
    """
    if not isinstance(target, tuple):
        return x
    if not target:
        return ()
    if len(target) == 1:
        # Preserve 1-tuples explicitly so the result stays a tuple.
        return (repeat_like(x, target[0]),)
    return tuple(repeat_like(x, t) for t in target)


def flatten_to_tuple(a: Union[IntTuple, Coord, Shape, Stride]) -> tuple:
    """Flattens a potentially nested tuple structure into a flat tuple.

    This function recursively traverses the input structure and flattens it into
    a single-level tuple, preserving the order of elements.

    :param a: The structure to flatten
    :type a: Union[IntTuple, Coord, Shape, Stride]
    :return: A flattened tuple containing all elements from the input
    :rtype: tuple

    Examples:

    .. code-block:: python

        flatten_to_tuple((1, 2, 3))       # Returns (1, 2, 3)
        flatten_to_tuple(((1, 2), 3))     # Returns (1, 2, 3)
        flatten_to_tuple((1, (2, (3,))))  # Returns (1, 2, 3)
    """
    if not isinstance(a, tuple):
        # Leaves are wrapped so the recursive results can be chained.
        return wrap(a)
    else:
        return tuple(chain.from_iterable(tuple(flatten_to_tuple(x) for x in a)))


def flatten(a: Union[IntTuple, Coord, Shape, Stride, Layout, Tensor]) -> tuple:
    """Flattens a CuTe data structure into a simpler form.

    For tuples, this function flattens the structure into a single-level tuple.
    For non-tuple types, it returns the input unchanged.

    :param a: The structure to flatten
    :type a: Union[IntTuple, Coord, Shape, Stride, Layout, Tensor]
    :return: The flattened structure
    :rtype: Union[tuple, Any]
    :raises NotImplementedError: If input is a Layout or Tensor

    Examples:

    .. code-block:: python

        flatten((1, 2, 3))          # Returns (1, 2, 3)
        flatten(((1, 2), (3, 4)))   # Returns (1, 2, 3, 4)
        flatten(5)                  # Returns 5
    """
    if isinstance(a, (Layout, Tensor)):
        raise NotImplementedError("flatten layout and tensor is not supported")

    if not isinstance(a, tuple):
        # Unlike flatten_to_tuple, leaves are returned unwrapped.
        return a
    else:
        return flatten_to_tuple(a)


def unflatten(
    sequence: Union[Tuple[Any, ...], List[Any], Iterable[Any]], profile: XTuple
) -> XTuple:
    """Unflatten a flat tuple into a nested tuple structure according to a profile.

    This function transforms a flat sequence of elements into a nested tuple structure
    that matches the structure defined by the profile parameter. It traverses the profile
    structure and populates it with elements from the sequence.

    sequence must be long enough to fill the profile. If it is exhausted early,
    the underlying ``next`` call fails (StopIteration, or RuntimeError where
    PEP 479 generator semantics apply — confirm against ``transform_leaf``).

    :param sequence: A flat sequence of elements to be restructured
    :type sequence: Union[Tuple[Any, ...], List[Any], Iterable[Any]]
    :param profile: A nested tuple structure that defines the shape of the output
    :type profile: XTuple
    :return: A nested tuple with the same structure as profile but containing elements from sequence
    :rtype: XTuple

    Example:
    >>> unflatten([1, 2, 3, 4], ((0, 0), (0, 0)))
    ((1, 2), (3, 4))
    """

    def _make_generator():
        for element in sequence:
            yield element

    # Each leaf of `profile` is replaced by the next element of the sequence,
    # consumed left-to-right.
    xs = _make_generator()
    return transform_leaf(lambda _: next(xs), profile)


@dsl_user_op
def elem_less(
    lhs: Union[Shape, IntTuple, Coord],
    rhs: Union[Shape, IntTuple, Coord],
    *,
    loc=None,
    ip=None,
):
    """Element-wise "less than" comparison of two coordinates/int-tuples,
    lowered to the ``_cute_ir.elem_less`` op; returns a ``Boolean`` result.

    :param lhs: Left-hand side operand
    :type lhs: Union[Shape, IntTuple, Coord]
    :param rhs: Right-hand side operand
    :type rhs: Union[Shape, IntTuple, Coord]
    :param loc: Source location for MLIR, defaults to None
    :type loc: optional
    :param ip: Insertion point, defaults to None
    :type ip: optional
    :return: Result of the element-wise comparison
    :rtype: Boolean
    """
    lhs_val = _pack_coord(lhs, loc=loc, ip=ip)
    rhs_val = _pack_coord(rhs, loc=loc, ip=ip)
    return Boolean(_cute_ir.elem_less(lhs_val, rhs_val, loc=loc, ip=ip))


@overload
def filter_zeros(
    input: Layout, *, target_profile=None, loc=None, ip=None
) -> Layout: ...
@overload
def filter_zeros(
    input: Tensor, *, target_profile=None, loc=None, ip=None
) -> Tensor: ...


@dsl_user_op
def filter_zeros(input, *, target_profile=None, loc=None, ip=None):
    """Filter out zeros from a layout or tensor.

    This function removes zero-stride dimensions from a layout or tensor.
    See Section 3.3 in the CuTe Whitepaper for more details on layout operations.

    :param input: The input layout or tensor to filter
    :type input: Layout or Tensor
    :param target_profile: Target profile for the filtered result, defaults to None
    :type target_profile: optional
    :param loc: Source location for MLIR, defaults to None
    :type loc: optional
    :param ip: Insertion point, defaults to None
    :type ip: optional
    :return: The filtered layout or tensor with zeros removed
    :rtype: Layout or Tensor
    :raises TypeError: If input is not a Layout or Tensor
    """
    if not isinstance(input, (Layout, Tensor)):
        raise TypeError(f"Expect layout or tensor as input but got {type(input)=}")
    # Tensors are unwrapped to their underlying IR value before the op.
    if isinstance(input, Tensor):
        input = input.value
    return _cute_ir.filter_zeros(input, target_profile=target_profile, loc=loc, ip=ip)


@dsl_user_op
def filter(input: Union[Layout, Tensor], *, loc=None, ip=None):
    """Filter a layout or tensor.

    This function filters a layout or tensor according to CuTe's filtering rules.

    :param input: The input layout or tensor to filter
    :type input: Layout or Tensor
    :param loc: Source location for MLIR, defaults to None
    :type loc: optional
    :param ip: Insertion point, defaults to None
    :type ip: optional
    :return: The filtered layout or tensor
    :rtype: Layout or Tensor
    :raises TypeError: If input is not a Layout or Tensor
    """
    if not isinstance(input, (Layout, Tensor)):
        raise TypeError(f"Expect layout or tensor as input but got {type(input)=}")
    # NOTE(review): this checks _Tensor while filter_zeros checks Tensor —
    # presumably aliases of the same class; confirm and unify.
    if isinstance(input, _Tensor):
        input = input.value
    return _cute_ir.filter(input, loc=loc, ip=ip)


@dsl_user_op
def product(a: Union[IntTuple, Shape], *, loc=None, ip=None):
    """Return product of the given IntTuple or Shape.

    Computes the product of all elements in the input tuple or shape.
    Returns static value if type is static.

    :param a: The input tuple or shape
    :type a: IntTuple or Shape
    :param loc: Source location for MLIR, defaults to None
    :type loc: optional
    :param ip: Insertion point, defaults to None
    :type ip: optional
    :return: Static product of IntTuple or Shape if static, otherwise a Value
    :rtype: int or Value
    :raises TypeError: If input is not an IntTuple or Shape
    """
    # A bare integer is its own product.
    if is_integer(a):
        return a
    if isinstance(a, tuple):
        a_val = _pack_int_tuple(a, loc=loc, ip=ip)
        res = _cute_ir.tuple_product(a_val, loc=loc, ip=ip)
        return _unpack_x_tuple(res, loc=loc, ip=ip)
    else:
        raise TypeError(f"expects IntTuple or Shape, but got {type(a)}")


@overload
def product_like(
    a: IntTuple, target_profile: XTuple, *, loc=None, ip=None
) -> IntTuple: ...
@overload
def product_like(a: Shape, target_profile: XTuple, *, loc=None, ip=None) -> Shape: ...


@dsl_user_op
def product_like(
    a: Union[IntTuple, Shape], target_profile: XTuple, *, loc=None, ip=None
):
    """Return product of the given IntTuple or Shape at leaves of `target_profile`.

    This function computes products according to the structure defined by target_profile.

    :param a: The input tuple or shape
    :type a: IntTuple or Shape
    :param target_profile: The profile that guides how products are computed
    :type target_profile: XTuple
    :param loc: Source location for MLIR, defaults to None
    :type loc: optional
    :param ip: Insertion point, defaults to None
    :type ip: optional
    :return: The resulting tuple with products computed according to target_profile
    :rtype: IntTuple or Shape
    :raises TypeError: If inputs have incompatible types
    :raises ValueError: If inputs have incompatible shapes
    """
    # Perform product at leaf of `target_profile`
    if not isinstance(target_profile, tuple):
        return product(a, loc=loc, ip=ip)
    else:
        if not isinstance(a, tuple):
            raise TypeError(f"expects `a` tuple but got {a}")

        # NOTE(review): the message says `guide` but the parameter is named
        # target_profile — message text left untouched here.
        if len(a) != len(target_profile):
            raise ValueError(f"expects `a` and `guide` have the same rank")

        return tuple(
            product_like(x, g, loc=loc, ip=ip) for x, g in zip(a, target_profile)
        )


@overload
def product_each(a: IntTuple, *, loc=None, ip=None) -> IntTuple: ...
@overload
def product_each(a: Shape, *, loc=None, ip=None) -> Shape: ...


@dsl_user_op
def product_each(a, *, loc=None, ip=None):
    """Compute products for each component of the input.

    Returns a rank(a) tuple `result` such that get(result, mode=[i]) == product(get(a, mode=[i]))

    :param a: The input tuple or shape
    :type a: IntTuple or Shape
    :param loc: Source location for MLIR, defaults to None
    :type loc: optional
    :param ip: Insertion point, defaults to None
    :type ip: optional
    :return: A tuple containing products for each component
    :rtype: tuple
    :raises TypeError: If input is not an IntTuple or Shape

    Note:
        - An integer input is returned unchanged and an empty tuple yields 1,
          not a tuple — callers relying on a tuple result should pass a
          non-empty tuple.
    """
    if is_integer(a):
        return a
    if isinstance(a, tuple):
        if not a:
            # Empty product convention.
            return 1
        else:
            a_val = _pack_int_tuple(a, loc=loc, ip=ip)
            res = _cute_ir.tuple_product_each(a_val, loc=loc, ip=ip)
            return _unpack_x_tuple(res, loc=loc, ip=ip)
    else:
        raise TypeError(f"expects IntTuple or Shape, but got {type(a)}")


@dsl_user_op
def size(
    a: Union[IntTuple, Shape, Layout, ComposedLayout, Tensor],
    mode: List[int] = [],
    *,
    loc=None,
    ip=None,
) -> Int:
    """Return size of domain of layout or tensor.

    Computes the size (number of elements) in the domain of a layout or tensor.
    For layouts, this corresponds to the shape of the coordinate space.
    See Section 3.2 in the CuTe Whitepaper for more details on layout domains.

    :param a: The input object whose size to compute
    :type a: IntTuple, Shape, Layout, ComposedLayout or Tensor
    :param mode: List of mode(s) for size calculation. If empty, computes total size, defaults to []
    :type mode: list of int, optional
    :param loc: Source location for MLIR, defaults to None
    :type loc: optional
    :param ip: Insertion point, defaults to None
    :type ip: optional
    :return: Static size of layout or tensor if static, otherwise a Value
    :rtype: int or Value
    :raises ValueError: If mode contains non-integer elements
    """
    # NOTE: the mutable default [] is never mutated here, so it is safe,
    # but an immutable default (e.g. ()) would be more conventional.
    if any(not isinstance(m, int) for m in mode):
        raise ValueError(f"expects integer elements in mode, but got {mode}")

    # TiledMma/TiledCopy expose their own size property directly.
    if isinstance(a, (TiledMma, TiledCopy)):
        return a.size
    a_val = None
    if not isinstance(a, (Layout, ComposedLayout, Tensor)):
        a_val = _pack_int_tuple(a, loc=loc, ip=ip)
    elif isinstance(a, Tensor):
        a_val = a.value
    else:
        a_val = a

    res = _cute_ir.size(a_val, mode=mode, loc=loc, ip=ip)
    return _unpack_x_tuple(res, loc=loc, ip=ip)  # type: ignore


@dsl_user_op
def shape_div(lhs: Shape, rhs: Shape, *, loc=None, ip=None) -> Shape:
    """Perform element-wise division of shapes.

    This function performs element-wise division between two shapes.

    :param lhs: Left-hand side shape
    :type lhs: Shape
    :param rhs: Right-hand side shape
    :type rhs: Shape
    :param loc: Source location for MLIR, defaults to None
    :type loc: optional
    :param ip: Insertion point, defaults to None
    :type ip: optional
    :return: The result of element-wise division
    :rtype: Shape
    """
    lhs = _pack_shape(lhs, loc=loc, ip=ip)
    rhs = _pack_shape(rhs, loc=loc, ip=ip)
    res = _cute_ir.shape_div(lhs, rhs, loc=loc, ip=ip)
    return _unpack_x_tuple(res, loc=loc, ip=ip)


@dsl_user_op
def ceil_div(input: Shape, tiler: Tiler, *, loc=None, ip=None) -> Shape:
    """
    Compute the ceiling division of a target shape by a tiling specification.

    This function computes the number of tiles required to cover the target domain.
    It is equivalent to the second mode of `zipped_divide(input, tiler)`.

    :param input: A tuple of integers representing the dimensions of the target domain.
    :type input: Shape
    :param tiler: The tiling specification.
    :type tiler: Union[Layout, Shape, Tile]
    :param loc: Optional location information for IR diagnostics.
    :type loc: optional
    :param ip: Optional instruction pointer or context for underlying IR functions.
    :type ip: optional
    :return: A tuple of integers representing the number of tiles required along each dimension,
        i.e. the result of the ceiling division of the input dimensions by the tiler dimensions.
    :rtype: Shape

    Example:

    .. code-block:: python

        import cutlass.cute as cute

        @cute.jit
        def foo():
            input = (10, 6)
            tiler = (3, 4)
            result = cute.ceil_div(input, tiler)
            print(result)  # Outputs: (4, 2)
    """
    input_val = _pack_shape(input, loc=loc, ip=ip)
    tiler_val = _pack_tile(tiler, loc=loc, ip=ip)
    res = _cute_ir.ceil_div(input=input_val, tiler=tiler_val, loc=loc, ip=ip)
    return _unpack_x_tuple(res, loc=loc, ip=ip)


def round_up(a: IntTuple, b: IntTuple) -> IntTuple:
    """
    Rounds up elements of a using elements of b.
    """
    if isinstance(a, tuple):
        if not a:
            raise ValueError(f"inputs cannot be empty")
        if not isinstance(b, tuple):
            raise TypeError(
                f"expects both inputs to be tuple, but got {type(a)} and {type(b)}"
            )
        if rank(a) < rank(b):
            raise ValueError(
                f"expects rank(a) to be greater or equal than rank(b), but got {a}, {b}"
            )
        # Right-pad b with 1s so every mode of a has a rounding factor.
        b = append(b, 1, rank(a))
        return tuple(round_up(x, y) for x, y in zip(a, b))
    # Leaf case: classic integer round-up to a multiple of b.
    return ((a + b - 1) // b) * b


#
# Layout API (also used by tensors)
#


@dsl_user_op
def make_layout(
    shape: Shape, *, stride: Union[Stride, None] = None, loc=None, ip=None
) -> Layout:
    """Create a CuTe Layout object from shape and optional stride information.

    A Layout in CuTe represents the mapping between logical and physical coordinates of a tensor.
    This function creates a Layout object that defines how tensor elements are arranged in memory.

    :param shape: Shape of the layout defining the size of each mode
    :type shape: Shape
    :param stride: Optional stride values for each mode, defaults to None
    :type stride: Union[Stride, None]
    :param loc: Source location information, defaults to None
    :type loc: Optional[Location]
    :param ip: Insertion point for IR generation, defaults to None
    :type ip: Optional[InsertionPoint]
    :return: A new Layout object with the specified shape and stride
    :rtype: Layout

    Examples:

    .. code-block:: python

        # Create a 2D compact left-most layout with shape (4,4)
        layout = make_layout((4,4))  # compact left-most layout

        # Create a left-most layout with custom strides
        layout = make_layout((4,4), stride=(1,4))  # left-most layout with strides (1,4)

        # Create a layout for a 3D tensor
        layout = make_layout((32,16,8))  # left-most layout

        # Create a layout with custom strides
        layout = make_layout((2,2,2), stride=(4,1,2))  # layout with strides (4,1,2)

    Note:
        - If stride is not provided, a default compact left-most stride is computed based on the shape
        - The resulting layout maps logical coordinates to physical memory locations
        - The layout object can be used for tensor creation and memory access patterns
        - Strides can be used to implement:
            * Row-major vs column-major layouts
            * Padding and alignment
            * Blocked/tiled memory arrangements
            * Interleaved data formats
        - Stride is keyword only argument to improve readability, e.g.
            * make_layout((3,4), (1,4)) can be confusing with make_layout(((3,4), (1,4)))
            * make_layout((3,4), stride=(1,4)) is more readable
    """
    shape_val = _pack_shape(shape, loc=loc, ip=ip)
    if stride is not None:
        stride_val = _pack_stride(stride, loc=loc, ip=ip)
        layout_ty = _cute_ir.LayoutType.get(shape_val, stride_val)
    else:
        # No stride: the IR computes the compact left-most default.
        stride_val = None
        layout_ty = _cute_ir.LayoutType.get(shape_val)

    return _cute_ir.make_layout(
        layout_ty, shape=shape_val, stride=stride_val, loc=loc, ip=ip
    )


@dsl_user_op
def make_identity_layout(shape: Shape, *, loc=None, ip=None) -> Layout:
    """Create an identity layout with the given shape.

    An identity layout maps logical coordinates directly to themselves without any transformation.
    This is equivalent to a layout with stride (1@0,1@1,...,1@(N-1)).

    :param shape: The shape of the layout
    :type shape: Shape
    :param loc: Source location information, defaults to None
    :type loc: Optional[Location]
    :param ip: Insertion point for IR generation, defaults to None
    :type ip: Optional[InsertionPoint]
    :return: A new identity Layout object with the specified shape
    :rtype: Layout

    Examples:

    .. code-block:: python

        # Create a 2D identity layout with shape (4,4)
        layout = make_identity_layout((4,4))  # stride=(1@0,1@1)

        # Create a 3D identity layout
        layout = make_identity_layout((32,16,8))  # stride=(1@0,1@1,1@2)

    Note:
        - An identity layout is a special case where each coordinate maps to itself
        - Useful for direct coordinate mapping without any transformation
    """
    if not is_int_tuple(shape):
        raise TypeError(f"expects a shape input, got {type(shape)}")
    shape_val = _pack_shape(shape, loc=loc, ip=ip)
    return _cute_ir.make_identity_layout(shape_val, loc=loc, ip=ip)


@dsl_user_op
def make_ordered_layout(shape: Shape, order: Shape, *, loc=None, ip=None) -> Layout:
    """Create a layout with a specific ordering of dimensions.

    This function creates a layout where the dimensions are ordered according to the
    specified order parameter, allowing for custom dimension ordering in the layout.

    :param shape: The shape of the layout
    :type shape: Shape
    :param order: The ordering of dimensions
    :type order: Shape
    :param loc: Source location information, defaults to None
    :type loc: Optional[Location]
    :param ip: Insertion point for IR generation, defaults to None
    :type ip: Optional[InsertionPoint]
    :return: A new Layout object with the specified shape and dimension ordering
    :rtype: Layout

    Examples:

    .. code-block:: python

        # Create a row-major layout
        layout = make_ordered_layout((4,4), order=(1,0))

        # Create a column-major layout
        layout = make_ordered_layout((4,4), order=(0,1))  # stride=(1,4)

        # Create a layout with custom dimension ordering for a 3D tensor
        layout = make_ordered_layout((32,16,8), order=(2,0,1))  # stride=(128,1,16)

    Note:
        - The order parameter specifies the ordering of dimensions from fastest-varying to slowest-varying
        - For a 2D tensor, (0,1) creates a column-major layout, while (1,0) creates a row-major layout
        - The length of order must match the rank of the shape
    """
    shape_val = _pack_shape(shape, loc=loc, ip=ip)
    order_val = _pack_shape(order, loc=loc, ip=ip)
    return _cute_ir.make_ordered_layout(
        shape=shape_val, order=order_val, loc=loc, ip=ip
    )


@dsl_user_op
def make_composed_layout(
    inner, offset: IntTuple, outer: Layout, *, loc=None, ip=None
) -> ComposedLayout:
    """Create a composed layout by composing an inner transformation with an outer layout.

    As described in the CuTe whitepaper, a composed layout applies a sequence of transformations
    to coordinates. The composition is defined as (inner ∘ offset ∘ outer), where the operations
    are applied from right to left.

    :param inner: The inner transformation (can be a Layout or Swizzle)
    :type inner: Union[Layout, Swizzle]
    :param offset: An integral offset applied between transformations
    :type offset: IntTuple
    :param outer: The outer (right-most) layout that is applied first
    :type outer: Layout
    :param loc: Source location information, defaults to None
    :type loc: Optional[Location]
    :param ip: Insertion point for IR generation, defaults to None
    :type ip: Optional[InsertionPoint]
    :return: A new ComposedLayout representing the composition
    :rtype: ComposedLayout

    Examples:

    .. code-block:: python

        # Create a basic layout
        inner = make_layout(...)
        outer = make_layout((4,4), stride=(E(0), E(1)))

        # Create a composed layout with an offset
        composed = make_composed_layout(inner, (2,0), outer)

    Note:
        - The composition applies transformations in the order: outer → offset → inner
        - The stride divisibility condition must be satisfied for valid composition
        - Certain compositions (like Swizzle with scaled basis) are invalid and will raise errors
        - Composed layouts inherit many properties from the outer layout
    """
    if not isinstance(outer, Layout):
        raise TypeError(
            f"expects the outer (or right-most or effectively visible) layout to be an affine layout, but got {outer}"
        )
    # Swizzle cannot compose with a scaled-basis outer stride.
    if isinstance(inner, Swizzle) and has_scaled_basis(outer.stride):
        raise TypeError(f"invalid composition {inner} o {offset} o {outer}")
    offset_val = _pack_int_tuple(offset, loc=loc, ip=ip)
    return _cute_ir.make_composed_layout(inner, offset_val, outer, loc=loc, ip=ip)


@dsl_user_op
def cosize(
    a: Union[Layout, ComposedLayout, Tensor], mode: List[int] = [], *, loc=None, ip=None
):
    """Return size of codomain of layout or tensor. Return static value if type is static.

    :param a: Layout, ComposedLayout, or Tensor object
    :type a: Union[Layout, ComposedLayout, Tensor]
    :param mode: List of mode(s) for cosize calculation
    :type mode: List[int], optional
    :param loc: Location information for diagnostics, defaults to None
    :type loc: optional
    :param ip: Instruction pointer for diagnostics, defaults to None
    :type ip: optional
    :return: Static size of layout or tensor (fast fold) if static, or a dynamic Value
    :rtype: Union[int, Value]
    """
    if any(not is_static(m) for m in mode):
        raise ValueError(f"expects static mode, but got {mode}")

    if isinstance(a, _Tensor):
        a = a.value
    res = _cute_ir.cosize(a, mode=mode, loc=loc, ip=ip)
    return _unpack_x_tuple(res, loc=loc, ip=ip)


@dsl_user_op
def size_in_bytes(
    dtype: Type[Numeric], layout: Union[Layout, ComposedLayout], *, loc=None, ip=None
):
    """Calculate the size in bytes based on its data type and layout.

    :param dtype: The DSL numeric data type
    :type dtype: Type[Numeric]
    :param layout: The layout of the elements. If None, the function returns 0
    :type layout: Layout, optional
    :param loc: Location information for diagnostics, defaults to None
    :type loc: optional
    :param ip: Instruction pointer for diagnostics, defaults to None
    :type ip: optional
    :return: The total size in bytes. Returns 0 if the layout is None
    :rtype: int
    """
    if not isinstance(dtype, NumericMeta):
        raise TypeError(f"dtype must be a Numeric, but got {dtype}")

    if layout is None:
        return 0
    elif isinstance(layout, ComposedLayout):
        if not isinstance(layout.inner, Swizzle):
            raise TypeError(
                f"invalid composed layout {layout}, inner must be a Swizzle"
            )
        else:
            # Swizzle is a bijection on the codomain, so only the outer
            # layout's cosize matters for the byte count.
            return cosize(layout.outer, loc=loc, ip=ip) * dtype.width // 8
    else:
        return cosize(layout, loc=loc, ip=ip) * dtype.width // 8


@dsl_user_op
def coalesce(input, *, target_profile: Coord = None, loc=None, ip=None):
    """Coalesce a layout by flattening/merging modes via the IR op, optionally
    guided by ``target_profile`` (packed as a coordinate when provided).

    :param input: The layout (or IR value) to coalesce
    :param target_profile: Optional profile guiding the coalesced shape
    :type target_profile: Coord, optional
    :param loc: Source location for MLIR, defaults to None
    :param ip: Insertion point, defaults to None
    :return: The coalesced result from the IR op
    """
    if target_profile:
        profile_val = _pack_coord(target_profile, loc=loc, ip=ip)
        return _cute_ir.coalesce(input, target_profile=profile_val, loc=loc, ip=ip)
    else:
        return _cute_ir.coalesce(input, loc=loc, ip=ip)


@dsl_user_op
def crd2idx(coord: Coord, layout, *, loc=None, ip=None):
    """
    Convert a multi-dimensional coordinate into a value using the specified layout.

    This function computes the inner product of the flattened coordinate and stride:

        index = sum(flatten(coord)[i] * flatten(stride)[i] for i in range(len(coord)))

    :param coord: A tuple or list representing the multi-dimensional coordinate
        (e.g., (i, j) for a 2D layout).
    :type coord: Coord
    :param layout: A layout object that defines the memory storage layout, including shape and stride,
        used to compute the inner product.
    :type layout: Layout or ComposedLayout
    :param loc: Optional location information for IR diagnostics.
    :type loc: optional
    :param ip: Optional instruction pointer or context for underlying IR functions.
    :type ip: optional
    :returns: The result of applying the layout transformation to the provided coordinate.
    :rtype: Any type that the layout maps to

    Example:

    .. code-block:: python

        import cutlass.cute as cute

        @cute.jit
        def foo():
            L = cute.make_layout((5, 4), stride=(4, 1))
            idx = cute.crd2idx((2, 3), L)
            # Computed as: 2 * 4 + 3 = 11
            print(idx)

        foo()  # Expected output: 11
    """
    coord_val = _pack_coord(coord, loc=loc, ip=ip)
    # A plain shape (tuple/int) is promoted to a compact left-most layout.
    if isinstance(layout, (tuple, int)):
        layout = make_layout(layout, loc=loc, ip=ip)

    res = _cute_ir.crd2idx(coord_val, layout, loc=loc, ip=ip)
    return _unpack_x_tuple(res, loc=loc, ip=ip)


@dsl_user_op
def recast_layout(new_type_bits, old_type_bits, src_layout, *, loc=None, ip=None):
    """Recast ``src_layout`` from elements of ``old_type_bits`` bits to
    elements of ``new_type_bits`` bits via the IR op (thin wrapper — exact
    semantics defined by ``_cute_ir.recast_layout``)."""
    return _cute_ir.recast_layout(
        new_type_bits, old_type_bits, src_layout, loc=loc, ip=ip
    )


@dsl_user_op
def slice_and_offset(coord, src, *, loc=None, ip=None):
    """Return a pair of the sliced object and the linear offset of ``coord``:
    ``(slice_(src, coord), crd2idx(coord, src))``."""
    layout = slice_(src, coord, loc=loc, ip=ip)
    offset = crd2idx(coord, src, loc=loc, ip=ip)
    return layout, offset


@dsl_user_op
@lru_cache_ir()
def shape(
    input: Union[Shape, Tensor, Layout, Tile], *, mode=None, loc=None, ip=None
) -> Shape:
    """Returns the shape of a tensor, layout or tiler.

    For shapes, this function is identical to get.

    This function extracts the shape information from the input object. For tensors and layouts,
    it returns their internal shape property. For tilers, it unpacks the shape from the tile
    representation.

    :param input: The object to extract shape from
    :type input: Union[Tensor, Layout, Tile]
    :param mode: Optional mode selector to extract specific dimensions from the shape
    :type mode: Optional[int]
    :param loc: Source location for MLIR operation tracking
    :type loc: Optional[Location]
    :param ip: Insertion point for MLIR operation
    :type ip: Optional[InsertionPoint]
    :return: The shape of the input object, optionally filtered by mode
    :rtype: Shape

    Example:

    .. code-block:: python

        # Get shape of a layout
        l0 = cute.make_layout((2, 3, 4))
        s0 = cute.shape(l0)  # => (2, 3, 4)

        # Get shape of a hierarchical tiler
        l1 = cute.make_layout(1)
        s1 = cute.shape((l0, l1))  # => ((2, 3, 4), 1)

        # Get specific mode from a shape
        s2 = cute.shape(l0, mode=0)  # => 2
    """
    if is_int_tuple(input):
        return get(input, mode=mode)

    if isinstance(input, (Tensor, Layout)):
        shp = input.shape
    else:
        # Tiler path: pack and query the IR for the shape.
        val = _cute_ir.get_shape(_pack_tile(input, loc=loc, ip=ip))
        shp = _unpack_x_tuple(val, loc=loc, ip=ip)
    return get(shp, mode=mode)


#
# Pointer API
#


@dsl_user_op
def recast_ptr(
    ptr: Pointer,
    swizzle_=None,
    dtype: Optional[Type[Numeric]] = None,
    loc=None,
    ip=None,
) -> Pointer:
    """Recast a pointer to a new element dtype and/or swizzle, preserving its
    address space and alignment (thin wrapper over ``_cute_ir.recast_iter``)."""
    if dtype is not None:
        if not isclass(dtype) or not issubclass(dtype, Numeric):
            raise TypeError(f"dtype must be a type of Numeric, but got {dtype}")
        dtype = dtype.mlir_type

    value_type = ptr.type.value_type if dtype is None else dtype
    swizzle = swizzle_.type.attribute if swizzle_ is not None else None
    res_ty = _cute_ir.PtrType.get(
        value_type,
        AddressSpace(ptr.type.address_space),
        ptr.alignment,
        swizzle,
    )
    return _cute_ir.recast_iter(res_ty, ptr.value, loc=loc, ip=ip)


# NOTE(review): make_ptr continues past the end of this chunk; the visible
# portion is reproduced verbatim and not completed here.
@dsl_user_op
def make_ptr(
    dtype: Union[Type[Numeric], None],
    value,
    mem_space: AddressSpace = AddressSpace.generic,
    *,
    assumed_align=None,
    loc=None,
    ip=None,
) -> Pointer:
    if dtype is None or not isinstance(dtype, NumericMeta):
        raise TypeError(f"expects dtype to be a type of Numeric, but got {dtype}")

    if not is_integer(value):
        raise TypeError(f"expects integer value, but got {type(value)}")

    bytes_per_elt = max(1, dtype.width // 8)
    if assumed_align is None:
        assumed_align = bytes_per_elt

    if bytes_per_elt % assumed_align != 0 and assumed_align % bytes_per_elt != 0:
        raise ValueError(
            f"{bytes_per_elt=} is not a multiple of {assumed_align=} and vice versa."
+ ) + + value = Int32(value) if mem_space == AddressSpace.tmem else Int64(value) + aligned_ty = _cute_ir.ConstrainedIntType.get(assumed_align, type(value).width) + aligned_intptr = _cute_ir.assume(aligned_ty, value.ir_value(), loc=loc, ip=ip) + + ptr_ty = _cute_ir.PtrType.get( + T.i8() if dtype is None else dtype.mlir_type, mem_space, assumed_align + ) + return _cute_ir.inttoptr(ptr_ty, aligned_intptr, loc=loc, ip=ip) + + +# +# Tensor API +# + + +@dsl_user_op +def make_tensor( + iterator, layout: Union[Shape, Layout, ComposedLayout], *, loc=None, ip=None +) -> Tensor: + """Creates a tensor by composing an engine (iterator/pointer) with a layout. + + A tensor is defined as T = E ∘ L, where E is an engine (array, pointer, or counting iterator) + and L is a layout that maps logical coordinates to physical offsets. The tensor + evaluates coordinates by applying the layout mapping and dereferencing the engine + at the resulting offset. + + :param iterator: Engine component (pointer, iterator, or counting iterator) that provides + data access capabilities + :type iterator: Union[Pointer, IntTuple] + :param layout: Layout component that defines the mapping from logical coordinates to + physical offsets + :type layout: Union[Shape, Layout, ComposedLayout] + :param loc: Source location for MLIR operation tracking, defaults to None + :type loc: Optional[Location] + :param ip: Insertion point for MLIR operation, defaults to None + :type ip: Optional[InsertionPoint] + :return: A tensor object representing the composition E ∘ L + :rtype: Tensor + + :raises ValueError: If iterator type is not supported + + Examples: + + .. 
code-block:: python + + # Create a tensor with row-major layout + layout = make_layout((64, 128), stride=(128, 1)) + tensor = make_tensor(ptr, layout) + + # Create a tensor with hierarchical layout + layout = make_layout(((128, 8), (1, 4, 1)), stride=((32, 1), (0, 8, 4096))) + tensor = make_tensor(smem_ptr, layout) + + # Create a counting tensor + layout = make_layout(2, stride=16 * E(0)) + tensor = make_tensor(5, layout) + + Notes: + - The engine (iterator) must support random access operations + - Common engine types include raw pointers, arrays, and random-access iterators + - The layout defines both the shape (logical dimensions) and stride (physical mapping) + - Supports both direct coordinate evaluation T(c) and partial evaluation (slicing) + """ + if not isinstance(layout, (Layout, ComposedLayout)): + layout = make_layout(layout, loc=loc, ip=ip) + elif isinstance(layout, ComposedLayout) and layout.type.is_normal_layout: + layout = layout.outer + + ty = None + if is_integer(iterator) or isinstance(iterator, tuple): + iterator = _pack_int_tuple(iterator, loc=loc, ip=ip) + ty = _cute_ir.CountingTensorType.get(iterator.type, layout.type) + elif isinstance(iterator, Pointer): + iterator = iterator.value + ty = _cute_ir.MemRefType.get(iterator.type, layout.type) + else: + raise TypeError(f"unsupported iterator type, got {type(iterator)}") + + return _cute_ir.make_view(result=ty, iter=iterator, layout=layout, loc=loc, ip=ip) + + +@dsl_user_op +def make_identity_tensor(shape: Shape, *, loc=None, ip=None) -> Tensor: + """Creates an identity tensor with the given shape. + + An identity tensor maps each coordinate to itself, effectively creating a counting + sequence within the shape's bounds. This is useful for generating coordinate indices + or creating reference tensors for layout transformations. + + :param shape: The shape defining the tensor's dimensions. 
Can be a simple integer + sequence or a hierarchical structure ((m,n),(p,q)) + :type shape: Shape + :param loc: Source location for MLIR operation tracking, defaults to None + :type loc: Optional[Location] + :param ip: Insertion point for MLIR operation, defaults to None + :type ip: Optional[InsertionPoint] + :return: A tensor that maps each coordinate to itself + :rtype: Tensor + + Examples: + + .. code-block:: python + + # Create a simple 1D counting tensor + tensor = make_identity_tensor(6) # [0,1,2,3,4,5] + + # Create a 2D counting tensor + tensor = make_identity_tensor((3,2)) # [(0,0),(1,0),(2,0),(0,1),(1,1),(2,1)] + + # Create hierarchical counting tensor + tensor = make_identity_tensor(((2,1),3)) + # [((0,0),0),((1,0),0),((0,0),1),((1,0),1),((0,0),2),((1,0),2)] + + Notes: + - The shape parameter follows CuTe's IntTuple concept + - Coordinates are ordered colexicographically + - Useful for generating reference coordinates in layout transformations + """ + shape_val = _pack_shape(shape, loc=loc, ip=ip) + return _cute_ir.make_identity_tensor(shape_val, loc=loc, ip=ip) + + +@dsl_user_op +def make_fragment( + layout_or_shape: Union[Layout, Shape], + dtype: Type[Numeric], + *, + loc=None, + ip=None, +) -> Tensor: + if not issubclass(dtype, Numeric): + raise TypeError(f"value_type must be a type of Numeric, but got {type(dtype)}") + elem_ty = dtype.mlir_type + + # Alignment for register memory is useless(?), pick-up large enough number + # to allow .128 (> 16B) load store + alignment = 32 + layout = None + if not isinstance(layout_or_shape, Layout): + layout = make_layout(layout_or_shape, loc=loc, ip=ip) + else: + layout = layout_or_shape + + ptr_ty = _cute_ir.PtrType.get(elem_ty, AddressSpace.rmem, alignment) + res_ty = _cute_ir.MemRefType.get(ptr_ty, layout.type) + tensor = _cute_ir.memref_alloca(res_ty, layout=layout, loc=loc, ip=ip) + return _Tensor(tensor.value, dtype) + + +@overload +def make_fragment_like( + src: Tensor, dtype: Optional[Type[Numeric]], *, 
loc=None, ip=None +) -> Tensor: ... + + +@overload +def make_fragment_like(src: Layout, *, loc=None, ip=None) -> Layout: ... + + +@overload +def make_fragment_like(src: ComposedLayout, *, loc=None, ip=None) -> ComposedLayout: ... + + +@dsl_user_op +def make_fragment_like(src, dtype=None, *, loc=None, ip=None): + """Create tensor with a compact layout in the same shape as the source on stack. + + This function either creates a fragment tensor with compact layout in + same shape as the source layout or a new layout with the same shape as the source. + The strides of the new layout follow the order induced by the source's strides, with a + special handling of the 0th mode: it is always stride-1 and generated in column-major order + (LayoutLeft). + + :param src: The source layout or tensor whose shape will be matched + :type src: Union[Layout, ComposedLayout, Tensor] + :param dtype: The element type for the fragment tensor, defaults to None + :type dtype: Type[Numeric], optional + :param loc: Source location for MLIR operations, defaults to None + :type loc: Location, optional + :param ip: Insertion point for MLIR operations, defaults to None + :type ip: InsertionPoint, optional + + :return: A new layout or fragment tensor with matching shape + :rtype: Union[Layout, Tensor] + + **Examples** + + Creating a rmem tensor from a tensor: + + .. code-block:: python + + smem_tensor = cute.make_tensor(smem_ptr, layout) + frag_tensor = cute.make_fragment_like(smem_tensor, cutlass.Float32) + # frag_tensor will be a register-backed tensor with the same shape + + Creating a fragment with a different element type: + + .. code-block:: python + + tensor = cute.make_tensor(gmem_ptr, layout) + bool_frag = cute.make_fragment_like(tensor, cutlass.Boolean) + # bool_frag will be a register-backed tensor with Boolean elements + + **Notes** + + - When used with a Tensor, if a type is provided, it will create a new + fragment tensor with that element type. 
+ - For layouts with ScaledBasis strides, the function creates a fragment + from the shape only. + - This function is commonly used in GEMM and other tensor operations to + create register storage for intermediate results. + + """ + if isinstance(src, (Layout, ComposedLayout)): + new_layout = None + # Create base fragment layout + if isinstance(src, Layout) and has_scaled_basis(src.stride): + # For scaled basis strides, create fragment from shape only + new_layout = _cute_ir.make_fragment_like( + make_layout(src.shape), loc=loc, ip=ip + ) + else: + # Otherwise use full source layout + new_layout = _cute_ir.make_fragment_like(src, loc=loc, ip=ip) + if dtype is not None: + # call make_fragment to convert layout to tensor + return make_fragment(new_layout, dtype, loc=loc, ip=ip) + else: + return new_layout + elif isinstance(src, Tensor): + if isinstance(src.type, _cute_ir.CountingTensorType): + if dtype is None: + raise ValueError( + "dtype must be provided when src is a coordinate tensor" + ) + + new_layout = _cute_ir.make_fragment_like( + make_layout(src.shape), loc=loc, ip=ip + ) + return make_fragment(new_layout, dtype, loc=loc, ip=ip) + else: + if dtype is None: + ty = src.element_type.mlir_type + else: + ty = dtype.mlir_type + new_tensor = _cute_ir.make_fragment_like( + src.value, elem_type=ty, loc=loc, ip=ip + ) + return _Tensor( + new_tensor.value, dtype if dtype is not None else src.element_type + ) + else: + raise TypeError( + f"src must be a Layout or ComposedLayout or tensor, got {type(src)}" + ) + + +@dsl_user_op +def recast_tensor( + src: Tensor, dtype: Type[Numeric], swizzle_=None, *, loc=None, ip=None +): + if not isclass(dtype) or not issubclass(dtype, Numeric): + raise TypeError(f"dtype must be a type of Numeric, but got {dtype}") + + if dtype is Boolean: + dst_width = 8 + else: + dst_width = dtype.width + + if src.element_type is Boolean: + src_width = 8 + else: + src_width = src.element_type.width + + src_iter = recast_ptr(src.iterator, 
dtype=dtype, loc=loc, ip=ip) + src_layout = recast_layout(dst_width, src_width, src.layout, loc=loc, ip=ip) + return make_tensor(src_iter, src_layout, loc=loc, ip=ip) + + +@dsl_user_op +def domain_offset(coord: Coord, tensor: Tensor, *, loc=None, ip=None) -> Tensor: + offset = crd2idx(coord, tensor.layout, loc=loc, ip=ip) + if isinstance(tensor.iterator, Pointer): + return make_tensor(tensor.iterator + offset, tensor.layout) + elif is_integer(tensor.iterator) or isinstance(tensor.iterator, tuple): + new_iter = _cute_ir.add_offset( + _pack_int_tuple(tensor.iterator), _pack_int_tuple(offset) + ) + return make_tensor(_unpack_x_tuple(new_iter), tensor.layout) + else: + raise ValueError(f"unsupported tensor for domain_offset, got {tensor}") + + +# +# Layout algebra +# + + +@overload +def composition( + lhs: Layout, rhs: Union[Layout, Shape, Tile], *, loc=None, ip=None +) -> Layout: ... + + +@overload +def composition( + lhs: Tensor, rhs: Union[Layout, Shape, Tile], *, loc=None, ip=None +) -> Tensor: ... + + +@dsl_user_op +def composition(lhs, rhs: Union[Layout, Shape, Tile], *, loc=None, ip=None): + """ + Compose two layout representations using the CuTe layout algebra. + + Compose a left-hand layout (or tensor) with a right-hand operand into a new layout R, such that + for every coordinate c in the domain of the right-hand operand, the composed layout satisfies: + + R(c) = A(B(c)) + + where A is the left-hand operand provided as ``lhs`` and B is the right-hand operand provided as + ``rhs``. In this formulation, B defines the coordinate domain while A applies its transformation to + B's output, and the resulting layout R inherits the stride and shape adjustments from A. + + Satisfies: + cute.shape(cute.composition(lhs, rhs)) is compatible with cute.shape(rhs) + + :param lhs: The left-hand operand representing the transformation to be applied. + :type lhs: Layout or Tensor + :param rhs: The right-hand operand defining the coordinate domain. 
If provided as an int or tuple, + it will be converted to a tile layout. + :type rhs: Layout, Shape, or Tile, or int or tuple + :param loc: Optional location information for IR diagnostics. + :type loc: optional + :param ip: Optional instruction pointer or context for underlying IR functions. + :type ip: optional + :returns: A new composed layout R, such that for all coordinates c in the domain of ``rhs``, + R(c) = lhs(rhs(c)). + :rtype: Layout or Tensor + + Example: + + .. code-block:: python + + import cutlass.cute as cute + @cute.jit + def foo(): + # Create a layout that maps (i,j) to i*4 + j + L1 = cute.make_layout((2, 3), stride=(4, 1)) + # Create a layout that maps (i,j) to i*3 + j + L2 = cute.make_layout((3, 4), stride=(3, 1)) + # Compose L1 and L2 + L3 = cute.composition(L1, L2) + # L3 now maps coordinates through L2 then L1 + """ + rhs_val = rhs + if not isinstance(rhs, Layout) and isinstance(rhs, (int, tuple)): + rhs_val = _pack_tile(rhs, loc=loc, ip=ip) + if isinstance(lhs, _Tensor): + lhs = lhs.value + return _cute_ir.composition(lhs, rhs_val, loc=loc, ip=ip) + + +@dsl_user_op +def complement( + input: Layout, cotarget: Union[Layout, Shape], *, loc=None, ip=None +) -> Layout: + """ + Compute the complement layout of the input layout with respect to the cotarget. + + The complement of a layout A with respect to cotarget n is a layout A* such that + for every k in Z_n and c in the domain of A, there exists a unique c* in the domain + of A* where k = A(c) + A*(c*). + + This operation is useful for creating layouts that partition a space in complementary ways, + such as row and column layouts that together cover a matrix. 
+ + :param input: The layout to compute the complement of + :type input: Layout + :param cotarget: The target layout or shape that defines the codomain + :type cotarget: Union[Layout, Shape] + :param loc: Optional location information for IR diagnostics + :type loc: optional + :param ip: Optional instruction pointer or context for underlying IR functions + :type ip: optional + :returns: The complement layout + :rtype: Layout + + Example: + + .. code-block:: python + + import cutlass.cute as cute + @cute.jit + def foo(): + # Create a right-major layout for a 4x4 matrix + row_layout = cute.make_layout((4, 4), stride=(4, 1)) + # Create a left-major layout that complements the row layout + col_layout = cute.complement(row_layout, 16) + # The two layouts are complementary under 16 + """ + if isinstance(cotarget, Layout): + return _cute_ir.complement(input, cotarget=cotarget, loc=loc, ip=ip) + else: + cotarget_val = _pack_shape(cotarget, loc=loc, ip=ip) + return _cute_ir.complement(input, cotarget=cotarget_val, loc=loc, ip=ip) + + +@dsl_user_op +def right_inverse(input: Layout, *, loc=None, ip=None) -> Layout: + if not isinstance(input, Layout): + raise TypeError(f"expects input of type Layout, but got {type(Layout)}") + return _cute_ir.right_inverse(input=input, loc=loc, ip=ip) + + +@dsl_user_op +def left_inverse(input: Layout, *, loc=None, ip=None) -> Layout: + if not isinstance(input, Layout): + raise TypeError(f"expects input of type Layout, but got {type(Layout)}") + return _cute_ir.left_inverse(input=input, loc=loc, ip=ip) + + +@overload +def logical_product(block: Layout, tiler: Layout, *, loc=None, ip=None) -> Layout: ... +@overload +def logical_product( + block: ComposedLayout, tiler: Layout, *, loc=None, ip=None +) -> ComposedLayout: ... 
# --- Layout product family ----------------------------------------------
# Each *_product function below is a thin Python binding over the matching
# CuTe IR operation.  The @overload stub pairs only refine the static
# types seen by tooling (Layout in -> Layout out, ComposedLayout in ->
# ComposedLayout out); the @dsl_user_op implementation forwards directly
# to the IR op.
@dsl_user_op
def logical_product(block, tiler: Layout, *, loc=None, ip=None):
    """Logical product of ``block`` with ``tiler``.

    Thin wrapper that forwards to the ``_cute_ir.logical_product`` op.

    :param block: The layout operand being replicated
    :type block: Union[Layout, ComposedLayout]
    :param tiler: The tiler layout operand
    :type tiler: Layout
    :param loc: Location information for diagnostics, defaults to None
    :param ip: Insertion point for IR emission, defaults to None
    :return: The product layout produced by the IR op
    """
    return _cute_ir.logical_product(input=block, tiler=tiler, loc=loc, ip=ip)


@overload
def zipped_product(block: Layout, tiler: Layout, *, loc=None, ip=None) -> Layout: ...
@overload
def zipped_product(
    block: ComposedLayout, tiler: Layout, *, loc=None, ip=None
) -> ComposedLayout: ...


@dsl_user_op
def zipped_product(block, tiler: Layout, *, loc=None, ip=None):
    """Zipped product of ``block`` with ``tiler``.

    Thin wrapper that forwards to the ``_cute_ir.zipped_product`` op.
    """
    return _cute_ir.zipped_product(input=block, tiler=tiler, loc=loc, ip=ip)


@overload
def tiled_product(block: Layout, tiler: Layout, *, loc=None, ip=None) -> Layout: ...
@overload
def tiled_product(
    block: ComposedLayout, tiler: Layout, *, loc=None, ip=None
) -> ComposedLayout: ...


@dsl_user_op
def tiled_product(block, tiler: Layout, *, loc=None, ip=None):
    """Tiled product of ``block`` with ``tiler``.

    Thin wrapper that forwards to the ``_cute_ir.tiled_product`` op.
    """
    return _cute_ir.tiled_product(input=block, tiler=tiler, loc=loc, ip=ip)


@overload
def flat_product(block: Layout, tiler: Layout, *, loc=None, ip=None) -> Layout: ...
@overload
def flat_product(
    block: ComposedLayout, tiler: Layout, *, loc=None, ip=None
) -> ComposedLayout: ...


@dsl_user_op
def flat_product(block, tiler: Layout, *, loc=None, ip=None):
    """Flat product of ``block`` with ``tiler``.

    Thin wrapper that forwards to the ``_cute_ir.flat_product`` op.
    """
    return _cute_ir.flat_product(input=block, tiler=tiler, loc=loc, ip=ip)


@overload
def raked_product(block: Layout, tiler: Layout, *, loc=None, ip=None) -> Layout: ...
@overload
def raked_product(
    block: ComposedLayout, tiler: Layout, *, loc=None, ip=None
) -> ComposedLayout: ...


@dsl_user_op
def raked_product(block, tiler: Layout, *, loc=None, ip=None):
    """Raked product of ``block`` with ``tiler``.

    Thin wrapper that forwards to the ``_cute_ir.raked_product`` op.
    """
    return _cute_ir.raked_product(input=block, tiler=tiler, loc=loc, ip=ip)


@overload
def blocked_product(block: Layout, tiler: Layout, *, loc=None, ip=None) -> Layout: ...
@overload
def blocked_product(
    block: ComposedLayout, tiler: Layout, *, loc=None, ip=None
) -> ComposedLayout: ...
@dsl_user_op
def blocked_product(block, tiler: Layout, *, loc=None, ip=None):
    """Blocked product of ``block`` with ``tiler``.

    Thin wrapper that forwards to the ``_cute_ir.blocked_product`` op.

    :param block: The layout operand being replicated
    :type block: Union[Layout, ComposedLayout]
    :param tiler: The tiler layout operand
    :type tiler: Layout
    :param loc: Location information for diagnostics, defaults to None
    :param ip: Insertion point for IR emission, defaults to None
    :return: The product layout produced by the IR op
    """
    return _cute_ir.blocked_product(input=block, tiler=tiler, loc=loc, ip=ip)


# --- Layout divide family -----------------------------------------------
# Each *_divide function below accepts either a Layout or a Tensor target
# and a Tiler which may be given as a plain Python tuple.  Tensors are
# unwrapped to their underlying IR value and tuple tilers are packed into
# an IR tile value before forwarding to the matching CuTe IR op.  The
# @overload stub pairs only refine static types (Layout in -> Layout out,
# Tensor in -> Tensor out).
@overload
def logical_divide(target: Layout, tiler: Tiler, *, loc=None, ip=None) -> Layout: ...
@overload
def logical_divide(target: Tensor, tiler: Tiler, *, loc=None, ip=None) -> Tensor: ...


@dsl_user_op
def logical_divide(target, tiler: Tiler, *, loc=None, ip=None):
    """Logical divide of ``target`` by ``tiler``.

    Thin wrapper that forwards to the ``_cute_ir.logical_divide`` op.

    :param target: The layout or tensor being divided
    :type target: Union[Layout, Tensor]
    :param tiler: The tiler; a tuple is packed into an IR tile first
    :type tiler: Tiler
    :param loc: Location information for diagnostics, defaults to None
    :param ip: Insertion point for IR emission, defaults to None
    :return: The divided layout or tensor produced by the IR op
    """
    if isinstance(target, _Tensor):
        # Unwrap the Python-level tensor to its underlying IR value
        target = target.value
    if isinstance(tiler, tuple):
        # Pack a plain tuple tiler into an IR tile value
        tiler = _pack_tile(tiler, loc=loc, ip=ip)
    return _cute_ir.logical_divide(input=target, tiler=tiler, loc=loc, ip=ip)


@overload
def zipped_divide(target: Layout, tiler: Tiler, *, loc=None, ip=None) -> Layout: ...
@overload
def zipped_divide(target: Tensor, tiler: Tiler, *, loc=None, ip=None) -> Tensor: ...


@dsl_user_op
def zipped_divide(target, tiler: Tiler, *, loc=None, ip=None):
    """Zipped divide of ``target`` by ``tiler``.

    Thin wrapper that forwards to the ``_cute_ir.zipped_divide`` op.
    Accepts the same target/tiler forms as :func:`logical_divide`.
    """
    if isinstance(target, _Tensor):
        # Unwrap the Python-level tensor to its underlying IR value
        target = target.value
    if isinstance(tiler, tuple):
        # Pack a plain tuple tiler into an IR tile value
        tiler = _pack_tile(tiler, loc=loc, ip=ip)
    return _cute_ir.zipped_divide(input=target, tiler=tiler, loc=loc, ip=ip)


@overload
def tiled_divide(target: Layout, tiler: Tiler, *, loc=None, ip=None) -> Layout: ...
@overload
def tiled_divide(target: Tensor, tiler: Tiler, *, loc=None, ip=None) -> Tensor: ...


@dsl_user_op
def tiled_divide(target, tiler: Tiler, *, loc=None, ip=None):
    """Tiled divide of ``target`` by ``tiler``.

    Thin wrapper that forwards to the ``_cute_ir.tiled_divide`` op.
    Accepts the same target/tiler forms as :func:`logical_divide`.
    """
    if isinstance(target, _Tensor):
        # Unwrap the Python-level tensor to its underlying IR value
        target = target.value
    if isinstance(tiler, tuple):
        # Pack a plain tuple tiler into an IR tile value
        tiler = _pack_tile(tiler, loc=loc, ip=ip)
    return _cute_ir.tiled_divide(input=target, tiler=tiler, loc=loc, ip=ip)


@overload
def flat_divide(target: Layout, tiler: Tiler, *, loc=None, ip=None) -> Layout: ...
@overload
def flat_divide(target: Tensor, tiler: Tiler, *, loc=None, ip=None) -> Tensor: ...
+ + +@dsl_user_op +def flat_divide(target, tiler: Tiler, *, loc=None, ip=None): + if isinstance(target, _Tensor): + target = target.value + if isinstance(tiler, tuple): + tiler = _pack_tile(tiler, loc=loc, ip=ip) + return _cute_ir.flat_divide(input=target, tiler=tiler, loc=loc, ip=ip) + + +# +# Higher-level utilties +# + + +@dsl_user_op +def max_common_layout( + a: Union[Layout, Tensor], b: Union[Layout, Tensor], *, loc=None, ip=None +) -> Layout: + a_layout = a.layout if isinstance(a, _Tensor) else a + b_layout = b.layout if isinstance(b, _Tensor) else b + + inv_b = right_inverse(b_layout, loc=loc, ip=ip) + common = coalesce(composition(a_layout, inv_b, loc=loc, ip=ip), loc=loc, ip=ip) + + # some_ir_value == 1 generates a new IR Value which evaluates to True! + s = get(common.shape, mode=[0], loc=loc, ip=ip) + d = get(common.stride, mode=[0], loc=loc, ip=ip) + # Keep only the static identity component of the common layout + if isinstance(s, int) and isinstance(d, int) and d == 1: + # Truncate to the size of the contiguous vector (static stride-1 mode) + return composition(inv_b, get(common, mode=[0], loc=loc, ip=ip), loc=loc, ip=ip) + else: + return make_layout(1, stride=0, loc=loc, ip=ip) + + +@dsl_user_op +def max_common_vector( + a: Union[Layout, Tensor], b: Union[Layout, Tensor], *, loc=None, ip=None +) -> int: + a_layout = a.layout if isinstance(a, _Tensor) else a + b_layout = b.layout if isinstance(b, _Tensor) else b + + inv_b = right_inverse(b_layout, loc=loc, ip=ip) + common = coalesce(composition(a_layout, inv_b, loc=loc, ip=ip), loc=loc, ip=ip) + + # Keep only the static identity component of the common layout + if ( + is_static(get(common.shape, mode=[0], loc=loc, ip=ip)) + and get(common.stride, mode=[0], loc=loc, ip=ip) == 1 + ): + # Truncate to the size of the contiguous vector (static stride-1 mode) + return get(common.shape, mode=[0], loc=loc, ip=ip) + else: + return 1 + + +@dsl_user_op +def tile_to_shape( + atom: Union[Layout, ComposedLayout], + 
trg_shape: Shape, + order: Shape, + *, + loc=None, + ip=None, +) -> Union[Layout, ComposedLayout]: + trg_shape = _pack_shape(shape(trg_shape), loc=loc, ip=ip) + order = _pack_shape(order, loc=loc, ip=ip) + return _cute_ir.tile_to_shape(atom, trg_shape, order, loc=loc, ip=ip) + + +@dsl_user_op +def local_partition( + target: Tensor, + tiler: Union[Layout, Shape], + index, + proj: XTuple = 1, + *, + loc=None, + ip=None, +) -> Tensor: + return _cute_ir.local_partition( + input=target.value, tiler=dice(tiler, proj), index=index, loc=loc, ip=ip + ) + + +@dsl_user_op +def local_tile( + input: Tensor, + tiler: Union[Layout, Shape], + coord: Coord, + proj: XTuple = None, + *, + loc=None, + ip=None, +) -> Tensor: + tiler_val = _pack_shape(tiler, loc=loc, ip=ip) + coord_val = _pack_coord(coord, loc=loc, ip=ip) + if proj is not None: + if not isinstance(proj, tuple): + raise TypeError(f"Expects tuple for proj, but got {type(proj)}") + proj_val = _pack_coord(proj, loc=loc, ip=ip) + proj = proj_val.type.attribute + + return _cute_ir.local_tile( + input=input.value, + tile=tiler_val, + static_tile=None, + coord=coord_val, + static_coord=None, + proj=proj, + loc=loc, + ip=ip, + ) + + +@dsl_user_op +def make_layout_image_mask( + lay: Layout, coord: Coord, mode: int, *, loc=None, ip=None +) -> Int16: + """ + Makes a 16-bit integer mask of the image of a layout sliced at a given mode + and accounting for the offset given by the input coordinate for the other modes. 
+ """ + if not is_static(lay): + raise ValueError( + f"make_layout_image_mask requires the layout to be static, but got {pretty_str(lay)}" + ) + r = rank(lay) + if rank(coord) != r: + raise ValueError( + f"the rank of the coordinate must be equal to the one of the layout, but got {pretty_str(coord)}" + ) + if mode > r or mode < 0: + raise ValueError(f"expects `mode` to be in [0,rank(lay)), but got {mode}") + # Given that we require the layout to be static, we can check that the mask fits in 16 bits + # This might be too conservative but safe + if cosize(lay) > 16: + raise ValueError("the mask may not fit into a 16-bit integer") + + # Replace the mode to keep with _ in the coordinate + slicer = tuple(None if idx == mode else x for idx, x in enumerate(coord)) + # Slice the layout with the slicer above and keep track of the offset + sliced_lay, offset = slice_and_offset(slicer, lay, loc=loc, ip=ip) + # Given that we replace only one mode with _, the rank of the slice should be 1 + assert rank(sliced_lay) == 1 + + # Create the mask of the image + mcast_mask = Int16(0) + for i in range(size(sliced_lay)): + mcast_mask = mcast_mask | (1 << sliced_lay(i)) + mcast_mask <<= offset + return Int16(mcast_mask) + + +#################################################################################################### +# +# Atom +# +#################################################################################################### + + +class Op(ABC): + """ + Operation abstract base class. + """ + + pass + + +class MmaOp(Op): + """ + MMA Operation abstract base class. + """ + + @abstractmethod + def _make_trait(self, *, loc=None, ip=None, **kwargs): + pass + + +class CopyOp(Op): + """ + Copy Operation abstract base class. + """ + + @abstractmethod + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ): + pass + + +class Trait(ABC): + """ + Trait abstract base class. 
+ + Traits are internal-only classes used by Atoms that wrap the underlying IR Value. The Python + user should only interact with Ops and Atoms. + """ + + def __init__(self, value: ir.Value) -> None: + self.value = value + + def __extract_mlir_values__(self): + return [self.value] + + def __new_from_mlir_values__(self, values): + return self.__class__(values[0]) + + def set(self, field, value, *, loc=None, ip=None) -> None: + raise NotImplementedError( + "set not implemented, the requesting Atom has likely no runtime state" + ) + + def unpack(self, *, loc=None, ip=None, **kwargs) -> ir.Value: + return self.value + + +class Atom(ABC): + """ + Atom base class. + + An Atom is the composition of + + - a MMA or Copy Operation; + - an internal MMA or Copy Trait. + + An Operation is a pure Python class that is used to model a specific MMA or Copy instruction. + The Trait wraps the underlying IR Value and provides access to the metadata of the instruction + encoded using CuTe Layouts. When the Trait can be constructed straighforwardly from an + Operation, the ``make_mma_atom`` or ``make_copy_atom`` API should be used. There are cases where + constructing the metadata is not trivial and requires more information, for example to determine + the number of bytes copied per TMA instruction ("the TMA vector length"). In such cases, + dedicated helper functions are provided with an appropriate API such that the Atom is + constructed internally in an optimal fashion for the user. 
+ """ + + def __init__(self, op: Op, trait: Trait) -> None: + self._op = op + self._trait = trait + + def __extract_mlir_values__(self): + return extract_mlir_values(self._trait) + + def __new_from_mlir_values__(self, values): + return self.__class__(self.op, new_from_mlir_values(self._trait, values)) + + @property + def op(self) -> Op: + return self._op + + @property + def type(self): + return self._trait.value.type + + @dsl_user_op + def set(self, modifier, value, *, loc=None, ip=None) -> None: + """ + Sets runtime fields of the Atom. + + Some Atoms have runtime state, for example a tcgen05 MMA Atom + + + .. code-block:: python + + tiled_mma = cute.make_tiled_mma(some_tcgen05_mma_op) + tiled_mma.set(cute.nvgpu.tcgen05.Field.ACCUMULATE, True) + + The ``set`` method provides a way to the user to modify such runtime state. Modifiable + fields are provided by arch-specific enumerations, for example ``tcgen05.Field``. The Atom + instance internally validates the field as well as the value provided by the user to set + the field to. + """ + self._trait.set(modifier, value, loc=loc, ip=ip) + + def _unpack(self, *, loc=None, ip=None, **kwargs) -> ir.Value: + return self._trait.unpack(loc=loc, ip=ip, **kwargs) + + +#################################################################################################### +# +# MMA Atoms, TiledMma, and ThrMma +# +#################################################################################################### + + +class MmaAtom(Atom): + """ + The MMA Atom class. 
+ """ + + def __str__(self) -> str: + res = "MMA Atom\n" + res += " ThrID: " + pretty_str(self.thr_id) + "\n" + res += " Shape MNK: " + pretty_str(self.shape_mnk) + "\n" + res += " TV Layout A: " + pretty_str(self.tv_layout_A) + "\n" + res += " TV Layout B: " + pretty_str(self.tv_layout_B) + "\n" + res += " TV Layout C: " + pretty_str(self.tv_layout_C) + return res + + # + # Properties + # + + @property + def thr_id(self) -> Layout: + return _cute_ir.static(self._trait.value.type.thr_id) + + @property + def shape_mnk(self) -> Shape: + return _unpack_x_tuple(self._trait.value.type.shape_mnk) + + @property + def tv_layout_A(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_a_tv) + + @property + def tv_layout_B(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_b_tv) + + @property + def tv_layout_C(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_c_tv) + + # + # make_fragment + # + + @dsl_user_op + def make_fragment_A(self, input, *, loc=None, ip=None): + # input could be memref/shape/layout for tmem based fragment + if isinstance(input, _Tensor): + input = input.value + if isinstance(input, tuple): + input = _pack_shape(input, loc=loc, ip=ip) + return _cute_ir.mma_make_fragment( + _cute_ir.MmaOperand.A, + self._trait.value, + input, + loc=loc, + ip=ip, + ) + + @dsl_user_op + def make_fragment_B(self, input, *, loc=None, ip=None): + if isinstance(input, _Tensor): + input = input.value + return _cute_ir.mma_make_fragment( + _cute_ir.MmaOperand.B, + self._trait.value, + input, + loc=loc, + ip=ip, + ) + + @dsl_user_op + def make_fragment_C(self, input, *, loc=None, ip=None): + # input could be memref/shape/layout for tmem based fragment + if isinstance(input, _Tensor): + input = input.value + if isinstance(input, tuple): + input = _pack_shape(input, loc=loc, ip=ip) + return _cute_ir.mma_make_fragment( + _cute_ir.MmaOperand.C, + self._trait.value, + input, + loc=loc, + ip=ip, + ) + + +class 
TiledMma(MmaAtom): + """ + The tiled MMA class. + """ + + def __str__(self) -> str: + res = "Tiled MMA\n" + res += " Thr Layout VMNK: " + pretty_str(self.thr_layout_vmnk) + "\n" + res += " Permutation MNK: " + pretty_str(self.permutation_mnk) + "\n" + res += "MMA Atom\n" + res += " ThrID: " + pretty_str(self.thr_id) + "\n" + res += " Shape MNK: " + pretty_str(self.shape_mnk) + "\n" + res += " TV Layout A: " + pretty_str(self.tv_layout_A) + "\n" + res += " TV Layout B: " + pretty_str(self.tv_layout_B) + "\n" + res += " TV Layout C: " + pretty_str(self.tv_layout_C) + return res + + # + # Properties + # + + @property + def tv_layout_A_tiled(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_a_tv_tiled) + + @property + def tv_layout_B_tiled(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_b_tv_tiled) + + @property + def tv_layout_C_tiled(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_c_tv_tiled) + + @property + def permutation_mnk(self) -> Tile: + return _unpack_x_tuple(self._trait.value.type.permutation_mnk) + + @property + def thr_layout_vmnk(self) -> Layout: + return _cute_ir.static(self._trait.value.type.thr_layout_vmnk) + + @property + def size(self) -> int: + return self._trait.value.type.size + + # + # Tiler + # + + def get_tile_size(self, mode_idx: int) -> Shape: + assert (mode_idx >= 0) and (mode_idx < 3) + perm_tile = self.permutation_mnk[mode_idx] + if perm_tile is None: + thr_layout_vmnk = self.thr_layout_vmnk + atom_shape_mnk = self.shape_mnk + return size(atom_shape_mnk, mode=[mode_idx]) * size( + thr_layout_vmnk, mode=[mode_idx + 1] + ) + else: + return size(perm_tile) + + # + # get_slice + # + + def get_slice(self, thr_idx: Union[int, Int32]) -> "ThrMma": + return ThrMma(self.op, self._trait, thr_idx) + + # + # partition_shape + # + + def _partition_shape(self, operand_id, shape, *, loc=None, ip=None): + shape = _pack_shape(shape, loc=loc, ip=ip) + return _unpack_x_tuple( + 
_cute_ir.tiled_mma_partition_shape( + operand_id, self._trait.value, shape, loc=loc, ip=ip + ), + loc=loc, + ip=ip, + ) + + @dsl_user_op + def partition_shape_A(self, shape_mk, *, loc=None, ip=None): + return self._partition_shape(_cute_ir.MmaOperand.A, shape_mk, loc=loc, ip=ip) + + @dsl_user_op + def partition_shape_B(self, shape_nk, *, loc=None, ip=None): + return self._partition_shape(_cute_ir.MmaOperand.B, shape_nk, loc=loc, ip=ip) + + @dsl_user_op + def partition_shape_C(self, shape_mn, *, loc=None, ip=None): + return self._partition_shape(_cute_ir.MmaOperand.C, shape_mn, loc=loc, ip=ip) + + # + # _thrfrg + # + + @overload + def _thrfrg(self, operand_id, input: Layout, *, loc=None, ip=None) -> Layout: ... + + @overload + def _thrfrg(self, operand_id, input: Tensor, *, loc=None, ip=None) -> Tensor: ... + + def _thrfrg(self, operand_id, input, *, loc=None, ip=None) -> Union[Tensor, Layout]: + if isinstance(input, Tensor): + return make_tensor( + input.iterator, + self._thrfrg(operand_id, input.layout, loc=loc, ip=ip), + ) + elif isinstance(input, Layout): + if not is_static(input.type): + raise ValueError(f"Expects a static layout but got {input.type}") + return _cute_ir.static( + self._trait.value.type.thrfrg(operand_id, input), loc=loc, ip=ip + ) + + raise ValueError( + f"Expects a layout or a tensor as input but got {type(input)=}" + ) + + def _thrfrg_A( + self, input: Union[Layout, Tensor], *, loc=None, ip=None + ) -> Union[Layout, Tensor]: + return self._thrfrg(_cute_ir.MmaOperand.A, input, loc=loc, ip=ip) + + def _thrfrg_B( + self, input: Union[Layout, Tensor], *, loc=None, ip=None + ) -> Union[Layout, Tensor]: + return self._thrfrg(_cute_ir.MmaOperand.B, input, loc=loc, ip=ip) + + def _thrfrg_C( + self, input: Union[Layout, Tensor], *, loc=None, ip=None + ) -> Union[Layout, Tensor]: + return self._thrfrg(_cute_ir.MmaOperand.C, input, loc=loc, ip=ip) + + +class ThrMma(TiledMma): + """ + The thread MMA class for modeling a thread-slice of a tiled MMA. 
+ """ + + def __init__(self, op: Op, trait: Trait, thr_idx: Union[int, Int32]) -> None: + super().__init__(op, trait) + self._thr_idx = thr_idx + + def __new_from_mlir_values__(self, values): + return self.__class__( + self.op, new_from_mlir_values(self._trait, values), self.thr_idx + ) + + @property + def thr_idx(self): + return self._thr_idx + + @dsl_user_op + def partition_A(self, input_mk: Tensor, *, loc=None, ip=None) -> Tensor: + thr_idx = _pack_coord(self.thr_idx, loc=loc, ip=ip) + return _cute_ir.tiled_mma_partition( + _cute_ir.MmaOperand.A, + self._trait.value, + input_mk.value, + thr_idx, + loc=loc, + ip=ip, + ) + + @dsl_user_op + def partition_B(self, input_nk: Tensor, *, loc=None, ip=None) -> Tensor: + thr_idx = _pack_coord(self.thr_idx, loc=loc, ip=ip) + return _cute_ir.tiled_mma_partition( + _cute_ir.MmaOperand.B, + self._trait.value, + input_nk.value, + thr_idx, + loc=loc, + ip=ip, + ) + + @dsl_user_op + def partition_C(self, input_mn: Tensor, *, loc=None, ip=None) -> Tensor: + thr_idx = _pack_coord(self.thr_idx, loc=loc, ip=ip) + return _cute_ir.tiled_mma_partition( + _cute_ir.MmaOperand.C, + self._trait.value, + input_mn.value, + thr_idx, + loc=loc, + ip=ip, + ) + + +@dsl_user_op +def make_mma_atom(op: MmaOp, *, loc=None, ip=None, **kwargs) -> MmaAtom: + """ + Makes an MMA Atom from an MMA Operation. + + This function creates an MMA Atom from a given MMA Operation. Arbitrary kw arguments can be + provided for Op-specific additional parameters. They are not used as of today. + + :param op: The MMA Operation to construct an Atom for + :type op: MmaOp + :return: The MMA Atom + :rtype: MmaAtom + """ + trait = op._make_trait(loc=loc, ip=ip, **kwargs) + return MmaAtom(op, trait) + + +@dsl_user_op +def make_tiled_mma( + op_or_atom: Union[Op, MmaAtom], + atom_layout_mnk=(1, 1, 1), + permutation_mnk=None, + *, + loc=None, + ip=None, + **kwargs, +) -> TiledMma: + """ + Makes a tiled MMA from an MMA Operation or an MMA Atom. 
+ + :param op_or_atom: The MMA Operation or Atom + :type op_or_atom: Union[Op, MmaAtom] + :param atom_layout_mnk: A Layout describing the tiling of Atom across threads + :type atom_layout_mnk: Layout + :param permutation_mnk: A permutation Tiler describing the tiling of Atom across values including any permutation of such tiling + :type permutation_mnk: Tiler + :return: The resulting tiled MMA + :rtype: TiledMma + """ + if isinstance(op_or_atom, Op): + op = op_or_atom + atom = make_mma_atom(op_or_atom, loc=loc, ip=ip, **kwargs) + elif isinstance(op_or_atom, MmaAtom): + op = op_or_atom.op + atom = op_or_atom + else: + raise TypeError( + f"expected an MMA Op or Atom, but got an instance of {type(op_or_atom)}" + ) + if isinstance(atom_layout_mnk, tuple): + atom_layout_mnk = make_layout(atom_layout_mnk, loc=loc, ip=ip) + if rank(atom_layout_mnk) != 3: + raise ValueError(f"expects rank-3 MNK atom layout, but got {atom_layout_mnk}") + permutation_mnk_ty = None + if permutation_mnk is not None: + permutation_mnk_ty = _pack_tile(permutation_mnk, loc=loc, ip=ip).type + ty = _cute_nvgpu_ir.TiledMmaType.get( + atom._trait.value.type, + atom_layout_mnk.type, + permutation_mnk_ty, + ) + val = _cute_ir.make_tiled_mma(ty, atom._trait.value, loc=loc, ip=ip) + # Instead of modifying atom which might have been provided by the user, create a brand new + # trait instance and replace the Atom ir.Value with the tiled one + trait = new_from_mlir_values(atom._trait, [val]) + return TiledMma(op, trait) + + +#################################################################################################### +# +# Copy Atoms, TiledCopy, and ThrCopy +# +#################################################################################################### + + +class CopyAtom(Atom): + """ + The Copy Atom class. 
+ """ + + def __str__(self) -> str: + res = "Copy Atom\n" + res += " ThrID: " + str(self.thr_id) + "\n" + res += " TV Layout Src: " + str(self.layout_src_tv) + "\n" + res += " TV Layout Dst: " + str(self.layout_dst_tv) + "\n" + res += " Value type: " + str(self._trait.value.type.value_type) + return res + + # + # Properties + # + + @property + def value_type(self) -> Type[Numeric]: + return Numeric.from_mlir_type(self._trait.value.type.value_type) + + @property + def thr_id(self) -> Layout: + return _cute_ir.static(self._trait.value.type.thr_id) + + @property + def layout_src_tv(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_src_tv) + + @property + def layout_dst_tv(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_dst_tv) + + +class TiledCopy(CopyAtom): + """ + The tiled Copy class. + """ + + def __str__(self) -> str: + res = "Tiled Copy\n" + res += " Tiler MN: " + pretty_str(self.tiler_mn) + "\n" + res += " TV Layout tiled: " + str(self.layout_tv_tiled) + "\n" + res += "Copy Atom\n" + res += " ThrID: " + str(self.thr_id) + "\n" + res += " TV Layout Src: " + str(self.layout_src_tv) + "\n" + res += " TV Layout Dst: " + str(self.layout_dst_tv) + "\n" + res += " Value type: " + str(self._trait.value.type.value_type) + return res + + # + # Properties + # + + @property + def layout_tv_tiled(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_tv_tiled) + + @property + def tiler_mn(self) -> Tile: + return _unpack_x_tuple(self._trait.value.type.tiler_mn) + + @property + def layout_src_tv_tiled(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_src_tv_tiled) + + @property + def layout_dst_tv_tiled(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_dst_tv_tiled) + + @property + def size(self) -> int: + return self._trait.value.type.size + + # + # get_slice and retile + # + + def get_slice(self, thr_idx: Union[int, Int32]) -> "ThrCopy": + return ThrCopy(self.op, 
self._trait, thr_idx) + + @dsl_user_op + def retile(self, src, *, loc=None, ip=None): + return _cute_ir.tiled_copy_retile( + tiled_copy=self._trait.value, input=src.value, loc=loc, ip=ip + ) + + +class ThrCopy(TiledCopy): + """ + The thread Copy class for modeling a thread-slice of a tiled Copy. + """ + + def __init__(self, op: Op, trait: Trait, thr_idx: Union[int, Int32]) -> None: + super().__init__(op, trait) + self._thr_idx = thr_idx + + def __new_from_mlir_values__(self, values): + return self.__class__( + self.op, new_from_mlir_values(self._trait, values), self.thr_idx + ) + + @property + def thr_idx(self): + return self._thr_idx + + @dsl_user_op + def partition_S(self, src: Tensor, *, loc=None, ip=None) -> Tensor: + thr_idx = _pack_coord(self.thr_idx, loc=loc, ip=ip) + return _cute_ir.tiled_copy_partition_S( + self._trait.value, src.value, thr_idx, loc=loc, ip=ip + ) + + @dsl_user_op + def partition_D(self, dst: Tensor, *, loc=None, ip=None) -> Tensor: + thr_idx = _pack_coord(self.thr_idx, loc=loc, ip=ip) + return _cute_ir.tiled_copy_partition_D( + self._trait.value, dst.value, thr_idx, loc=loc, ip=ip + ) + + +@dsl_user_op +def make_copy_atom( + op: CopyOp, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs +) -> CopyAtom: + """ + Makes a Copy Atom from a Copy Operation. + + This function creates a Copy Atom from a given Copy Operation. Arbitrary kw arguments can be + provided for Op-specific additional parameters. + + Example: + + .. 
code-block:: python + + op = cute.nvgpu.CopyUniversalOp() + atom = cute.make_copy_atom(op, tensor_dtype, num_bits_per_copy=64) + + :param op: The Copy Operation to construct an Atom for + :type op: CopyOp + :param copy_internal_type: An internal data type used to construct the source/destination layouts in unit of tensor elements + :type copy_internal_type: Type[Numeric] + :return: The Copy Atom + :rtype: CopyAtom + """ + trait = op._make_trait(copy_internal_type, loc=loc, ip=ip, **kwargs) + return CopyAtom(op, trait) + + +@dsl_user_op +def make_layout_tv( + thr_layout: Layout, val_layout: Layout, *, loc=None, ip=None +) -> Tuple[Shape, Layout]: + """ + Create a tiled copy given separate thr and val layouts. A TV partitioner is inferred based on inputs. + Requires input thr layout be compact. + + Parameters + ---------- + atom : copy atom, e.g. smit_copy and simt_async_copy, tma_load, etc. + thr_layout : mn -> tid (need to be compact?) + val_layout : mn -> vid + loc : source location for mlir (optional) + ip : insertion point (optional) + + Returns + ------- + layout_mn + logical tile size + layout_tv + thread-value layout (tid, vid) -> mn + """ + + # Take the raked_products to compute the Layout_MN + # (M,N) -> (thr_idx, val_idx) + layout_mn = raked_product(thr_layout, val_layout, loc=loc, ip=ip) + thr_size = size(thr_layout, loc=loc, ip=ip) + val_size = size(val_layout, loc=loc, ip=ip) + tmp = make_layout((thr_size, val_size), loc=loc, ip=ip) + # (thr_idx, val_idx) -> (M,N) + layout_tv = composition( + right_inverse(layout_mn, loc=loc, ip=ip), tmp, loc=loc, ip=ip + ) + + tiler_mn = product_each(layout_mn.shape, loc=loc, ip=ip) + + return (tiler_mn, layout_tv) + + +@dsl_user_op +def make_tiled_copy_tv(atom, thr_layout, val_layout, *, loc=None, ip=None) -> TiledCopy: + """ + Create a tiled copy given separate thr and val layouts. A TV partitioner is inferred based on inputs. + Requires input thr layout be compact. 
+ + Parameters + ---------- + atom : copy atom, e.g. smit_copy and simt_async_copy, tma_load, etc. + thr_layout : mn -> tid (need to be compact?) + val_layout : mn -> vid + loc : source location for mlir (optional) + ip : insertion point (optional) + + Returns + ------- + tiled_copy + A tiled copy for partitioner + """ + + tiler_mn, layout_tv = make_layout_tv(thr_layout, val_layout, loc=loc, ip=ip) + tiler_mn = _pack_tile(product_each(tiler_mn, loc=loc, ip=ip), loc=loc, ip=ip) + if not is_static(layout_tv.type) or not is_static(tiler_mn.type): + raise ValueError( + f"expects layout tv and tiler mn, but got {layout_tv.type} and {tiler_mn.type}" + ) + tiled_copy_ty = _cute_nvgpu_ir.TiledCopyType.get( + atom.type, layout_tv.type, tiler_mn.type + ) + val = _cute_ir.make_tiled_copy(tiled_copy_ty, atom._trait.value, loc=loc, ip=ip) + # Instead of modifying atom which might have been provided by the user, create a brand new + # trait instance and replace the Atom ir.Value with the tiled one + trait = new_from_mlir_values(atom._trait, [val]) + return TiledCopy(atom.op, trait) + + +@dsl_user_op +def make_tiled_copy(atom, layout_tv, tiler_mn, *, loc=None, ip=None): + """ + Create a tiled type given a TV partitioner and tiler + + Parameters + ---------- + atom : copy atom, e.g. smit_copy and simt_async_copy, tma_load, etc. + layout_tv : thread-value layout. + tiler_mn : tile size (??) 
+ loc : source location for mlir (optional) + ip : insertion point (optional) + + Returns + ------- + tiled_copy + A tuple of A tiled copy and atom + """ + + # tiler_mn = pack_tuple(tiler_mn, make_tile) + if type(tiler_mn) is tuple: + tiler_mn = _pack_tile(tiler_mn, loc=loc, ip=ip) + + assert is_static(layout_tv.type) and is_static( + tiler_mn.type + ), "layout tv and tiler mn must be static" + tiled_copy_ty = _cute_nvgpu_ir.TiledCopyType.get( + atom.type, layout_tv.type, tiler_mn.type + ) + val = _cute_ir.make_tiled_copy(tiled_copy_ty, atom._trait.value, loc=loc, ip=ip) + # Instead of modifying atom which might have been provided by the user, create a brand new + # trait instance and replace the Atom ir.Value with the tiled one + trait = new_from_mlir_values(atom._trait, [val]) + return TiledCopy(atom.op, trait) + + +@dsl_user_op +def make_tiled_copy_S(atom, tiled_copy, *, loc=None, ip=None): + """ + Create a tiled type out of the copy_atom that matches the Src-Layout of tiled_copy. + + Parameters + ---------- + atom : copy atom, e.g. smit_copy and simt_async_copy, tma_load, etc. + tiled_copy : tiled copy + loc : source location for mlir (optional) + ip : insertion point (optional) + + Returns + ------- + tiled_copy + A tuple of A tiled copy and atom + """ + + return make_tiled_copy( + atom, tiled_copy.layout_src_tv_tiled, tiled_copy.tiler_mn, loc=loc, ip=ip + ) + + +@dsl_user_op +def make_tiled_copy_D(atom, tiled_copy, *, loc=None, ip=None): + """ + Create a tiled type out of the copy_atom that matches the Dst-Layout of tiled_copy. + + Parameters + ---------- + atom : copy atom, e.g. smit_copy and simt_async_copy, tma_load, etc. 
+ tiled_copy : tiled copy + loc : source location for mlir (optional) + ip : insertion point (optional) + + Returns + ------- + tiled_copy + A tuple of A tiled copy and atom + """ + + return make_tiled_copy( + atom, tiled_copy.layout_dst_tv_tiled, tiled_copy.tiler_mn, loc=loc, ip=ip + ) + + +@dsl_user_op +def make_tiled_copy_C_atom(atom: CopyAtom, mma: TiledMma, *, loc=None, ip=None): + """ + Create the smallest tiled copy that can retile LayoutC_TV + for use with pipelined epilogues with subtiled stores + + Parameters + ---------- + atom: CopyAtom + mma : TiledMma + loc : source location for mlir (optional) + ip : insertion point (optional) + + Returns + ------- + tiled_copy + A tiled copy for partitioner + """ + # Truncate the V-layout to just the Copy_Atom, keep the V-order + layoutC_tv = mma.tv_layout_C_tiled + val_layout_src = atom.layout_src_tv + num_val_src = size(val_layout_src, mode=[1], loc=loc, ip=ip) + num_val_layoutC_tv = size(layoutC_tv, mode=[1], loc=loc, ip=ip) + if num_val_src > num_val_layoutC_tv: + raise ValueError( + f"The number value of CopyAtom's source layout {num_val_src} " + f"is greater than the size of TiledMma's LayoutC_TV {num_val_layoutC_tv}" + ) + layout_TV = composition( + layoutC_tv, + make_layout( + (size(layoutC_tv, mode=[0], loc=loc, ip=ip), num_val_src), loc=loc, ip=ip + ), + loc=loc, + ip=ip, + ) + + # Recompute tiler and restride the TV layout for the new tiler + + # Tiler -- Find the active elements in the MMA tensor and generate a tiler to extract them + # Convert to the awkward by-mode tiler to preserve the modes of the tiled MMA + mma_tiler = (mma.get_tile_size(0), mma.get_tile_size(1)) + + tiler_0 = filter( + composition( + make_layout(mma_tiler, stride=(1, 0), loc=loc, ip=ip), + layout_TV, + loc=loc, + ip=ip, + ), + loc=loc, + ip=ip, + ) + tiler_1 = filter( + composition( + make_layout(mma_tiler, stride=(0, 1), loc=loc, ip=ip), + layout_TV, + loc=loc, + ip=ip, + ), + loc=loc, + ip=ip, + ) + tiler = (tiler_0, tiler_1) + 
+ tile2mma = composition( + make_layout(mma_tiler, loc=loc, ip=ip), tiler, loc=loc, ip=ip + ) + layout_tv = composition( + left_inverse(tile2mma, loc=loc, ip=ip), layout_TV, loc=loc, ip=ip + ) + + tiler_mn = _pack_tile(tiler, loc=loc, ip=ip) + + return make_tiled_copy(atom, layout_tv, tiler_mn, loc=loc, ip=ip) + + +#################################################################################################### +# +# cute.gemm and cute.copy +# +#################################################################################################### + + +@dsl_user_op +def gemm( + atom: MmaAtom, + d: Tensor, + a: Tensor, + b: Tensor, + c: Tensor, + *, + loc=None, + ip=None, + **kwargs, +) -> None: + """ + The GEMM algorithm. + + Computes ``D <- AB + C`` where ``C`` and ``D`` can alias. Note that some MMA Atoms (e.g. + warpgroup-wide or tcgen05 MMAs) require manually setting an "accumulate" boolean field. + + All tensors must be partitioned according to the provided MMA Atom. + """ + value = atom._unpack(loc=loc, ip=ip, **kwargs) + return _cute_ir.gemm(value, d.value, a.value, b.value, c.value, loc=loc, ip=ip) + + +@dsl_user_op +def basic_copy(src: Tensor, dst: Tensor, *, loc=None, ip=None) -> None: + """ + Performs a basic element-wise copy. + + This functions **assumes** the following pre-conditions: + 1. `size(src) == size(dst)` + + When the `src` and `dst` shapes are static, the pre-conditions are actually verified and the + element-wise loop is fully unrolled. 
+ """ + if is_static(src.shape) and is_static(dst.shape): + simt_copy_ty = _cute_nvgpu_ir.CopyAtomSIMTSyncCopyType.get( + src.element_type.mlir_type, src.element_type.width + ) + simt_copy = _cute_ir.atom(simt_copy_ty, loc=loc, ip=ip) + return _cute_ir.copy(simt_copy, src.value, dst.value, loc=loc, ip=ip) + + s = size(dst, loc=loc, ip=ip) + # Always generate an scf.for Op when one of the tensors is dynamic + for i in for_generate(0, s): + dst[i] = src[i] + yield_out() + + +@dsl_user_op +def basic_copy_if(pred: Tensor, src: Tensor, dst: Tensor, *, loc=None, ip=None) -> None: + """ + Performs a basic predicated element-wise copy. + + This functions **assumes** the following pre-conditions: + 1. `size(src) == size(dst)` + 2. `size(src) == size(pred)` + + When all shapes are static, the pre-conditions are actually verified and the element-wise loop + is fully unrolled. + """ + if src.element_type.width != dst.element_type.width: + raise NotImplementedError( + "basic_copy_if currently only supports equal source and destination " + "element type bit width" + ) + + if is_static(src.shape) and is_static(dst.shape) and is_static(pred.shape): + return _basic_copy_if_static(pred, src, dst, loc=loc, ip=ip) + + s = size(dst, loc=loc, ip=ip) + # Always generate an scf.for Op when one of the tensors is dynamic + for i in for_generate(0, s): + if_generate(pred[i], lambda: dst.__setitem__(i, src[i])) + yield_out() + + +# Version of basic_copy_if when src and dst have static shapes +# - verify size(src) == size(dst) == size(prd) +# - fully unroll the loop for now +def _basic_copy_if_static( + pred: Tensor, src: Tensor, dst: Tensor, *, loc=None, ip=None +) -> None: + assert is_static(src.shape) and is_static(dst.shape) and is_static(pred.shape) + if size(src, loc=loc, ip=ip) != size(dst, loc=loc, ip=ip): + raise ValueError( + "basic_copy expects the size of source, destination, and predicate tensors to match" + ) + # Fully unrolled loop in the static case for now + for i in 
range(size(dst, loc=loc, ip=ip)): + if_generate(pred[i], lambda: dst.__setitem__(i, src[i])) + + +@dsl_user_op +def autovec_copy(src: Tensor, dst: Tensor, *, loc=None, ip=None) -> None: + """ + Auto-vectorizing SIMT copy policy. + + Given a source and destination tensors that are statically shaped, this policy figures out the + largest safe vector width that the copy instruction can take and performs the copy. + """ + if src.element_type.width != dst.element_type.width: + raise NotImplementedError( + "autovec_copy currently only supports equal source and destination " + "element type bit width" + ) + + # We are going to dispatch to copy-with-atom which requires shapes to be static + if not is_static(src.shape) or not is_static(dst.shape): + raise ValueError( + "autovec_copy expects source and destination tensors to be statically shaped" + ) + + vec_layout = max_common_layout(src, dst, loc=loc, ip=ip) + num_common_elements = size(vec_layout, loc=loc, ip=ip) + + # Next we construct an upper-bound on the number bits that can be vectorized by considering + # - the maximum alignment of the layouts + # - the maximum alignment of the pointers + + upper_bound = math.gcd(src.layout.max_alignment, dst.layout.max_alignment) + upper_bound = math.gcd(upper_bound, num_common_elements) + upper_bound *= src.element_type.width + + # For our instructions, the alignment of the pointer is an upper bound to the vector width + # max_alignment, as opposed to alignment, takes into account possible address swizzling + upper_bound = math.gcd(upper_bound, src.iterator.max_alignment * 8) + upper_bound = math.gcd(upper_bound, dst.iterator.max_alignment * 8) + + # Finally, we put a cap at 128b + num_bits_per_copy = math.gcd(upper_bound, 128) + + if (num_common_elements > 1) and (num_bits_per_copy % 8 == 0): + num_common_elements = num_bits_per_copy // src.element_type.width + + # 2 step logical divides ensuring that the divides are valid at every step + vec_src = logical_divide(src, vec_layout, 
loc=loc, ip=ip) + vec_dst = logical_divide(dst, vec_layout, loc=loc, ip=ip) + tiled_src = logical_divide( + vec_src, make_layout(num_common_elements, loc=loc, ip=ip), loc=loc, ip=ip + ) + tiled_dst = logical_divide( + vec_dst, make_layout(num_common_elements, loc=loc, ip=ip), loc=loc, ip=ip + ) + + # Dispatch to copy with atom + simt_type = _cute_nvgpu_ir.CopyAtomSIMTSyncCopyType.get( + src.element_type.mlir_type, num_bits_per_copy + ) + simt_copy = _cute_ir.atom(simt_type, loc=loc, ip=ip) + return _cute_ir.copy( + simt_copy, tiled_src.value, tiled_dst.value, loc=loc, ip=ip + ) + + # Failed to vectorize, use a basic copy + basic_copy(src, dst, loc=loc, ip=ip) + + +@dsl_user_op +def copy( + atom: CopyAtom, + src: Tensor, + dst: Tensor, + *, + pred: Tensor = None, + loc=None, + ip=None, + **kwargs, +) -> None: + """ + The Copy algorithm. + + The "copy with Atom" expects source and destination tensors to be partitioned according to the + provided Copy Atom. Some Atoms require additional Op-specific kw arguments, for example TMA + copies: + + .. code-block:: python + + cute.copy(tma_atom, src, dst, tma_bar_ptr=mbar_ptr, mcast_mask=mask) + + An additional predication tensor can be provided. If the partitioned tensors have the following + logical profile ``((ATOM_V,ATOM_REST),REST_M,...)``, the predication tensor must have a profile + consistent with ``(ATOM_REST,REST_M,...)``. 
+ """ + if isinstance(src.type, _cute_ir.MemRefType) and isinstance( + dst.type, _cute_ir.MemRefType + ): + if src.element_type.width != dst.element_type.width: + raise TypeError( + "`copy` currently only supports equal source and destination " + "element type bit width" + ) + + value = atom._unpack(loc=loc, ip=ip, **kwargs) + if isinstance(pred, Tensor): + pred = pred.value + return _cute_ir.copy(value, src.value, dst.value, pred=pred, loc=loc, ip=ip) + + +#################################################################################################### +# +# TensorSSA class (experimental) +# +#################################################################################################### + + +class ReductionOp(Enum): + ADD = auto() + MUL = auto() + MAX = auto() + MIN = auto() + + def __str__(self): + return self.name.lower() + + +class TensorSSA(cutlass_arith.ArithValue): + """A class representing thread local data from CuTe Tensor in value semantic and immutable. + + :param value: Flatten vector as ir.Value holding logic data of SSA Tensor + :type value: ir.Value + :param shape: The nested shape in CuTe of the vector + :type shape: Shape + :param dtype: Data type of the tensor elements + :type dtype: Type[Numeric] + + :ivar _shape: The nested shape in CuTe of the vector + :ivar _dtype: Data type of the tensor elements + + :raises ValueError: If shape is not static + """ + + def __init__(self, value, shape: Shape, dtype: Type[Numeric]): + """Initialize a new TensorSSA object. 
+ + :param value: Flatten vector as ir.Value holding logic data of SSA Tensor + :type value: ir.Value + :param shape: The nested shape in CuTe of the vector + :type shape: Shape + :param dtype: Data type of the tensor elements + :type dtype: Type[Numeric] + :raises ValueError: If shape is not static + """ + if not is_static(shape): + raise ValueError("dynamic shape is not supported") + + signed = dtype.signed if issubclass(dtype, Integer) else False + super().__init__(value, signed) + + self._shape = shape + self._dtype = dtype + + @property + def dtype(self) -> Type[Numeric]: + return self._dtype + + @property + def element_type(self) -> Type[Numeric]: + return self._dtype + + @abstractmethod + def __extract_mlir_values__(self): + return [self] + + @abstractmethod + def __new_from_mlir_values__(self, values): + return TensorSSA(values[0], self.shape, self.dtype) + + def __str__(self): + return f"tensor_value<{self.type} o {self.shape}>" + + @property + def shape(self): + return self._shape + + @overload + def _apply_op(self, op, other: "TensorSSA", flip, *, loc, ip) -> "TensorSSA": ... + + @overload + def _apply_op( + self, op, other: cutlass_arith.ArithValue, flip, *, loc, ip + ) -> "TensorSSA": ... + + @overload + def _apply_op( + self, op, other: Union[int, float, bool], flip, *, loc, ip + ) -> "TensorSSA": ... 
+ + def _apply_op(self, op, other, flip=False, *, loc=None, ip=None): + def get_attr_for_type(ty, value): + if isinstance(ty, ir.IntegerType): + return ir.IntegerAttr.get(ty, value) + elif isinstance(ty, ir.FloatType): + return ir.FloatAttr.get(ty, value) + else: + raise TypeError(f"unsupported type: {ty}") + + # Canonicalize into Numeric + if isinstance(other, (int, float, bool)) or ( + not isinstance(other, TensorSSA) + and isinstance(other, cutlass_arith.ArithValue) + ): + other = as_numeric(other) + + # Promote types + lhs, rhs, res_type = _binary_op_type_promote(self, other, True) + + # Promote scalar to vector + if not isinstance(rhs, TensorSSA): + if isinstance(rhs, Numeric): + vect_val = vector.broadcast(lhs.type, rhs.ir_value(loc=loc, ip=ip)) + else: + elem_attr = get_attr_for_type(lhs.type.element_type, rhs) + vect_attr = ir.DenseElementsAttr.get_splat(lhs.type, elem_attr) + vect_val = arith.constant(lhs.type, vect_attr, loc=loc, ip=ip) + rhs = TensorSSA(vect_val, lhs.shape, lhs.dtype) + + if flip: + lhs, rhs = rhs, lhs + + if op in ( + operator.lt, + operator.le, + operator.gt, + operator.ge, + operator.eq, + operator.ne, + ): + res_type = Boolean + + if lhs.shape != rhs.shape: + raise ValueError( + f"lhs and rhs must have the same shape type, but got {lhs.shape} and {rhs.shape}" + ) + + if not isinstance(rhs, TensorSSA): + raise TypeError(f"rhs must be TensorSSA but got {rhs}") + + if ( + op in (operator.add, operator.sub) + and lhs.dtype == Boolean + and rhs.dtype == Boolean + ): + res = op(lhs.to(Int32), rhs.to(Int32)) + zero = zeros_like(res) + res = res.__ne__(zero).to(res_type) + else: + lhs_val = lhs.maybe_downcast() + rhs_val = rhs.maybe_downcast() + + if issubclass(lhs.dtype, Integer): + lhs_val = lhs_val.with_signedness(lhs.dtype.signed) + + if issubclass(rhs.dtype, Integer): + rhs_val = rhs_val.with_signedness(rhs.dtype.signed) + + res_vect = op(lhs_val, rhs_val) + res = TensorSSA(res_vect, lhs._shape, res_type) + + return res + + def 
__pow__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the results of tensor^other. + + :param other: The other tensor for exponent. + :type other: TensorSSA + :return: The power of the tensor. + :rtype: TensorSSA + """ + return self._apply_op(operator.pow, other, loc=loc, ip=ip) + + def __rpow__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the results of other^tensor. + + :param other: The other tensor to compute power with. + :type other: TensorSSA + :return: The element-wise power of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.pow, other, flip=True, loc=loc, ip=ip) + + def __add__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the sum of the tensor and another tensor. + + :param other: The other tensor to add. + :type other: TensorSSA + :return: The sum of the two tensors with the same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.add, other, loc=loc, ip=ip) + + def __radd__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the sum of the tensor and another tensor (reverse add) + + :param other: The other tensor to add. + :type other: TensorSSA + :return: The sum of the two tensors with the same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.add, other, flip=True, loc=loc, ip=ip) + + def __sub__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the difference of the tensor and another tensor. + + :param other: The other tensor to subtract. + :type other: TensorSSA + :return: The subtraction of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.sub, other, loc=loc, ip=ip) + + def __rsub__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the difference of the tensor and another tensor (reverse subtract) + + :param other: The other tensor to subtract. 
+ :type other: TensorSSA + :return: The subtraction of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.sub, other, flip=True, loc=loc, ip=ip) + + def __mul__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the multiplication of the tensor and another tensor. + + :param other: The other tensor to multiply. + :type other: TensorSSA + :return: The multiplication of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.mul, other, loc=loc, ip=ip) + + def __rmul__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the multiplication of the tensor and another tensor (reverse multiply) + + :param other: The other tensor to multiply. + :type other: TensorSSA + :return: The multiplication of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.mul, other, flip=True, loc=loc, ip=ip) + + def __mod__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the modulo of the tensor and another tensor. + + :param other: The other tensor to compute modulo with. + :type other: TensorSSA + :return: The element-wise modulo of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.mod, other, loc=loc, ip=ip) + + def __rmod__(self, other) -> "TensorSSA": + """ + Returns the modulo of the tensor and another tensor (reverse modulo) + + :param other: The other tensor to compute modulo with. + :type other: TensorSSA + :return: The element-wise modulo of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.mod, other, flip=True) + + def __floordiv__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the floordiv(//) of the tensor and another tensor. + + :param other: The other tensor to compute floordiv with. + :type other: TensorSSA + :return: The floordiv of two tensors with same shape as inputs. 
+ :rtype: TensorSSA + """ + return self._apply_op(operator.floordiv, other, loc=loc, ip=ip) + + def __rfloordiv__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the floordiv(//) of the tensor and another tensor (reverse floordiv) + + :param other: The other tensor to compute floordiv with. + :type other: TensorSSA + :return: The floordiv of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.floordiv, other, flip=True, loc=loc, ip=ip) + + def __truediv__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the truediv(/) of the tensor and another tensor. + + :param other: The other tensor to compute truediv with. + :type other: TensorSSA + :return: The truediv of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.truediv, other, loc=loc, ip=ip) + + def __rtruediv__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the truediv(/) of the tensor and another tensor (reverse truediv) + + :param other: The other tensor to compute truediv with. + :type other: TensorSSA + :return: The truediv of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.truediv, other, flip=True, loc=loc, ip=ip) + + def __eq__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the comparison of the tensor and another tensor as mask + + :param other: The other tensor to compare. + :type other: TensorSSA + :return: The comparison of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.eq, other, loc=loc, ip=ip) + + def __ne__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the element-wise not equal comparison of the tensor and another tensor. + + :param other: The other tensor to compare. + :type other: TensorSSA + :return: A boolean tensor with same shape as inputs, True where self != other. 
+ :rtype: TensorSSA + """ + return self._apply_op(operator.ne, other, loc=loc, ip=ip) + + def __lt__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the element-wise less than comparison of the tensor and another tensor. + + :param other: The other tensor to compare with. + :type other: TensorSSA + :return: A boolean tensor with same shape as inputs, True where self < other. + :rtype: TensorSSA + """ + return self._apply_op(operator.lt, other, loc=loc, ip=ip) + + def __le__(self, other) -> "TensorSSA": + """ + Returns the element-wise less than or equal comparison of the tensor and another tensor. + + :param other: The other tensor to compare with. + :type other: TensorSSA + :return: A boolean tensor with same shape as inputs, True where self <= other. + :rtype: TensorSSA + """ + return self._apply_op(operator.le, other) + + def __gt__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the element-wise greater than comparison of the tensor and another tensor. + + :param other: The other tensor to compare with. + :type other: TensorSSA + :return: A boolean tensor with same shape as inputs, True where self > other. + :rtype: TensorSSA + """ + return self._apply_op(operator.gt, other) + + def __ge__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the element-wise greater than or equal comparison of the tensor and another tensor. + + :param other: The other tensor to compare with. + :type other: TensorSSA + :return: A boolean tensor with same shape as inputs, True where self >= other. + :rtype: TensorSSA + """ + return self._apply_op(operator.ge, other, loc=loc, ip=ip) + + def __xor__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the element-wise XOR of the tensor and another tensor. + + :param other: The other tensor to perform XOR with. + :type other: TensorSSA + :return: The element-wise XOR of two tensors with same shape as inputs. 
+ :rtype: TensorSSA + """ + return self._apply_op(operator.xor, other) + + def __rxor__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the bitwise XOR of the tensor and another tensor. + + :param other: The other tensor to compute XOR with. + :type other: TensorSSA + :return: The element-wise bitwise XOR of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.xor, other, flip=True, loc=loc, ip=ip) + + def __or__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the element-wise OR of the tensor and another tensor. + + :param other: The other tensor to perform OR with. + :type other: TensorSSA + :return: The element-wise OR of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.or_, other) + + def __ror__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the element-wise OR of the tensor and another tensor. + + :param other: The other tensor to perform OR with. + :type other: TensorSSA + :return: The element-wise OR of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.or_, other, flip=True) + + def __and__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the element-wise AND of the tensor and another tensor. + + :param other: The other tensor to perform AND with. + :type other: TensorSSA + :return: The element-wise AND of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.and_, other) + + def __rand__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the element-wise AND of the tensor and another tensor. + + :param other: The other tensor to perform AND with. + :type other: TensorSSA + :return: The element-wise AND of two tensors with same shape as inputs. 
+ :rtype: TensorSSA + """ + return self._apply_op(operator.and_, other, flip=True, loc=loc, ip=ip) + + def _flatten_shape_and_coord(self, crd, *, loc=None, ip=None): + # Coalesce and flatten source layout at terminal of coordinate + # (N_0,(N_1,...), ...) -> (N_0,N_1,N_2,...) + crd_shp = product_like(self._shape, target_profile=crd, loc=loc, ip=ip) + + # Flatten coordinate + flat_shp = flatten(crd_shp) + assert isinstance(flat_shp, tuple) and is_static(flat_shp) + # (C_0,(C_1,...), ...) -> (C_0,C_1,C_2,...) + flat_crd = flatten(crd) + + assert isinstance(flat_crd, tuple) and is_static(flat_crd) + return flat_shp, flat_crd + + def _build_result(self, res_vect, res_shp, *, loc=None, ip=None): + if isinstance(res_shp, ir.Value): + raise ValueError( + f"expects static shape and coordinates, but got {self._shape} and {crd}" + ) + + # cast back to 1D vector + res_1d_ty = ir.VectorType.get([size(res_shp)], self.type.element_type) + res_1d_vect = vector.shape_cast(res_1d_ty, res_vect, loc=loc, ip=ip) + return TensorSSA(res_1d_vect, res_shp, self.dtype) + + @dsl_user_op + def __getitem__( + self, crd: Coord, *, loc=None, ip=None + ) -> Union["TensorSSA", Numeric]: + """Access or slice tensor elements using coordinates. + + This method implements tensor evaluation T(c) = *(E + L(c)) where E is the iterator/engine + and L is the layout. It supports both direct element access and slicing operations. + + :param crd: Coordinate or slice specification for accessing tensor elements + :type crd: Coord + :param loc: Source location for MLIR operation tracking, defaults to None + :type loc: Optional[Location] + :param ip: Insertion point for MLIR operation, defaults to None + :type ip: Optional[InsertionPoint] + :return: Tensor element value or sliced subtensor + :rtype: Union[TensorSSA, Numeric] + + :raises ValueError: If coordinate access is invalid for the tensor layout + + Examples: + + .. 
code-block:: python + + # Create a fragment from rmem as shape (8, 4) + layout = make_layout((8, 4)) + tensor = make_fragment(layout, Float32) + frg = tensor.load() + + # Direct element access + val = frg[0] # Returns first element of fragment + val = frg[(0, 1)] # Returns element at (0, 1) + + # Slice access + sliced = frg[(3, None)] # Returns fragment slice + """ + # short-cut to no-op + if crd is None: + return self + + if not has_underscore(crd) or depth(crd) == 0: + idx = crd2idx(crd, make_layout(self._shape)) + if is_static(idx): + res = vector.extract( + self, dynamic_position=[], static_position=[idx], loc=loc, ip=ip + ) + else: + res = vector.extract( + self, dynamic_position=[crd], static_position=[], loc=loc, ip=ip + ) + return self.dtype(res) + + if not is_static(crd): + raise ValueError("dynamic coordinate is not supported") + + flat_shp, flat_crd = self._flatten_shape_and_coord(crd) + + multi_dim_ty = ir.VectorType.get(list(flat_shp), self.type.element_type) + # vector -> vector + tmp_vect = vector.shape_cast(multi_dim_ty, self) + + # Slice and keep dims matching `_` or None + res_shp = slice_(self._shape, crd) + if isinstance(res_shp, ir.Value): + raise TypeError( + f"expects static shape and coordinates, but got {self._shape} and {crd}" + ) + + # Offsets is index of coordinates if NOT `_` otherwise 0 + offsets = [c if c is not None else 0 for c in flat_crd] + # Sizes is size of shapes if `_` otherwise 1 + sizes = [s if c is None else 1 for s, c in zip(flat_shp, flat_crd)] + # Logic stride to index vector. 
Only support stride-1 by vector + strides = [1] * rank(flat_shp) + + # Vector slice on N-D vector + res_ty = ir.VectorType.get(list(sizes), self.type.element_type) + res_vect = vector.extract_strided_slice( + res_ty, tmp_vect, offsets=offsets, sizes=sizes, strides=strides + ) + + # Slice and keep dims matching `_` or None + res_shp = slice_(self._shape, crd) + return self._build_result(res_vect, res_shp, loc=loc, ip=ip) + + @dsl_user_op + def to(self, dtype: Type[Numeric], *, loc=None, ip=None): + """Convert the tensor to a different numeric type. + + :param dtype: The target numeric type to cast to. + :type dtype: Type[Numeric] + :return: A new tensor with the same shape but with elements cast to the target type. + :rtype: TensorSSA + :raises TypeError: If dtype is not a subclass of Numeric. + :raises NotImplementedError: If dtype is an unsigned integer type. + """ + if dtype is ir.Value: + return self + + if not isclass(dtype) or not issubclass(dtype, Numeric): + raise TypeError(f"dtype must be a type of Numeric, but got {type(dtype)}") + + src_dtype = self.dtype + if src_dtype == dtype: + return self + + # maybe downcast can lose signedness + src = self.maybe_downcast().with_signedness(self.signed) + if src_dtype.is_float and dtype.is_float: + res_vect = cutlass_arith.cvtf(src, dtype.mlir_type, loc=loc, ip=ip) + elif src_dtype.is_float and issubclass(dtype, Integer): + res_vect = cutlass_arith.fptoi( + src, dtype.signed, dtype.mlir_type, loc=loc, ip=ip + ) + elif issubclass(src_dtype, Integer) and dtype.is_float: + res_vect = cutlass_arith.itofp( + src, src_dtype.signed, dtype.mlir_type, loc=loc, ip=ip + ) + else: + res_vect = cutlass_arith.int_to_int(src, dtype, loc=loc, ip=ip) + + return TensorSSA(res_vect, self._shape, dtype) + + def ir_value(self, *, loc=None, ip=None): + return self + + def reduce(self, op, init_val, reduction_profile: Coord, *, loc=None, ip=None): + """ + Perform reduce on selected modes with given predefined reduction op. 
+ + :param op: The reduction operator to use (operator.add or operator.mul) + :type op: operator + :param init_val: The initial value for the reduction + :type init_val: numeric + :param reduction_profile: Specifies which dimensions to reduce. Dimensions marked with '_' are kept. + :type reduction_profile: Coord + + :return: The reduced tensor + :rtype: TensorSSA + + Examples: + reduce(f32 o (4,)) + => f32 + reduce(f32 o (4, 5)) + => f32 + reduce(f32 o (4, (5, 4)), reduction_profile=(_, 1)) + => f32 o (4,) + reduce(f32 o (4, (5, 4)), reduction_profile=(_, (_, 1))) + => f32 o (4, (5,)) + """ + # short-cut to no-op + if reduction_profile is None: + return self + + if op is ReductionOp.ADD: + red_kind = vector.CombiningKind.ADD + elif op is ReductionOp.MUL: + red_kind = vector.CombiningKind.MUL + elif op is ReductionOp.MAX: + red_kind = vector.CombiningKind.MAXIMUMF + elif op is ReductionOp.MIN: + red_kind = vector.CombiningKind.MINIMUMF + else: + raise NotImplementedError( + f"{op} is not supported, expects one of " + f"{ReductionOp.ADD, ReductionOp.MUL, ReductionOp.MAX, ReductionOp.MIN}" + ) + + elem_ty = self.element_type + # Canonicalize to `Numeric` and convert into MLIR value + init_val = as_numeric(init_val).ir_value(loc=loc, ip=ip) + + if depth(reduction_profile) == 0: + return vector.reduction( + elem_ty.mlir_type, red_kind, self, acc=init_val, loc=loc, ip=ip + ) + + flat_shp, flat_prof = self._flatten_shape_and_coord( + reduction_profile, loc=loc, ip=ip + ) + assert depth(flat_shp) == 1 and depth(flat_prof) == 1 + assert rank(flat_shp) == rank(flat_prof) + + temp_ty = ir.VectorType.get(list(flat_shp), elem_ty.mlir_type) + temp_vect = vector.shape_cast(temp_ty, self, loc=loc, ip=ip) + + if isinstance(flat_prof, tuple): + red_dims = [i for i, x in enumerate(flat_prof) if x is not None] + else: + red_dims = [0] + + temp_acc_shp = slice_(flat_shp, flat_prof, loc=loc, ip=ip) + temp_acc_ty = ir.VectorType.get(list(temp_acc_shp), elem_ty.mlir_type) + + init_val = 
vector.broadcast(temp_acc_ty, init_val, loc=loc, ip=ip) + res_vect = vector.multi_reduction( + red_kind, temp_vect, acc=init_val, reduction_dims=red_dims, loc=loc, ip=ip + ) + + # Slice and keep dims matching `_` or None + res_shp = slice_(self.shape, reduction_profile, loc=loc, ip=ip) + return self._build_result(res_vect, res_shp, loc=loc, ip=ip) + + +def _get_attr_for_type(ty, value): + if isinstance(ty, ir.IntegerType): + return ir.IntegerAttr.get(ty, value.to(int)) + elif isinstance(ty, ir.FloatType): + return ir.FloatAttr.get(ty, value.to(float)) + else: + raise TypeError(f"unsupported type: {ty}") + + +def _splat(res_ty, fill_value): + elem_attr = _get_attr_for_type(res_ty.element_type, fill_value) + vect_attr = ir.DenseElementsAttr.get_splat(res_ty, elem_attr) + return arith.constant(res_ty, vect_attr) + + +@dsl_user_op +def full(shape, fill_value, dtype: Type[Numeric], *, loc=None, ip=None) -> TensorSSA: + """ + Return a new TensorSSA of given shape and type, filled with fill_value. + + :param shape: Shape of the new tensor. + :type shape: tuple + :param fill_value: Value to fill the tensor with. + :type fill_value: scalar + :param dtype: Data type of the tensor. + :type dtype: Type[Numeric] + :return: Tensor of fill_value with the specified shape and dtype. + :rtype: TensorSSA + """ + size = product(shape, loc=loc, ip=ip) + if not is_static(size): + raise ValueError("shape must be static") + + if isinstance(fill_value, (ir.Value, int, float, bool)): + fill_value = dtype(fill_value) + + res_mlir_type = T.vector(size, dtype.mlir_type) + return TensorSSA(_splat(res_mlir_type, fill_value), shape, dtype) + + +def full_like( + a: TensorSSA, + fill_value, + dtype: Union[None, Type[Numeric]] = None, + *, + loc=None, + ip=None, +) -> TensorSSA: + """ + Return a full TensorSSA with the same shape and type as a given array. + + :param a: The shape and data-type of `a` define these same attributes of the returned array. 
+ :type a: array_like + :param fill_value: Fill value. + :type fill_value: array_like + :param dtype: Overrides the data type of the result, defaults to None + :type dtype: Union[None, Type[Numeric]], optional + :return: Tensor of `fill_value` with the same shape and type as `a`. + :rtype: TensorSSA + + .. seealso:: + :func:`empty_like`: Return an empty array with shape and type of input. + :func:`ones_like`: Return an array of ones with shape and type of input. + :func:`zeros_like`: Return an array of zeros with shape and type of input. + :func:`full`: Return a new array of given shape filled with value. + + Examples + -------- + .. code-block:: python + + frg = cute.make_fragment(Float32, (2, 3)) + a = frg.load() + b = cute.full_like(a, 1.0) + """ + return full( + a.shape, fill_value, dtype if dtype is not None else a.dtype, loc=loc, ip=ip + ) + + +def empty_like(a, dtype=None): + """ + Return a new TensorSSA with the same shape and type as a given array, without initializing entries. + + :param a: The shape and data-type of `a` define these same attributes of the returned array. + :type a: TensorSSA + :param dtype: Overrides the data type of the result, defaults to None + :type dtype: Type[Numeric], optional + :return: Uninitialized tensor with the same shape and type (unless overridden) as `a`. + :rtype: TensorSSA + """ + return full_like(a, 0, dtype) + + +def ones_like(a, dtype=None): + """ + Return a TensorSSA of ones with the same shape and type as a given array. + + :param a: The shape and data-type of `a` define these same attributes of the returned array. + :type a: TensorSSA + :param dtype: Overrides the data type of the result, defaults to None + :type dtype: Type[Numeric], optional + :return: Tensor of ones with the same shape and type (unless overridden) as `a`. 
+ :rtype: TensorSSA + """ + return full_like(a, 1, dtype) + + +def zeros_like(a, dtype=None, *, loc=None, ip=None): + """ + Return a TensorSSA of zeros with the same shape and type as a given array. + + :param a: The shape and data-type of `a` define these same attributes of the returned array. + :type a: TensorSSA + :param dtype: Overrides the data type of the result, defaults to None + :type dtype: Type[Numeric], optional + :return: Tensor of zeros with the same shape and type (unless overridden) as `a`. + :rtype: TensorSSA + """ + return full_like(a, 0, dtype, loc=loc, ip=ip) + + +def where( + cond: TensorSSA, x: TensorSSA, y: TensorSSA, *, loc=None, ip=None +) -> TensorSSA: + """ + Return elements chosen from x or y depending on condition. + + :param cond: Where True, yield x, where False, yield y. + :type cond: TensorSSA + :param x: Values from which to choose when condition is True. + :type x: TensorSSA + :param y: Values from which to choose when condition is False. + :type y: TensorSSA + :return: A tensor with elements from x where condition is True, and elements from y where condition is False. + :rtype: TensorSSA + """ + if x.dtype != y.dtype: + raise ValueError( + f"x and y must have the same dtype, but got {x.dtype} and {y.dtype}" + ) + + if cond.dtype != Boolean: + raise ValueError(f"cond must be Boolean type, but got {cond.dtype}") + + return TensorSSA( + arith.select(cond.ir_value(), x, y, loc=loc, ip=ip), x.shape, x.dtype + ) + + +def any_(x: TensorSSA, *, loc=None, ip=None) -> Boolean: + """ + Test whether any tensor element evaluates to True. + + :param x: Input tensor. + :type x: TensorSSA + :return: Returns a TensorSSA scalar containing True if any element of x is True, False otherwise. 
+ :rtype: TensorSSA + """ + is_true = x != full_like(x, 0, x.dtype, loc=loc, ip=ip) + return Boolean( + vector.reduction(T.bool(), vector.CombiningKind.OR, is_true, loc=loc, ip=ip) + ) + + +def all_(x: TensorSSA, *, loc=None, ip=None) -> Boolean: + """ + Test whether all tensor elements evaluate to True. + + :param x: Input tensor. + :type x: TensorSSA + :return: Returns a TensorSSA scalar containing True if all elements of x are True, False otherwise. + :rtype: TensorSSA + """ + is_true = x != full_like(x, 0, x.dtype, loc=loc, ip=ip) + return Boolean( + vector.reduction(T.bool(), vector.CombiningKind.AND, is_true, loc=loc, ip=ip) + ) + + +############################################################################## +# User defined struct +############################################################################## + + +class struct: + """ + Decorator to abstract C structure in Python DSL. + + Usage: + .. code-block:: + + # Supports base_dsl scalar int/float elements, array and nested struct: + @cute.struct + class complex: + real : cutlass.Float32 + imag : cutlass.Float32 + @cute.struct + class StorageA: + mbarA : cute.struct.MemRange[cutlass.Int64, stage] + compA : complex + intA : cutlass.Int16 + + # Supports aligment for its elements: + @cute.struct + class StorageB: + a: cute.struct.Align[ + cute.struct.MemRange[cutlass.Float32, size_a], 1024 + ] + b: cute.struct.Align[ + cute.struct.MemRange[cutlass.Float32, size_b], 1024 + ] + x: cute.struct.Align[cutlass.Int32, 16] + compA: cute.struct.Align[complex, 16] + + # Statically get size and alignment: + size = StorageB.__sizeof__() + align = StorageB.__alignof__() + + # Allocate and referencing elements: + storage = allocator.allocate(StorageB) + + storage.a[0] ... + storage.x ... + storage.compA.real ... + + :param cls: The struct class with annotations. + :return: The decorated struct class. 
+ """ + + # inner class for defining a continuous memory region + class _MemRangeMeta(type): + """ + A metaclass for creating MemRange classes. + + This metaclass is used to dynamically create MemRange classes with specific + data types and sizes. + + :ivar _dtype: The data type of the MemRange. + :ivar _size: The size of the MemRange. + """ + + _dtype = None + _size = None + + def __new__(cls, name, bases, dct): + new_cls = super().__new__(cls, name, bases, dct) + return new_cls + + def __getitem__(cls, params) -> Type["struct.MemRange"]: + # get params from syntax: struct.MemRange[dtype, size] + if len(params) == 2: + dtype, size = params + else: + raise TypeError("Invalid struct.MemRange Arguments") + + if not struct._is_scalar_type(dtype): + raise TypeError("MemRange only support dsl scalar type!") + + # Create new class with proper name and parameters + new_cls = type( + f"struct.MemRange[{dtype.__name__}, {size}]", + (struct.MemRange,), + {"_dtype": dtype, "_size": size}, + ) + return new_cls + + @property + def size(cls): + return cls._size + + @property + def elem_width(cls): + return cls._dtype.width + + @property + def size_in_bytes(cls): + return cls.size * cls.elem_width // 8 + + class MemRange(metaclass=_MemRangeMeta): + """ + Defines a range of memory by `MemRange[T, size]`. + """ + + pass + + class _MemRangeData: + """ + Represents a range of memory. + + :param dtype: The data type. + :param size: The size of the memory range in bytes. + :param base: The base address of the memory range. + """ + + def __init__(self, dtype, size, base): + """ + Initializes a new memory range. + + :param dtype: The data type. + :param size: The size of the memory range in bytes. + :param base: The base address of the memory range. + """ + self._dtype = dtype + self._size = size + self._base = base + + def data_ptr(self): + """ + Returns start pointer to the data in this memory range. + + :return: A pointer to the start of the memory range. 
+ :raises AssertionError: If the size of the memory range is not greater than zero. + """ + assert self._size > 0 + return recast_ptr(self._base, dtype=self._dtype) + + def get_tensor(self, layout, swizzle=None, dtype=None): + """ + Creates a tensor from the memory range. + + :param layout: The layout of the tensor. + :param swizzle: Optional swizzle pattern. + :param dtype: Optional data type; defaults to the memory range's data type if not specified. + :return: A tensor representing the memory range. + :raises TypeError: If the layout is incompatible with the swizzle. + :raises AssertionError: If the size of the memory range is not greater than zero. + """ + assert self._size > 0 + # make tensor + if isinstance(layout, ComposedLayout) and (swizzle is not None): + raise TypeError(f"incompatible layout with swizzle") + elem_type = self._dtype if dtype is None else dtype + ptr = recast_ptr(self._base, swizzle, dtype=elem_type) + res = make_tensor(ptr, layout) + return res + + def __getitem__(self, index: int) -> Any: + """ + Returns the element at the specified index in the memory range. + + :param index: The index of the element to retrieve. + :return: The element at the specified index. + :raises AssertionError: If the index is out of range. + """ + assert (index >= 0) and (index < self._size) + return self.data_ptr() + index + + # inner class for aligning a member type + class _AlignMeta(type): + """ + Aligns the given object by setting its alignment attribute. + + :param v: The object to align. Must be a struct, MemRange, or a scalar type. + :param align: The alignment value to set. + :return: A copy of the object with the specified alignment. + :raises TypeError: If the object is not a struct, MemRange, or a scalar type. 
+ """ + + def __new__(cls, name, bases, dct): + return super().__new__(cls, name, bases, dct) + + def __getitem__(cls, params) -> Any: + if len(params) == 2: + obj, align = params + else: + raise TypeError("Invalid struct.Align Arguments") + + # make a copy of type and mark alignment + if struct._is_scalar_type(obj) or isinstance( + obj, (struct, struct._MemRangeMeta) + ): + new_obj = py_copy.copy(obj) + setattr(new_obj, "_struct_alignment_", align) + return new_obj + else: + raise TypeError( + "align only can be applied to sturct/MemRange/base_dsl scalar" + ) + + class Align(metaclass=_AlignMeta): + """ + Aligns the given type by `Align[T, alignment]`. + """ + + pass + + # util func for base dsl scalar types + @staticmethod + def _is_scalar_type(dtype): + """ + Checks if the given type is a scalar numeric type. + + :param dtype: The type to check. + :return: True if the type is a subclass of Numeric, False otherwise. + """ + return isinstance(dtype, type) and issubclass(dtype, Numeric) + + # calculate size and alignment + def __init__(self, cls): + """ + Initializes a new struct decorator instance. + + :param cls: The class representing the structured data type. + :raises TypeError: If the struct is empty. 
+ """ + self._cls = cls + # Get the class annotations + self._annotations = cls.__annotations__ + # Create a dictionary to store the offsets + self._offsets: Dict[str, int] = {} + + # Calculate the offsets and alignment + offset = 0 + alignment = 1 + if len(self._annotations) == 0: + raise TypeError("Empty struct is not supported!") + for name, object in self._annotations.items(): + # get alignment of object + def alignof(object, default: int = 1): + return getattr(object, "_struct_alignment_", default) + + # alignment for the next offset + def align_offset(offset, align): + return (offset + (align - 1)) & ~(align - 1) + + # switch addition order to support dynamic size + def add_offset(val): + return val + offset if isinstance(val, ir.Value) else offset + val + + # size of scalar + if struct._is_scalar_type(object): + dtype_size = object.width // 8 + sub_align = alignof(object, dtype_size) + offset = align_offset(offset, sub_align) + self._offsets[name] = offset + offset = add_offset(dtype_size) + # size of array is size_in_bytes, alignment is elem_size + elif isinstance(object, struct._MemRangeMeta): + if object.size == 0: + continue # skip empty array + sub_align = alignof(object, max(1, object.elem_width // 8)) + offset = align_offset(offset, sub_align) + self._offsets[name] = offset + offset = add_offset(object.size_in_bytes) + # size of struct + elif isinstance(object, struct): + sub_align = max(object.__alignof__(), alignof(object)) + offset = align_offset(offset, sub_align) + self._offsets[name] = offset + offset = add_offset(object.__sizeof__()) + else: + raise TypeError( + f"Struct element only support sturct/array/base_dsl scalar, " + f"but got {object}" + ) + # Total aligment determined by the strictest requirement + alignment = max(alignment, sub_align) + # Total size determined by alignment + self._align_of = alignment + self._size_of = align_offset(offset, alignment) + + # create the __init__ method for decorated struct + def __call__(self, base: 
Any) -> None: + """ + Creates a new instance of the decorated struct. + + :param base: The base address of the struct. + :return: An instance of the decorated struct. + :raises TypeError: If the base pointer is not byte-sized. + """ + if base.type.value_type.width != 8: + raise TypeError("struct base ptr value type must be byte sized.") + # make an new object of user-defined decorated struct + # otherwise it will override same self._cls when new instance created + cls = self._cls() + setattr(cls, "_base", base) + for name, off in self._offsets.items(): + obj = self._annotations[name] + if struct._is_scalar_type(obj): + new_obj = recast_ptr(base + off, dtype=obj) + setattr(cls, name, new_obj) + elif isinstance(obj, struct._MemRangeMeta): + new_obj = struct._MemRangeData(obj._dtype, obj._size, base + off) + setattr(cls, name, new_obj) + elif isinstance(obj, struct): + new_obj = obj(base + off) + setattr(cls, name, new_obj) + else: + raise TypeError( + f"Struct element only support sturct/array/base_dsl scalar, " + f"but got {obj}" + ) + return cls + + # get size + def size_in_bytes(self) -> int: + """ + Returns the size of the struct in bytes. + + :return: The size of the struct. + """ + return self._size_of + + # get size + def __sizeof__(self) -> int: + return self._size_of + + # get alignment + def __alignof__(self) -> int: + return self._align_of diff --git a/python/CuTeDSL/cutlass/cute/math.py b/python/CuTeDSL/cutlass/cute/math.py new file mode 100644 index 00000000..3dda89c2 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/math.py @@ -0,0 +1,354 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.

from .core import TensorSSA
from cutlass._mlir.dialects import math, arith


def _fastmath_flags(fastmath: bool):
    """Translate the boolean ``fastmath`` switch into MLIR arith fast-math flags.

    Previously every wrapper in this module accepted ``fastmath`` but always
    lowered with ``FastMathFlags.none``, silently ignoring the request; this
    helper centralizes the (now honored) mapping.

    :param fastmath: When True, permit fast, reduced-accuracy lowering.
    :type fastmath: bool
    :return: ``arith.FastMathFlags.fast`` if ``fastmath`` else ``arith.FastMathFlags.none``.
    """
    return arith.FastMathFlags.fast if fastmath else arith.FastMathFlags.none


def _unary(op, a: TensorSSA, fastmath: bool) -> TensorSSA:
    """Apply an element-wise unary math-dialect op, preserving shape and dtype."""
    return TensorSSA(op(a, fastmath=_fastmath_flags(fastmath)), a.shape, a.dtype)


def acos(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise arc cosine of the input tensor.

    :param a: Input tensor
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the arc cosine of each element in input tensor
    :rtype: TensorSSA

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = acos(y)                     # Compute arc cosine
    """
    return _unary(math.acos, a, fastmath)


def asin(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise arc sine of the input tensor.

    :param a: Input tensor
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the arc sine of each element in input tensor
    :rtype: TensorSSA

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = asin(y)                     # Compute arc sine
    """
    return _unary(math.asin, a, fastmath)


def atan(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise arc tangent of the input tensor.

    :param a: Input tensor
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the arc tangent of each element in input tensor
    :rtype: TensorSSA
    :raises NotImplementedError: Always — atan lowering is not available yet.

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = atan(y)                     # Compute arc tangent
    """
    # Fail loudly until the math.atan lowering is supported; the original
    # unreachable return after this raise has been removed as dead code.
    raise NotImplementedError("atan is not implemented")


def atan2(a: TensorSSA, b: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise arc tangent of two tensors.

    Computes atan2(a, b) element-wise. The function atan2(a, b) is the angle in radians
    between the positive x-axis and the point given by the coordinates (b, a).

    :param a: First input tensor (y-coordinates)
    :type a: TensorSSA
    :param b: Second input tensor (x-coordinates)
    :type b: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the arc tangent of a/b element-wise
    :rtype: TensorSSA

    Example:

    .. code-block::

        y = cute.make_fragment(ptr1, layout).load()  # y coordinates
        x = cute.make_fragment(ptr2, layout).load()  # x coordinates
        theta = atan2(y, x)                          # Compute angles
    """
    return TensorSSA(
        math.atan2(a, b, fastmath=_fastmath_flags(fastmath)), a.shape, a.dtype
    )


def cos(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise cosine of the input tensor.

    :param a: Input tensor (in radians)
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the cosine of each element
    :rtype: TensorSSA

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = cos(y)                      # Compute cosine
    """
    return _unary(math.cos, a, fastmath)


def erf(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise error function of the input tensor.

    The error function is defined as:
    erf(x) = 2/√π ∫[0 to x] exp(-t²) dt

    :param a: Input tensor
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the error function value for each element
    :rtype: TensorSSA

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = erf(y)                      # Compute error function
    """
    return _unary(math.erf, a, fastmath)


def exp2(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise base-2 exponential of the input tensor.

    :param a: Input tensor
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing 2 raised to the power of each element
    :rtype: TensorSSA

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = exp2(y)                     # Compute 2^x
    """
    return _unary(math.exp2, a, fastmath)


def log(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise natural logarithm of the input tensor.

    :param a: Input tensor
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the natural logarithm of each element
    :rtype: TensorSSA

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = log(y)                      # Compute natural logarithm
    """
    return _unary(math.log, a, fastmath)


def log2(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise base-2 logarithm of the input tensor.

    :param a: Input tensor
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the base-2 logarithm of each element
    :rtype: TensorSSA

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = log2(y)                     # Compute log base 2
    """
    return _unary(math.log2, a, fastmath)


def log10(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise base-10 logarithm of the input tensor.

    :param a: Input tensor
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the base-10 logarithm of each element
    :rtype: TensorSSA

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = log10(y)                    # Compute log base 10
    """
    return _unary(math.log10, a, fastmath)


def rsqrt(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise reciprocal square root of the input tensor.

    Computes 1/√x element-wise.

    :param a: Input tensor
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the reciprocal square root of each element
    :rtype: TensorSSA

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = rsqrt(y)                    # Compute 1/√x
    """
    return _unary(math.rsqrt, a, fastmath)


def sin(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise sine of the input tensor.

    :param a: Input tensor (in radians)
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the sine of each element
    :rtype: TensorSSA

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = sin(y)                      # Compute sine
    """
    # NOTE(review): the original body runs past the visible window; this
    # return follows the identical pattern of every sibling — confirm.
    return _unary(math.sin, a, fastmath)
+ + :param a: Input tensor (in radians) + :type a: TensorSSA + :param fastmath: Enable fast math optimizations, defaults to False + :type fastmath: bool, optional + :return: Tensor containing the sine of each element + :rtype: TensorSSA + + Example: + + .. code-block:: + + x = cute.make_fragment(layout) # Create tensor + y = x.load() # Load values + z = sin(y) # Compute sine + """ + return TensorSSA(math.sin(a, fastmath=arith.FastMathFlags.none), a.shape, a.dtype) + + +def sqrt(a: TensorSSA, fastmath: bool = False) -> TensorSSA: + """Compute element-wise square root of the input tensor. + + :param a: Input tensor + :type a: TensorSSA + :param fastmath: Enable fast math optimizations, defaults to False + :type fastmath: bool, optional + :return: Tensor containing the square root of each element + :rtype: TensorSSA + + Example: + + .. code-block:: + + x = cute.make_fragment(layout) # Create tensor + y = x.load() # Load values + z = sqrt(y) # Compute square root + """ + return TensorSSA(math.sqrt(a, fastmath=arith.FastMathFlags.none), a.shape, a.dtype) + + +def tan(a: TensorSSA) -> TensorSSA: + """Compute element-wise tangent of the input tensor. + + :param a: Input tensor (in radians) + :type a: TensorSSA + :return: Tensor containing the tangent of each element + :rtype: TensorSSA + + Example: + + .. code-block:: + + x = cute.make_fragment(layout) # Create tensor + y = x.load() # Load values + z = tan(y) # Compute tangent + """ + return TensorSSA(math.tan(a, fastmath=arith.FastMathFlags.none), a.shape, a.dtype) + + +def tanh(a: TensorSSA, fastmath: bool = False) -> TensorSSA: + """Compute element-wise hyperbolic tangent of the input tensor. + + :param a: Input tensor + :type a: TensorSSA + :param fastmath: Enable fast math optimizations, defaults to False + :type fastmath: bool, optional + :return: Tensor containing the hyperbolic tangent of each element + :rtype: TensorSSA + + Example: + + .. 
code-block:: + + x = cute.make_fragment(layout) # Create tensor + y = x.load() # Load values + z = tanh(y) # Compute hyperbolic tangent + """ + return TensorSSA(math.tanh(a, fastmath=arith.FastMathFlags.none), a.shape, a.dtype) + + +__all__ = [ + "acos", + "asin", + "atan", + "atan2", + "cos", + "erf", + "exp2", + "log", + "log10", + "log2", + "rsqrt", + "sin", + "sqrt", + "tan", + "tanh", +] diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/__init__.py b/python/CuTeDSL/cutlass/cute/nvgpu/__init__.py new file mode 100644 index 00000000..0655bb09 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/__init__.py @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from . import warp +from . import cpasync +from . import warpgroup +from . import tcgen05 + +from .common import * +from .helpers import * + + +# __all__ is required here for documentation generation +__all__ = [ + "OpError", + "MmaUniversalOp", + "CopyUniversalOp", +] diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/common.py b/python/CuTeDSL/cutlass/cute/nvgpu/common.py new file mode 100644 index 00000000..c93becad --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/common.py @@ -0,0 +1,143 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from dataclasses import dataclass +from typing import Type, Optional + +from cutlass.cutlass_dsl import DSLBaseError + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir import ir + +from .. import core +from ..typing import Float16, Float32, Float64, Numeric + + +class OpError(DSLBaseError): + """ + An exception class for Op construction errors. + """ + + def __init__( + self, op: core.Op, message: str, suggestion: Optional[str] = None + ) -> None: + if suggestion is None: + # Default suggestion + suggestion = "Check your Op construction code" + super().__init__( + message, + error_code=f"{op.__class__.__name__} error", + suggestion=suggestion, + ) + + +#################################################################################################### +# +# MMA Ops and Traits +# +#################################################################################################### + + +@dataclass(frozen=True) +class MmaUniversalOp(core.MmaOp): + """ + The universal MMA Operation. + + This Operation currently expects the A/B operands as well as the accumulator to share the same + data types. 
+ + :param abacc_dtype: The data type for the A/B operands and the accumulator + :type abacc_dtype: Type[Numeric] + """ + + abacc_dtype: Type[Numeric] + + def __post_init__(self) -> None: + if self.abacc_dtype not in [Float16, Float32, Float64]: + raise OpError( + self, + f"expects the 'abacc_dtype' Op parameter to be one of Float16, Float32, or Float64", + ) + + def __str__(self) -> str: + return ( + "universal MMA Operation using FMA" + f"\n A/B/Accumulator data type = {self.abacc_dtype}" + ) + + def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaUniversalTrait": + shape_mnk_attr = ir.Attribute.parse(f'#cute.shape<"(1,1,1)">') + atom_ty = _cute_nvgpu_ir.UniversalFmaAtomType.get( + shape_mnk_attr, + self.abacc_dtype.mlir_type, + self.abacc_dtype.mlir_type, + self.abacc_dtype.mlir_type, + ) + return MmaUniversalTrait(_cute_ir.atom(atom_ty, loc=loc, ip=ip)) + + +class MmaUniversalTrait(core.Trait): + pass + + +#################################################################################################### +# +# Copy Ops and Traits +# +#################################################################################################### + + +@dataclass(frozen=True) +class CopyUniversalOp(core.CopyOp): + """ + The universal Copy Operation. + + When creating a Copy Atom out of this operation, the expected usage pattern is + + .. code-block:: python + + op = cute.nvgpu.CopyUniversalOp() + atom = cute.make_copy_atom(op, tensor_dtype, num_bits_per_copy=64) + + - ``tensor_dtype`` is the data type used to build the reference TV Layout (either the source \ + or the destination TV Layout) in unit of tensor elements and is used for partitioning by \ + ``TiledCopy`` for example + - ``num_bits_per_copy`` is a kw argument specifying the number of bits to copy per Atom \ + execution. This can be larger than the width of the above data type. When not provided, \ + the compiler will do a best effort at auto-vectorizing. 
+ """ + + def __str__(self) -> str: + return "universal Copy Operation" + + def _make_trait( + self, + copy_internal_type: Type[Numeric], + *, + loc=None, + ip=None, + **kwargs, + ) -> "CopyUniversalTrait": + num_bits_per_copy = kwargs.get("num_bits_per_copy", 0) + if not isinstance(num_bits_per_copy, int) or (num_bits_per_copy < 0): + raise ValueError( + "expects a 'num_bits_per_copy' kw argument of type int that is non-negative " + f"when creating a copy Atom for {self.__class__.__name__}" + ) + ty = _cute_nvgpu_ir.CopyAtomSIMTSyncCopyType.get( + copy_internal_type.mlir_type, num_bits_per_copy + ) + return CopyUniversalTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class CopyUniversalTrait(core.Trait): + pass diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/__init__.py b/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/__init__.py new file mode 100644 index 00000000..322e8bf0 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/__init__.py @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +from .copy import * +from .helpers import * + + +# __all__ is required here for documentation generation +__all__ = [ + # + # copy.py + # + "LoadCacheMode", + "CopyG2SOp", + "CopyBulkTensorTileG2SOp", + "CopyBulkTensorTileG2SMulticastOp", + "CopyBulkTensorTileS2GOp", + # + # helpers.py + # + "make_tma_tile_atom", + "tma_partition", + "create_tma_multicast_mask", + "prefetch_descriptor", + "copy_tensormap", + "update_tma_descriptor", + "fence_tma_desc_acquire", + "cp_fence_tma_desc_release", + "fence_tma_desc_release", +] diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/copy.py b/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/copy.py new file mode 100644 index 00000000..8de65a72 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/copy.py @@ -0,0 +1,366 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +import enum +from dataclasses import dataclass +from typing import Optional, Type + +from cutlass.cutlass_dsl import CuTeDSL, t + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir import ir + +from ...core import CopyOp, Trait +from ...typing import Int16, Pointer, Integer, Numeric +from ..common import OpError +from ..tcgen05.mma import CtaGroup + + +#################################################################################################### +# +# Aynchronous copies +# +#################################################################################################### + + +class LoadCacheMode(enum.Enum): + """ + An enumeration for the possible cache modes of a non-bulk ``cp.async`` instruction. + + See the `PTX documentation `__. + """ + + ALWAYS = _cute_nvgpu_ir.LoadCacheMode.always + GLOBAL = _cute_nvgpu_ir.LoadCacheMode.global_ + STREAMING = _cute_nvgpu_ir.LoadCacheMode.streaming + LAST_USE = _cute_nvgpu_ir.LoadCacheMode.last_use + NONE = _cute_nvgpu_ir.LoadCacheMode.none + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + def _to_ir(self) -> _cute_nvgpu_ir.LoadCacheMode: + return self.value + + +@dataclass(frozen=True) +class CopyG2SOp(CopyOp): + """ + Non-bulk asynchronous GMEM to SMEM Copy Operation. + + See the `PTX documentation `__. 
+ """ + + cache_mode: LoadCacheMode = LoadCacheMode.ALWAYS + + def __str__(self) -> str: + res = "cp.async GMEM -> SMEM copy Operation" + if self.cache_mode != LoadCacheMode.ALWAYS: + res += f"\n with cache mode = {self.cache_mode}" + return res + + def _make_trait( + self, + copy_internal_type: Type[t.Numeric], + *, + loc=None, + ip=None, + **kwargs, + ) -> "CopyG2STrait": + num_bits_per_copy = kwargs.get("num_bits_per_copy", None) + if not isinstance(num_bits_per_copy, int) or (num_bits_per_copy <= 0): + raise ValueError( + "expects a 'num_bits_per_copy' kw argument of type int that is positive " + f"when creating a copy Atom for {self.__class__.__name__}" + ) + # Verify that the user provided enum values + if not isinstance(self.cache_mode, LoadCacheMode): + raise OpError( + self, + "expects the 'cache_mode' Op parameter to be a LoadCacheMode instance", + ) + ty = _cute_nvgpu_ir.CopyAtomSIMTAsyncCopyType.get( + copy_internal_type.mlir_type, self.cache_mode._to_ir(), num_bits_per_copy + ) + return CopyG2STrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class CopyG2STrait(Trait): + pass + + +#################################################################################################### +# +# Bulk tensor copies a.k.a TMA copies +# +#################################################################################################### + +TMA_MBAR_PTR_FIELD_NAME = "tma_bar" +TMA_MASK_FIELD_NAME = "mcast_mask" +TMA_DESC_PTR_FIELD_NAME = "tma_descriptor_ptr" + +# +# TMA GMEM -> SMEM copies +# + + +@dataclass(frozen=True) +class CopyBulkTensorTileG2SOp(CopyOp): + """ + Bulk tensor asynchrnous GMEM to SMEM Copy Operation using the TMA unit. + + See the `PTX documentation `__. + This Operation uses TMA in the ``.tile`` mode. 
+ """ + + cta_group: CtaGroup = CtaGroup.ONE + + admissible_archs = ["sm_90", "sm_90a", "sm_100a"] + + def __post_init__(self) -> None: + if not isinstance(self.cta_group, CtaGroup): + raise OpError( + self, "expects the 'cta_group' parameter to be a CtaGroup instance" + ) + # Arch verification + arch = CuTeDSL._get_dsl().envar.arch + if arch not in self.admissible_archs: + raise OpError( + self, + f"expects arch to be one of {self.admissible_archs}, but got {arch}", + suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture", + ) + if (self.cta_group == CtaGroup.TWO) and arch[:5] == "sm_90": + raise OpError( + self, + f"CTA group of 2 is tcgen05-specific and is not and is not compatible with {arch}", + suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture", + ) + + def __str__(self) -> str: + res = "cp.async GMEM -> SMEM bulk tensor copy Operation" + if self.cta_group == CtaGroup.TWO: + res += f"\n CTA group = 2" + return res + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "CopyBulkTensorTileG2SNonExecTrait": + raise NotImplementedError( + "Use cpasync.make_tma_tile_atom to obtain a copy Atom for TMA" + ) + + def _to_ir(self) -> _cute_nvgpu_ir.TiledTmaLoadEnum: + if self.cta_group == CtaGroup.ONE: + return _cute_nvgpu_ir.TiledTmaLoadEnum.sm_90 + elif self.cta_group == CtaGroup.TWO: + return _cute_nvgpu_ir.TiledTmaLoadEnum.sm_100_2sm + else: + assert False, "unrecognized self.cta_group" + + +class CopyBulkTensorTileG2SNonExecTrait(Trait): + # We allow kw args to be dropped so that the user can write common code for non-multicast + # and multicast loads. + def unpack( + self, + *, + loc=None, + ip=None, + tma_bar_ptr: Optional[Pointer] = None, + tma_desc_ptr: Optional[Pointer] = None, + **kwargs, + ): + """ + Custom implementation of unpack for non-executable TMAs. + + The non-multicast TMA load requires a `tma_bar_ptr` keyword argument to be provided when + using `cute.copy`. 
Any other kw arguments will be ignored instead of triggering an error. + """ + if not isinstance(tma_bar_ptr, Pointer): + raise ValueError( + "expects a pointer to an mbarrier to be provided via the tma_bar_ptr kw argument" + ) + exec_value = _cute_nvgpu_ir.atom_make_exec_tma(self.value, loc=loc, ip=ip) + attr_str = f"#cute_nvgpu.atom_copy_field_tmaload<{TMA_MBAR_PTR_FIELD_NAME}>" + attr = ir.Attribute.parse(attr_str) + exec_value = _cute_nvgpu_ir.atom_set_value( + exec_value, attr, tma_bar_ptr.value, loc=loc, ip=ip + ) + if isinstance(tma_desc_ptr, Pointer): + attr_str = f"#cute_nvgpu.atom_copy_field_tmaload<{TMA_DESC_PTR_FIELD_NAME}>" + attr = ir.Attribute.parse(attr_str) + exec_value = _cute_nvgpu_ir.atom_set_value( + exec_value, attr, tma_desc_ptr.value, loc=loc, ip=ip + ) + return exec_value + + +# +# TMA GMEM -> SMEM multicast copies +# + + +@dataclass(frozen=True) +class CopyBulkTensorTileG2SMulticastOp(CopyOp): + """ + Bulk tensor asynchrnous multicast GMEM to SMEM Copy Operation using the TMA unit. + + See the `PTX documentation `__. + This Operation uses TMA in the ``.tile`` mode. 
+ """ + + cta_group: CtaGroup = CtaGroup.ONE + + admissible_archs = ["sm_90", "sm_90a", "sm_100a"] + + def __post_init__(self): + if not isinstance(self.cta_group, CtaGroup): + raise OpError( + self, "expects the 'cta_group' parameter to be a CtaGroup instance" + ) + # Arch verification + arch = CuTeDSL._get_dsl().envar.arch + if arch not in self.admissible_archs: + raise OpError( + self, + f"expects arch to be one of {self.admissible_archs}, but got {arch}", + suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture", + ) + if (self.cta_group == CtaGroup.TWO) and arch[:5] == "sm_90": + raise OpError( + self, + f"CTA group of 2 is tcgen05-specific and is not and is not compatible with {arch}", + suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture", + ) + + def __str__(self) -> str: + res = "cp.async GMEM -> SMEM bulk tensor multicast copy Operation" + if self.cta_group == CtaGroup.TWO: + res += f"\n CTA group = 2" + return res + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "CopyBulkTensorTileG2SMulticastNonExecTrait": + raise NotImplementedError( + "Use cpasync.make_tma_tile_atom to obtain a copy Atom for TMA" + ) + + def _to_ir(self) -> _cute_nvgpu_ir.TiledTmaLoadEnum: + if self.cta_group == CtaGroup.ONE: + return _cute_nvgpu_ir.TiledTmaLoadEnum.sm_90_multicast + elif self.cta_group == CtaGroup.TWO: + return _cute_nvgpu_ir.TiledTmaLoadEnum.sm_100_2sm_multicast + else: + assert False, "unrecognized self.cta_group" + + +class CopyBulkTensorTileG2SMulticastNonExecTrait(Trait): + def unpack( + self, + *, + loc=None, + ip=None, + tma_bar_ptr: Optional[Pointer] = None, + mcast_mask=None, + tma_desc_ptr=None, + ): + """ + Custom implementation of unpack for non-executable TMAs. + + The multicast TMA load requires a `tma_bar_ptr` and a `mcast_mask` keyword arguments to be + provided when using `cute.copy`. 
+ """ + if not isinstance(tma_bar_ptr, Pointer): + raise ValueError( + "expects a pointer to an mbarrier to be provided via the tma_bar_ptr kw argument" + ) + if not isinstance(mcast_mask, Integer): + raise ValueError( + "expects a multicast mask to be provided via the mcast_mask kw argument" + ) + exec_value = _cute_nvgpu_ir.atom_make_exec_tma(self.value, loc=loc, ip=ip) + attr_str = f"#cute_nvgpu.atom_copy_field_tmaload" + attr = ir.Attribute.parse(attr_str) + exec_value = _cute_nvgpu_ir.atom_set_value( + exec_value, attr, tma_bar_ptr.value, loc=loc, ip=ip + ) + attr_str = f"#cute_nvgpu.atom_copy_field_tmaload" + attr = ir.Attribute.parse(attr_str) + exec_value = _cute_nvgpu_ir.atom_set_value( + exec_value, attr, Int16(mcast_mask).ir_value(loc=loc, ip=ip), loc=loc, ip=ip + ) + if isinstance(tma_desc_ptr, Pointer): + attr_str = f"#cute_nvgpu.atom_copy_field_tmaload<{TMA_DESC_PTR_FIELD_NAME}>" + attr = ir.Attribute.parse(attr_str) + exec_value = _cute_nvgpu_ir.atom_set_value( + exec_value, attr, tma_desc_ptr.value, loc=loc, ip=ip + ) + return exec_value + + +# +# TMA SMEM -> GMEM copies +# + + +@dataclass(frozen=True) +class CopyBulkTensorTileS2GOp(CopyOp): + """ + Bulk tensor asynchrnous SMEM to GMEM Copy Operation using the TMA unit. + + See the `PTX documentation `__. + This Operation uses TMA in the ``.tile`` mode. 
+ """ + + admissible_archs = ["sm_90", "sm_90a", "sm_100a"] + + def __post_init__(self): + # Arch verification + arch = CuTeDSL._get_dsl().envar.arch + if arch not in self.admissible_archs: + raise OpError( + self, + f"expects arch to be one of {self.admissible_archs}, but got {arch}", + suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture", + ) + + def __str__(self) -> str: + return "cp.async SMEM -> GMEM bulk tensor copy Operation" + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "CopyBulkTensorTileS2GTrait": + raise NotImplementedError( + "Use cpasync.make_tma_tile_atom to obtain a copy Atom for TMA" + ) + + +class CopyBulkTensorTileS2GTrait(Trait): + def unpack(self, *, loc=None, ip=None, tma_desc_ptr: Optional[Pointer] = None): + """ + Custom implementation of unpack for non-executable TMAs. + """ + exec_value = _cute_nvgpu_ir.atom_make_exec_tma(self.value, loc=loc, ip=ip) + if isinstance(tma_desc_ptr, Pointer): + attr_str = ( + f"#cute_nvgpu.atom_copy_field_tmastore<{TMA_DESC_PTR_FIELD_NAME}>" + ) + attr = ir.Attribute.parse(attr_str) + exec_value = _cute_nvgpu_ir.atom_set_value( + exec_value, attr, tma_desc_ptr.value, loc=loc, ip=ip + ) + return exec_value diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/helpers.py b/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/helpers.py new file mode 100644 index 00000000..92f028a2 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/helpers.py @@ -0,0 +1,327 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from typing import Optional, Tuple, Type, Union + +from cutlass.cutlass_dsl import dsl_user_op + +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir.dialects import llvm + +from ...typing import Coord, Layout, Tensor, Tiler, Pointer, Int16, Numeric, NumericMeta +from ... import core +from .copy import ( + CopyBulkTensorTileG2SOp, + CopyBulkTensorTileG2SMulticastOp, + CopyBulkTensorTileS2GOp, + CopyBulkTensorTileG2SNonExecTrait, + CopyBulkTensorTileG2SMulticastNonExecTrait, + CopyBulkTensorTileS2GTrait, +) + + +@dsl_user_op +def make_tma_tile_atom( + op: Union[ + CopyBulkTensorTileG2SOp, + CopyBulkTensorTileG2SMulticastOp, + CopyBulkTensorTileS2GOp, + ], + gmem_tensor: Tensor, + smem_layout: Layout, + cta_tiler: Tiler, + num_multicast: int = 1, + *, + internal_type: Optional[Type[Numeric]] = None, + loc=None, + ip=None, +) -> Tuple[core.CopyAtom, Tensor]: + """ + Makes a TMA Copy Atom in the ``.tile`` mode to copy tiles of a GMEM tensor to/from and SMEM + buffer with the given Layout. + + Given + + - a GMEM tensor + - a SMEM layout + - a CTA-level Tiler + + this function figures out the bulk tensor asynchronous copy instruction to use with the maximum + "TMA vector length" to copy tiles of the GMEM tensor to/from an SMEM buffer with the provided + layout and consistent with the provided Tiler. + + This function returns two results: + + 1. the Copy Atom + 2. the so-called TMA tensor used to map logical coordinates of the GMEM tensor to coordinates \ + that the TMA unit can consume. 
TMA tensors have so-called basis stride elements so that the \ + associated layout can output coordinates. Otherwise, TMA tensors can be partitioned \ + similarly to any other CuTe tensors using the algebra. + + :param op: The Copy Operation to construct an Atom for + :type op: Union[CopyBulkTensorTileG2SOp, CopyBulkTensorTileG2SMulticastOp, CopyBulkTensorTileS2GOp] + :param gmem_tensor: The GMEM tensor involved in the Copy + :type gmem_tensor: Tensor + :param smem_layout: The SMEM layout to construct the Copy Atom for + :type smem_layout: Layout + :param cta_tiler: The CTA Tiler to use + :type cta_tiler: Tiler + :param num_multicast: The multicast factor + :type num_multicast: int + :param internal_type: An optional parameter for the internal data type to use when the actual data type is not supported by the TMA unit + :type internal_type: Type[Numeric] + :return: A Copy Atom for this Operation and the associated TMA tensor + :rtype: Tuple[core.CopyAtom, Tensor] + """ + + if internal_type is not None: + if not isinstance(internal_type, NumericMeta): + raise TypeError(f"internal_type must be a Numeric, but got {internal_type}") + internal_type = internal_type.mlir_type + + cta_v_map = core.composition( + core.make_identity_layout(gmem_tensor.shape, loc=loc, ip=ip), + cta_tiler, + loc=loc, + ip=ip, + ) + + if isinstance(op, CopyBulkTensorTileG2SOp): + if num_multicast != 1: + raise ValueError( + f"expects num_multicast to be 1 for non multicast G2S copies, " + f"but got {num_multicast}" + ) + res = _cute_nvgpu_ir.atom_make_non_exec_tiled_tma_load( + gmem_tensor.value, + smem_layout, + cta_v_map, + op._to_ir(), + num_multicast=num_multicast, + internal_type=internal_type, + loc=loc, + ip=ip, + ) + return core.CopyAtom(op, CopyBulkTensorTileG2SNonExecTrait(res[0])), res[1] + elif isinstance(op, CopyBulkTensorTileG2SMulticastOp): + if num_multicast < 1: + raise ValueError( + f"expects num_multicast to be >= 1 for multicast G2S copies, " + f"but got {num_multicast}" + ) 
+ res = _cute_nvgpu_ir.atom_make_non_exec_tiled_tma_load( + gmem_tensor.value, + smem_layout, + cta_v_map, + op._to_ir(), + num_multicast=num_multicast, + internal_type=internal_type, + loc=loc, + ip=ip, + ) + return ( + core.CopyAtom(op, CopyBulkTensorTileG2SMulticastNonExecTrait(res[0])), + res[1], + ) + elif isinstance(op, CopyBulkTensorTileS2GOp): + res = _cute_nvgpu_ir.atom_make_non_exec_tiled_tma_store( + gmem_tensor.value, + smem_layout, + cta_v_map, + internal_type=internal_type, + loc=loc, + ip=ip, + ) + return core.CopyAtom(op, CopyBulkTensorTileS2GTrait(res[0])), res[1] + else: + raise ValueError(f"expects a bulk tensor (TMA) Copy Op, but got {op}") + + +@dsl_user_op +def tma_partition( + atom: core.CopyAtom, + cta_coord: Coord, + cta_layout: Layout, + smem_tensor: Tensor, + gmem_tensor: Tensor, + *, + loc=None, + ip=None, +) -> Tuple[Tensor, Tensor]: + """ + Tiles the GMEM and SMEM tensors for the provided TMA Copy Atom. + """ + cta_coord_val = core._pack_coord(cta_coord, loc=loc, ip=ip) + s, d = _cute_nvgpu_ir.atom_tma_partition( + atom._trait.value, + cta_coord=cta_coord_val, + cta_layout=cta_layout, + smem_tensor=smem_tensor.value, + gmem_tensor=gmem_tensor.value, + loc=loc, + ip=ip, + ) + return s, d + + +@dsl_user_op +def create_tma_multicast_mask( + cta_layout_vmnk: Layout, + cta_coord_vmnk: Coord, + mcast_mode: int, + *, + loc=None, + ip=None, +) -> Int16: + """ + Computes a multicast mask for a TMA load Copy. 
+ + :param cta_layout_vmnk: The VMNK layout of the cluster + :type cta_layout_vmnk: Layout + :param cta_coord_vmnk: The VMNK coordinate of the current CTA + :type cta_coord_vmnk: Coord + :param mcast_mode: The tensor mode in which to multicast + :type mcast_mode: int + :return: The resulting mask + :rtype: Int16 + """ + if core.rank(cta_layout_vmnk) != 4: + raise ValueError( + f"cta_layout_vmnk must be rank 4, but got {core.pretty_str(cta_layout_vmnk)}" + ) + if core.rank(cta_coord_vmnk) != 4: + raise ValueError( + f"cta_coord_vmnk must be rank 4, but got {core.pretty_str(cta_coord_vmnk)}" + ) + return core.make_layout_image_mask( + cta_layout_vmnk, cta_coord_vmnk, mcast_mode, loc=loc, ip=ip + ) + + +@dsl_user_op +def prefetch_descriptor(tma_atom: core.CopyAtom, *, loc=None, ip=None) -> None: + """ + Prefetches the TMA descriptor associated with the TMA Atom. + """ + _cute_nvgpu_ir.prefetch_tma_desc(tma_atom._trait.value, loc=loc, ip=ip) + + +@dsl_user_op +def copy_tensormap( + tma_atom: core.CopyAtom, tensormap_ptr: Pointer, *, loc=None, ip=None +) -> None: + """ + Copies the tensormap held by a TMA Copy Atom to the memory location pointed to by the provided + pointer. + + :param tma_atom: The TMA Copy Atom + :type tma_atom: CopyAtom + :param tensormap_ptr: The pointer to the memory location to copy the tensormap to + :type tensormap_ptr: Pointer + """ + _cute_nvgpu_ir.copy_tma_desc( + tma_atom._trait.value, tensormap_ptr.value, loc=loc, ip=ip + ) + + +@dsl_user_op +def update_tma_descriptor( + tma_atom: core.CopyAtom, + gmem_tensor: Tensor, + tma_desc_ptr: Pointer, + *, + loc=None, + ip=None, +) -> None: + """ + Updates the TMA descriptor in the memory location pointed to by the provided pointer using + information from a TMA Copy Atom and the provided GMEM tensor. + + Specifically, the following fields of the TMA descriptor will be updated: + + 1. the GMEM tensor base address + 2. the GMEM tensor shape + 3. 
the GMEM tensor stride + + Other fields of the TMA descriptor are left unchanged. + + :param tma_atom: The TMA Copy Atom + :type tma_atom: CopyAtom + :param gmem_tensor: The GMEM tensor + :type gmem_tensor: Tensor + :param tensormap_ptr: The pointer to the memory location of the descriptor to udpate + :type tensormap_ptr: Pointer + """ + _cute_nvgpu_ir.update_tma_desc( + tma_atom._trait.value, gmem_tensor.value, tma_desc_ptr.value, loc=loc, ip=ip + ) + + +@dsl_user_op +def fence_tma_desc_acquire( + tma_desc_ptr: Pointer, + *, + loc=None, + ip=None, +) -> None: + """ + See the `PTX documentation `__. + """ + tma_desc_ptr_i64 = tma_desc_ptr.toint(loc=loc, ip=ip).ir_value() + llvm.inline_asm( + None, + [tma_desc_ptr_i64], + "fence.proxy.tensormap::generic.acquire.gpu [$0], 128;", + "l", + has_side_effects=True, + is_align_stack=False, + asm_dialect=llvm.AsmDialect.AD_ATT, + ) + + +@dsl_user_op +def cp_fence_tma_desc_release( + tma_desc_global_ptr: Pointer, + tma_desc_shared_ptr: Pointer, + *, + loc=None, + ip=None, +) -> None: + """ + See the `PTX documentation `__. + """ + tma_desc_global_ptr_i64 = tma_desc_global_ptr.toint(loc=loc, ip=ip).ir_value() + tma_desc_shared_ptr_i32 = tma_desc_shared_ptr.toint(loc=loc, ip=ip).ir_value() + llvm.inline_asm( + None, + [tma_desc_global_ptr_i64, tma_desc_shared_ptr_i32], + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [$0], [$1], 128;", + "l,r", + has_side_effects=True, + is_align_stack=False, + asm_dialect=llvm.AsmDialect.AD_ATT, + ) + + +@dsl_user_op +def fence_tma_desc_release(*, loc=None, ip=None) -> None: + """ + See the `PTX documentation `__. 
+ """ + llvm.inline_asm( + None, + [], + "fence.proxy.tensormap::generic.release.gpu;", + "", + has_side_effects=True, + is_align_stack=False, + asm_dialect=llvm.AsmDialect.AD_ATT, + ) diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/helpers.py b/python/CuTeDSL/cutlass/cute/nvgpu/helpers.py new file mode 100644 index 00000000..020b96d8 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/helpers.py @@ -0,0 +1,159 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from typing import Optional, Tuple, Type, Union + +from cutlass.cutlass_dsl import dsl_user_op + +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir + +from .. 
import core +from ..typing import Shape, Layout, Tensor, Numeric, NumericMeta +from ...impl_utils import check_type_in +from .cpasync.copy import ( + CopyBulkTensorTileG2SOp, + CopyBulkTensorTileG2SNonExecTrait, + CopyBulkTensorTileG2SMulticastOp, + CopyBulkTensorTileG2SMulticastNonExecTrait, +) + + +#################################################################################################### +# +# TMA creation helpers for tcgen05 MMAs +# +#################################################################################################### + + +@dsl_user_op +def make_tma_tile_atom_A( + op: Union[CopyBulkTensorTileG2SOp, CopyBulkTensorTileG2SMulticastOp], + gmem_tensor: Tensor, + smem_layout: Layout, + mma_tiler_mnk: Shape, + tiled_mma: core.TiledMma, + cluster_shape_vmnk: Shape, + *, + internal_type: Optional[Type[Numeric]] = None, + loc=None, + ip=None, +) -> Tuple[core.CopyAtom, Tensor]: + if internal_type is not None: + if not isinstance(internal_type, NumericMeta): + raise TypeError(f"internal_type must be a Numeric, but got {internal_type}") + internal_type = internal_type.mlir_type + check_type_in( + op, + [CopyBulkTensorTileG2SOp, CopyBulkTensorTileG2SMulticastOp], + "op", + "make_tma_tile_atom_A", + ) + + ident = core.make_identity_layout(gmem_tensor.shape, loc=loc, ip=ip) + mma_tiler_mk = (mma_tiler_mnk[0], *mma_tiler_mnk[2:]) + g_tile = core.composition(ident, mma_tiler_mk, loc=loc, ip=ip) + cta_v_map = tiled_mma._thrfrg_A(g_tile) + cta_v_map = core.get(cta_v_map, mode=[1]) + cta_v_map = core.dice(cta_v_map, (1, (1,) * core.rank(g_tile))) + + if isinstance(op, CopyBulkTensorTileG2SOp): + num_multicast = 1 + else: + assert isinstance(op, CopyBulkTensorTileG2SMulticastOp) + # multicast across the N-mode since those would share the same tile of A + num_multicast = core.size(cluster_shape_vmnk, mode=[2]) + + # res[0] = the IR Value for the non-executable atom instance + # res[1] = the IR Value for the associated TMA tensor + res = 
_cute_nvgpu_ir.atom_make_non_exec_tiled_tma_load( + gmem_tensor.value, + smem_layout, + cta_v_map, + op._to_ir(), + num_multicast=num_multicast, + internal_type=internal_type, + loc=loc, + ip=ip, + ) + if isinstance(op, CopyBulkTensorTileG2SOp): + return core.CopyAtom(op, CopyBulkTensorTileG2SNonExecTrait(res[0])), res[1] + else: + assert isinstance(op, CopyBulkTensorTileG2SMulticastOp) + return ( + core.CopyAtom(op, CopyBulkTensorTileG2SMulticastNonExecTrait(res[0])), + res[1], + ) + + +@dsl_user_op +def make_tma_tile_atom_B( + op: Union[CopyBulkTensorTileG2SOp, CopyBulkTensorTileG2SMulticastOp], + gmem_tensor: Tensor, + smem_layout: Layout, + mma_tiler_mnk: Shape, + tiled_mma: core.TiledMma, + cluster_shape_vmnk: Shape, + *, + internal_type: Optional[Type[Numeric]] = None, + loc=None, + ip=None, +) -> Tuple[core.CopyAtom, Tensor]: + if internal_type is not None: + if not isinstance(internal_type, NumericMeta): + raise TypeError(f"internal_type must be a Numeric, but got {internal_type}") + internal_type = internal_type.mlir_type + check_type_in( + op, + [CopyBulkTensorTileG2SOp, CopyBulkTensorTileG2SMulticastOp], + "op", + "make_tma_tile_atom_B", + ) + + ident = core.make_identity_layout(gmem_tensor.shape, loc=loc, ip=ip) + mma_tiler_nk = (mma_tiler_mnk[1], *mma_tiler_mnk[2:]) + g_tile = core.composition(ident, mma_tiler_nk, loc=loc, ip=ip) + cta_v_map = tiled_mma._thrfrg_B(g_tile) + cta_v_map = core.get(cta_v_map, mode=[1]) + cta_v_map = core.dice(cta_v_map, (1, (1,) * core.rank(g_tile))) + + if isinstance(op, CopyBulkTensorTileG2SOp): + num_multicast = 1 + else: + assert isinstance(op, CopyBulkTensorTileG2SMulticastOp) + # multicast across the M-mode since those would share the same tile of B + num_multicast = core.size(cluster_shape_vmnk, mode=[1]) + + # res[0] = the IR Value for the non-executable atom instance + # res[1] = the IR Value for the associated TMA tensor + res = _cute_nvgpu_ir.atom_make_non_exec_tiled_tma_load( + gmem_tensor.value, + smem_layout, 
+ cta_v_map, + op._to_ir(), + num_multicast=num_multicast, + internal_type=internal_type, + loc=loc, + ip=ip, + ) + if isinstance(op, CopyBulkTensorTileG2SOp): + return core.CopyAtom(op, CopyBulkTensorTileG2SNonExecTrait(res[0])), res[1] + else: + assert isinstance(op, CopyBulkTensorTileG2SMulticastOp) + return ( + core.CopyAtom(op, CopyBulkTensorTileG2SMulticastNonExecTrait(res[0])), + res[1], + ) + + +__all__ = [ + "make_tma_tile_atom_A", + "make_tma_tile_atom_B", +] diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/__init__.py b/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/__init__.py new file mode 100644 index 00000000..4afeb527 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/__init__.py @@ -0,0 +1,57 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +from .copy import * +from .mma import * +from .helpers import * + +# __all__ is required here for documentation generation +__all__ = [ + # + # copy.py + # + "Repetition", + "Pack", + "Unpack", + "Ld16x64bOp", + "Ld16x128bOp", + "Ld16x256bOp", + "Ld16x32bx2Op", + "Ld32x32bOp", + "St16x64bOp", + "St16x128bOp", + "St16x256bOp", + "St16x32bx2Op", + "St32x32bOp", + # + # mma.py + # + "OperandMajorMode", + "OperandSource", + "CtaGroup", + "Field", + "MmaTF32Op", + "MmaF16BF16Op", + "MmaI8Op", + "MmaFP8Op", + "SmemLayoutAtomKind", + # + # helpers.py + # + "make_smem_layout_atom", + "tile_to_mma_shape", + "commit", + "is_tmem_load", + "is_tmem_store", + "get_tmem_copy_properties", + "find_tmem_tensor_col_offset", + "make_tmem_copy", +] diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/copy.py b/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/copy.py new file mode 100644 index 00000000..283cf8fb --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/copy.py @@ -0,0 +1,465 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +import enum +from dataclasses import dataclass +from typing import Type + +from cutlass.cutlass_dsl import CuTeDSL + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir import ir + +from ..common import OpError +from ...core import CopyOp, Trait +from ...typing import Numeric + + +class Repetition(enum.Enum): + """ + An enumeration for the number of repetitions of a given TMEM copy within the instruction. + """ + + x1 = 1 + x2 = 2 + x4 = 4 + x8 = 8 + x16 = 16 + x32 = 32 + x64 = 64 + x128 = 128 + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + @classmethod + def _missing_(cls, value): + if isinstance(value, int): + if value == 1: + return Repetition.x1 + elif value == 2: + return Repetition.x2 + elif value == 8: + return Repetition.x8 + elif value == 16: + return Repetition.x16 + elif value == 32: + return Repetition.x32 + elif value == 64: + return Repetition.x64 + elif value == 128: + return Repetition.x128 + + +class Pack(enum.Enum): + """ + An enumeration for the possible packing patterns for TMEM to RMEM copies. + """ + + NONE = enum.auto() + PACK_16b_IN_32b = enum.auto() + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + +class Unpack(enum.Enum): + """ + An enumeration for the possible unpacking patterns for RMEM to TMEM copies. 
+ """ + + NONE = enum.auto() + UNPACK_32b_IN_16b = enum.auto() + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + +@dataclass(frozen=True) +class _LdBase(CopyOp): + repeat: Repetition = Repetition.x1 + pack: Pack = Pack.NONE + + admissible_archs = ["sm_100a"] + + def __post_init__(self) -> None: + # Arch verification + arch = CuTeDSL._get_dsl().envar.arch + if arch not in self.admissible_archs: + raise OpError( + self, + f"expects arch to be one of {self.admissible_archs}, but got {arch}", + suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture", + ) + + if not isinstance(self.repeat, Repetition): + raise OpError( + self, + "expects the 'repeat' Op parameter to be a tcgen05.Repetition instance", + ) + if not isinstance(self.pack, Pack): + raise OpError( + self, + "expects the 'pack' Op parameter to be a tcgen05.Pack instance", + ) + + def __str__(self) -> str: + res = ( + f"tcgen05 {self.__class__.__name__[:-2]} Copy Operation" + + f"\n number of repetitions = {self.repeat.value}" + ) + if self.pack == Pack.PACK_16b_IN_32b: + res += f"\n with 2x 16-bit to 32b packing" + return res + + +@dataclass(frozen=True) +class Ld16x64bOp(_LdBase): + """ + 16x64b TMEM load Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.16x64b`` qualifier. + """ + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "Ld16x64bTrait": + ty = _cute_nvgpu_ir.CopyAtomSM100TmemLoadType.get( + copy_internal_type.mlir_type, + 16, + 64, + self.repeat.value, + ir.UnitAttr.get() if self.pack == Pack.PACK_16b_IN_32b else None, + ) + return Ld16x64bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class Ld16x64bTrait(Trait): + pass + + +@dataclass(frozen=True) +class Ld16x128bOp(_LdBase): + """ + 16x128b TMEM load Operation. + + See the `PTX documentation `__. 
+ This Operation corresponds to the ``.16x128b`` qualifier. + """ + + def __post_init__(self) -> None: + super().__post_init__() + if self.repeat == Repetition.x128: + raise OpError( + self, + "x128 repetition is not supported", + suggestion="choose one of x1, x2, x4, x8, x16, x32, x64", + ) + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "Ld16x128bTrait": + ty = _cute_nvgpu_ir.CopyAtomSM100TmemLoadType.get( + copy_internal_type.mlir_type, + 16, + 128, + self.repeat.value, + ir.UnitAttr.get() if self.pack == Pack.PACK_16b_IN_32b else None, + ) + return Ld16x128bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class Ld16x128bTrait(Trait): + pass + + +@dataclass(frozen=True) +class Ld16x256bOp(_LdBase): + """ + 16x256b TMEM load Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.16x256b`` qualifier. + """ + + def __post_init__(self) -> None: + super().__post_init__() + if self.repeat in (Repetition.x128, Repetition.x64): + raise OpError( + self, + "x64 and x128 repetition is not supported", + suggestion="choose one of x1, x2, x4, x8, x16, x32", + ) + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "Ld16x256bTrait": + ty = _cute_nvgpu_ir.CopyAtomSM100TmemLoadType.get( + copy_internal_type.mlir_type, + 16, + 256, + self.repeat.value, + ir.UnitAttr.get() if self.pack == Pack.PACK_16b_IN_32b else None, + ) + return Ld16x256bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class Ld16x256bTrait(Trait): + pass + + +@dataclass(frozen=True) +class Ld16x32bx2Op(_LdBase): + """ + 16x32bx2 TMEM load Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.16x32bx2`` qualifier. 
+ """ + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "Ld16x32bx2Trait": + ty = _cute_nvgpu_ir.CopyAtomSM100TmemLoadType.get( + copy_internal_type.mlir_type, + 16, + 32, + self.repeat.value, + ir.UnitAttr.get() if self.pack == Pack.PACK_16b_IN_32b else None, + ) + return Ld16x32bx2Trait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class Ld16x32bx2Trait(Trait): + pass + + +@dataclass(frozen=True) +class Ld32x32bOp(_LdBase): + """ + 32x32b TMEM load Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.32x32`` qualifier. + """ + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "Ld32x32bTrait": + ty = _cute_nvgpu_ir.CopyAtomSM100TmemLoadType.get( + copy_internal_type.mlir_type, + 32, + 32, + self.repeat.value, + ir.UnitAttr.get() if self.pack == Pack.PACK_16b_IN_32b else None, + ) + return Ld32x32bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class Ld32x32bTrait(Trait): + pass + + +@dataclass(frozen=True) +class _StBase(CopyOp): + repeat: Repetition + unpack: Unpack = Unpack.NONE + + admissible_archs = ["sm_100a"] + + def __post_init__(self) -> None: + # Arch verification + arch = CuTeDSL._get_dsl().envar.arch + if arch not in self.admissible_archs: + raise OpError( + self, + f"expects arch to be one of {self.admissible_archs}, but got {arch}", + suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture", + ) + + if not isinstance(self.repeat, Repetition): + raise OpError( + self, + "expects the 'repeat' Op parameter to be a tcgen05.Repetition instance", + ) + if not isinstance(self.unpack, Unpack): + raise OpError( + self, + "expects the 'pack' Op parameter to be a tcgen05.Unpack instance", + ) + + def __str__(self) -> str: + res = ( + f"tcgen05 {self.__class__.__name__[:-2]} Copy Operation" + + f"\n number of repetitions = {self.repeat.value}" + ) + if self.unpack == Unpack.UNPACK_32b_IN_16b: + res += f"\n with 32-bit to 2x 16b 
unpacking" + return res + + +@dataclass(frozen=True) +class St16x64bOp(_StBase): + """ + 16x64b TMEM store Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.16x64`` qualifier. + """ + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "St16x64bTrait": + ty = _cute_nvgpu_ir.CopyAtomSM100TmemStoreType.get( + copy_internal_type.mlir_type, + 16, + 64, + self.repeat.value, + ir.UnitAttr.get() if self.unpack == Unpack.UNPACK_32b_IN_16b else None, + ) + return St16x64bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class St16x64bTrait(Trait): + pass + + +@dataclass(frozen=True) +class St16x128bOp(_StBase): + """ + 16x128b TMEM store Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.16x128`` qualifier. + """ + + def __post_init__(self) -> None: + super().__post_init__() + if self.repeat == Repetition.x128: + raise OpError( + self, + "x128 repetition is not supported", + suggestion="choose one of x1, x2, x4, x8, x16, x32, x64", + ) + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "St16x128bTrait": + ty = _cute_nvgpu_ir.CopyAtomSM100TmemStoreType.get( + copy_internal_type.mlir_type, + 16, + 128, + self.repeat.value, + ir.UnitAttr.get() if self.unpack == Unpack.UNPACK_32b_IN_16b else None, + ) + return St16x128bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class St16x128bTrait(Trait): + pass + + +@dataclass(frozen=True) +class St16x256bOp(_StBase): + """ + 16x256b TMEM store Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.16x256`` qualifier. 
+ """ + + def __post_init__(self) -> None: + super().__post_init__() + if self.repeat in (Repetition.x128, Repetition.x64): + raise OpError( + self, + "x64 and x128 repetition is not supported", + suggestion="choose one of x1, x2, x4, x8, x16, x32", + ) + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "St16x256bTrait": + ty = _cute_nvgpu_ir.CopyAtomSM100TmemStoreType.get( + copy_internal_type.mlir_type, + 16, + 256, + self.repeat.value, + ir.UnitAttr.get() if self.unpack == Unpack.UNPACK_32b_IN_16b else None, + ) + return St16x256bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class St16x256bTrait(Trait): + pass + + +@dataclass(frozen=True) +class St16x32bx2Op(_StBase): + """ + 16x32x2b TMEM store Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.16x32x2`` qualifier. + """ + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "St16x32bx2Trait": + ty = _cute_nvgpu_ir.CopyAtomSM100TmemStoreType.get( + copy_internal_type.mlir_type, + 16, + 32, + self.repeat.value, + ir.UnitAttr.get() if self.unpack == Unpack.UNPACK_32b_IN_16b else None, + ) + return St16x32bx2Trait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class St16x32bx2Trait(Trait): + pass + + +@dataclass(frozen=True) +class St32x32bOp(_StBase): + """ + 32x32b TMEM store Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.32x32`` qualifier. 
+ """ + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "St32x32bTrait": + ty = _cute_nvgpu_ir.CopyAtomSM100TmemStoreType.get( + copy_internal_type.mlir_type, + 32, + 32, + self.repeat.value, + ir.UnitAttr.get() if self.unpack == Unpack.UNPACK_32b_IN_16b else None, + ) + return St32x32bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class St32x32bTrait(Trait): + pass diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/helpers.py b/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/helpers.py new file mode 100644 index 00000000..cac64131 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/helpers.py @@ -0,0 +1,301 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from typing import overload, Type, Tuple, Union + +from cutlass.cutlass_dsl import dsl_user_op + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir.dialects import nvvm + +from ...typing import ( + Shape, + IntTuple, + Layout, + Tensor, + Int, + Numeric, + NumericMeta, + Int16, + Int32, +) +from ... 
import core +from .mma import SmemLayoutAtomKind, CtaGroup +from .copy import ( + Pack, + Unpack, + Ld16x64bOp, + Ld16x128bOp, + Ld16x256bOp, + Ld16x32bx2Op, + Ld32x32bOp, + St16x64bOp, + St16x128bOp, + St16x256bOp, + St16x32bx2Op, + St32x32bOp, +) + + +#################################################################################################### +# +# Helper functions for MMA +# +#################################################################################################### + + +@dsl_user_op +def make_smem_layout_atom( + kind: SmemLayoutAtomKind, element_type: Type[Numeric], *, loc=None, ip=None +) -> core.ComposedLayout: + """ + Makes a SMEM layout Atom. + + This function creates a composed layout in unit of elements consistent with the requested layout + Atom kind and element data type. + + :param kind: The kind of layout Atom + :type kind: SmemLayoutAtomKind + :param element_type: The element data type to construct the layout for + :type element_type: Type[Numeric] + :return: The SMEM layout atom + :rtype: core.ComposedLayout + """ + if not isinstance(element_type, NumericMeta): + raise TypeError(f"element_type must be a Numeric, but got {element_type}") + + if kind in (SmemLayoutAtomKind.MN_INTER, SmemLayoutAtomKind.K_INTER): + num_contiguous_bits = 128 + sw = core.make_swizzle(0, 4, 3) + elif kind in (SmemLayoutAtomKind.MN_SW32, SmemLayoutAtomKind.K_SW32): + num_contiguous_bits = 256 + sw = core.make_swizzle(1, 4, 3) + elif kind in (SmemLayoutAtomKind.MN_SW64, SmemLayoutAtomKind.K_SW64): + num_contiguous_bits = 512 + sw = core.make_swizzle(2, 4, 3) + elif kind in (SmemLayoutAtomKind.MN_SW128, SmemLayoutAtomKind.K_SW128): + num_contiguous_bits = 1024 + sw = core.make_swizzle(3, 4, 3) + elif kind == SmemLayoutAtomKind.MN_SW128_32B: + num_contiguous_bits = 1024 + sw = core.make_swizzle(2, 5, 2) + else: + raise ValueError("unrecognized SMEM layout atom kind") + num_contiguous_elems = num_contiguous_bits // element_type.width + + if kind in ( + 
SmemLayoutAtomKind.MN_INTER, + SmemLayoutAtomKind.MN_SW32, + SmemLayoutAtomKind.MN_SW64, + SmemLayoutAtomKind.MN_SW128, + SmemLayoutAtomKind.MN_SW128_32B, + ): + # M/N-major layout + return core.make_composed_layout( + sw, + 0, + core.make_layout( + (num_contiguous_elems, 8), stride=(1, num_contiguous_elems) + ), + loc=loc, + ip=ip, + ) + else: + # K-major layout + return core.make_composed_layout( + sw, + 0, + core.make_layout( + (8, num_contiguous_elems), stride=(num_contiguous_elems, 1) + ), + loc=loc, + ip=ip, + ) + + +@overload +def tile_to_mma_shape( + atom: Layout, mma_tile_shape: Shape, order: IntTuple = None, *, loc=None, ip=None +) -> Layout: ... + + +@overload +def tile_to_mma_shape( + atom: core.ComposedLayout, + mma_tile_shape: Shape, + order: IntTuple = None, + *, + loc=None, + ip=None, +) -> core.ComposedLayout: ... + + +@dsl_user_op +def tile_to_mma_shape( + atom, mma_tile_shape: Shape, order: IntTuple = None, *, loc=None, ip=None +): + """ + Tiles a layout to an MMA shape. 
+ """ + # Default order is colexicographical + if order is None: + order = tuple(range(core.rank(mma_tile_shape) - 1)) + if core.rank(order) != core.rank(mma_tile_shape) - 1: + raise ValueError( + f"rank(order)={core.rank(order)} must be equal to " + f"rank(mma_tile_shape)-1={core.rank(mma_tile_shape)-1}" + ) + order_val = core._pack_int_tuple(order, loc=loc, ip=ip) + mma_tile_shape_val = core._pack_shape(mma_tile_shape, loc=loc, ip=ip) + + if not ( + core.is_static(atom) + and core.is_static(mma_tile_shape_val) + and core.is_static(order_val) + ): + raise ValueError("tile_to_mma_shape only supports static inputs") + + res_ty = _cute_nvgpu_ir.tile_to_mma_shape(atom, mma_tile_shape_val, order_val) + return _cute_ir.static(res_ty, loc=loc, ip=ip) + + +@dsl_user_op +def commit( + mbar_ptr: core.Pointer, + mask=None, + cta_group: CtaGroup = CtaGroup.ONE, + *, + loc=None, + ip=None, +) -> None: + """ + Perform an arrive operation on a mbarrier upon completion of previous MMA operations. + + :param mbar_ptr: A pointer to the mbarrier in SMEM + :type mbar_ptr: Pointer + :param mask: An optional multicast mask for the CTAs in the cluster to signal arrival to + :type mask: Int + """ + if cta_group == CtaGroup.ONE: + group = nvvm.Tcgen05GroupKind.CTA_1 + else: + assert cta_group == CtaGroup.TWO + group = nvvm.Tcgen05GroupKind.CTA_2 + + mbar_ptr = mbar_ptr.llvm_ptr + if mask is not None: + mask = Int16(mask).ir_value(loc=loc, ip=ip) + nvvm.tcgen05_commit_arrive( + mbar_ptr, multicast_mask=mask, group=group, loc=loc, ip=ip + ) + else: + nvvm.tcgen05_commit_arrive(mbar_ptr, group=group, loc=loc, ip=ip) + return + + +#################################################################################################### +# +# Helper functions for Copies +# +#################################################################################################### + + +def is_tmem_load(atom: core.CopyAtom) -> bool: + """ + Returns whether a CopyAtom instance is a TMEM load. 
+ """ + return isinstance( + atom.op, + ( + Ld16x64bOp, + Ld16x128bOp, + Ld16x256bOp, + Ld16x32bx2Op, + Ld32x32bOp, + ), + ) + + +def is_tmem_store(atom: core.CopyAtom) -> bool: + """ + Returns whether a CopyAtom instance is a TMEM store. + """ + return isinstance( + atom.op, + ( + St16x64bOp, + St16x128bOp, + St16x256bOp, + St16x32bx2Op, + St32x32bOp, + ), + ) + + +def get_tmem_copy_properties( + atom: core.CopyAtom, +) -> Tuple[int, int, int, Union[Pack, Unpack]]: + """ + Returns the properties of a TMEM copy atom (number of data paths, bits, repetitions, + and whether packing/unpacking is used). + """ + if isinstance(atom.op, (Ld16x64bOp, St16x64bOp)): + num_dp, num_bits = 16, 64 + elif isinstance(atom.op, (Ld16x128bOp, St16x128bOp)): + num_dp, num_bits = 16, 128 + elif isinstance(atom.op, (Ld16x256bOp, St16x256bOp)): + num_dp, num_bits = 16, 256 + elif isinstance(atom.op, (Ld16x32bx2Op, St16x32bx2Op)): + num_dp, num_bits = 16, 32 + elif isinstance(atom.op, (Ld32x32bOp, St32x32bOp)): + num_dp, num_bits = 32, 32 + else: + raise ValueError(f"expects 'atom' to be a TMEM copy, but got {atom}") + if is_tmem_load(atom): + return num_dp, num_bits, atom.op.repeat.value, atom.op.pack + else: + assert is_tmem_store(atom), "atom must be a TMEM store" + return num_dp, num_bits, atom.op.repeat.value, atom.op.unpack + + +@dsl_user_op +def find_tmem_tensor_col_offset(tmem_tensor: Tensor, *, loc=None, ip=None) -> Int: + """ + Computes the TMEM column offset given a TMEM tensor. 
+ + :param tmem_tensor: The TMEM tensor to use to compute the columns offset + :type tmem_tensor: Tensor + :return: The columns offset + :rtype: Int + """ + tmem_col_mask = 0x0000FFFF + offset = ( + core.cosize(core.recast_tensor(tmem_tensor, Int32).layout, loc=loc, ip=ip) + & tmem_col_mask + ) + if isinstance(offset, int): + return offset + return Int32(offset, loc=loc, ip=ip) + + +@dsl_user_op +def make_tmem_copy( + atom: core.CopyAtom, tmem_tensor: Tensor, *, loc=None, ip=None +) -> core.TiledCopy: + """ + Makes a Tiled Copy instance from a TMEM Copy Atom and a TMEM tensor. + """ + tiled_copy_val = _cute_nvgpu_ir.atom_make_tmem_copy( + atom._trait.value, tmem_tensor.value, loc=loc, ip=ip + ) + new_trait = type(atom._trait)(tiled_copy_val) + return core.TiledCopy(atom.op, new_trait) diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/mma.py b/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/mma.py new file mode 100644 index 00000000..096a4e12 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/mma.py @@ -0,0 +1,603 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +import enum +from dataclasses import dataclass +from typing import Type + +from cutlass.cutlass_dsl import CuTeDSL, T + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir import ir + +from ..common import OpError +from ...core import MmaOp, Trait, _pack_shape, rank, depth +from ...typing import ( + Shape, + Float8E5M2, + Float8E4M3FN, + Float16, + BFloat16, + Float32, + TFloat32, + Boolean, + Int8, + Uint8, + Int32, + Numeric, +) + + +#################################################################################################### +# +# MMA Ops and Traits +# +#################################################################################################### + + +class OperandMajorMode(enum.Enum): + """ + An enumeration for the majorness of the input operands of the MMA. + """ + + MN = _cute_ir.MajorMode.mn + K = _cute_ir.MajorMode.k + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + @classmethod + def _missing_(cls, value): + if isinstance(value, str): + value = value.upper() + if value == "MN": + return OperandMajorMode.MN + elif value == "K": + return OperandMajorMode.K + + def _to_ir(self) -> _cute_ir.MajorMode: + return self.value + + +class OperandSource(enum.Enum): + """ + An enumeration for the source memory location of the A input operand of the MMA. + """ + + TMEM = _cute_ir.MmaFragKind.tmem + SMEM = _cute_ir.MmaFragKind.smem_desc + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + def _to_ir(self) -> _cute_ir.MmaFragKind: + return self.value + + +class CtaGroup(enum.Enum): + """ + An enumeration for the ``cta_group`` qualifier of the MMA. 
+ """ + + ONE = 1 + TWO = 2 + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + +class Field(enum.Enum): + """ + An enumeration for the fields of the MMA Atom that can be modified at runtime. + """ + + NEGATE_A = "neg_a" + NEGATE_B = "neg_b" + ACCUMULATE = "accum_c" + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + def _to_ir_field_name(self) -> str: + return self.value + + +# Base class for all tcgen05 MMA Ops used to factor out some internal code +@dataclass(frozen=True) +class MmaOp(MmaOp): + a_dtype: Type[Numeric] + b_dtype: Type[Numeric] + acc_dtype: Type[Numeric] + shape_mnk: Shape + cta_group: CtaGroup + a_src: OperandSource + a_major_mode: OperandMajorMode + b_major_mode: OperandMajorMode + + admissible_archs = ["sm_100a"] + + def __post_init__(self) -> None: + # Verify arch + arch = CuTeDSL._get_dsl().envar.arch + if arch not in self.admissible_archs: + raise OpError( + self, + f"expects arch to be one of {self.admissible_archs}, but got {arch}", + suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture", + ) + # Verify that the user provided enum values + if not isinstance(self.cta_group, CtaGroup): + raise OpError( + self, + "expects the 'cta_group' Op parameter to be a tcgen05.CtaGroup instance", + ) + if not isinstance(self.a_src, OperandSource): + raise OpError( + self, + "expects the 'a_src' Op parameter to be a tcgen05.OperandSource instance", + ) + if not isinstance(self.a_major_mode, OperandMajorMode): + raise OpError( + self, + "expects the 'a_major_mode' Op parameter to be a tcgen05.OperandMajorMode instance", + ) + if not isinstance(self.b_major_mode, OperandMajorMode): + raise OpError( + self, + "expects the 'b_major_mode' Op parameter to be a tcgen05.OperandMajorMode instance", + ) + # Verify the 
instruction shape + if (rank(self.shape_mnk) not in [2, 3]) or (depth(self.shape_mnk) != 1): + raise OpError( + self, + f"expected a flat rank 2 or 3 tuple for the 'shape_mnk' Op parameter, " + f"but got {self.shape_mnk}", + ) + m, n = self.shape_mnk[0], self.shape_mnk[1] + if self.cta_group == CtaGroup.ONE: + if m not in [64, 128]: + raise OpError(self, f"expects the M-mode to be 64 or 128, but got {m}") + if m == 64: + if (n < 8) or (n > 256) or (n % 8 != 0): + raise OpError( + self, + f"expects the N-mode to satisfy 8 <= N <= 256 and N % 8 == 0, but got {n}", + ) + elif m == 128: + if (n < 16) or (n > 256) or (n % 16 != 0): + raise OpError( + self, + f"expects the N-mode to satisfy 8 <= N <= 256 and N % 16 == 0, but got {n}", + ) + else: + if m not in [128, 256]: + raise OpError(self, f"expects the M-mode to be 128 or 256, but got {m}") + if (n < 32) or (n > 256) or (n % 32 != 0): + raise OpError( + self, + f"expects the N-mode to satisfy 32 <= N <= 256 and N % 32 == 0, but got {n}", + ) + + def __str__(self) -> str: + return ( + self.__class__.descriptive_name # type: ignore + + f"\n A data type = {self.a_dtype}" + + f"\n B data type = {self.b_dtype}" + + f"\n Accumulator data type = {self.acc_dtype}" + + f"\n CTA group = {self.cta_group}" + + f"\n A source location = {self.a_src}" + + f"\n A major mode = {self.a_major_mode}" + + f"\n B major mode = {self.b_major_mode}" + + f"\n Instruction shape MNK = {self.shape_mnk}" + ) + + +class MmaTrait(Trait): + admissible_fields = [Field.ACCUMULATE, Field.NEGATE_A, Field.NEGATE_B] + + def set(self, field, value, *, loc=None, ip=None) -> None: + if field not in self.admissible_fields: + raise ValueError( + f"expects field to be one of {self.admissible_fields}, but got {field}" + ) + field_name = f"#cute_nvgpu.atom_mma_field_sm100<{field._to_ir_field_name()}>" + attr = ir.Attribute.parse(field_name) + self.value = _cute_nvgpu_ir.atom_set_value( + self.value, attr, Boolean(value).ir_value(loc=loc, ip=ip), loc=loc, ip=ip + 
) + + +# +# TF32 MMA +# + + +@dataclass(frozen=True) +class MmaTF32Op(MmaOp): + """ + TF32 tcgen05 MMA Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.kind::tf32`` qualifier. + """ + + descriptive_name = "tcgen05 TF32 MMA Operation" + + def __init__( + self, + instruction_shape: Shape, + cta_group: CtaGroup, + a_src: OperandSource, + a_major_mode: OperandMajorMode, + b_major_mode: OperandMajorMode, + ) -> None: + super().__init__( + TFloat32, + TFloat32, + Float32, + instruction_shape, + cta_group, + a_src, + a_major_mode, + b_major_mode, + ) + self._verify() + + def _verify(self) -> None: + # Verify the instruction shape + instruction_k = 8 + if rank(self.shape_mnk) == 2: + object.__setattr__(self, "shape_mnk", (*self.shape_mnk, instruction_k)) + if self.shape_mnk[2] != instruction_k: + raise OpError( + self, + f"expects the instruction extent in the K-mode to be {instruction_k}, " + f"but got {self.shape_mnk[2]}", + ) + + def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaTF32Trait": + shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip) + ty = _cute_nvgpu_ir.MmaAtomSM100UMMAType.get( + shape_mnk.type.attribute, + self.cta_group.value, + self.a_major_mode._to_ir(), + self.b_major_mode._to_ir(), + self.a_dtype.mlir_type, + self.b_dtype.mlir_type, + self.acc_dtype.mlir_type, + self.a_src._to_ir(), + 0, + ) + return MmaTF32Trait( + _cute_nvgpu_ir.make_sm100_mma( + ty, + Boolean(False).ir_value(loc=loc, ip=ip), + Boolean(False).ir_value(loc=loc, ip=ip), + Boolean(False).ir_value(loc=loc, ip=ip), + loc=loc, + ip=ip, + ) + ) + + +class MmaTF32Trait(MmaTrait): + pass + + +# +# F16/BF16 MMA +# + + +@dataclass(frozen=True) +class MmaF16BF16Op(MmaOp): + """ + F16/BF16 tcgen05 MMA Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.kind::f16`` qualifier. 
+ """ + + descriptive_name = "tcgen05 F16/BF16 MMA Operation" + + def __init__( + self, + ab_dtype: Type[Numeric], + acc_dtype: Type[Numeric], + instruction_shape: Shape, + cta_group: CtaGroup, + a_src: OperandSource, + a_major_mode: OperandMajorMode, + b_major_mode: OperandMajorMode, + ) -> None: + super().__init__( + ab_dtype, + ab_dtype, + acc_dtype, + instruction_shape, + cta_group, + a_src, + a_major_mode, + b_major_mode, + ) + self._verify() + + def _verify(self) -> None: + # Input data type verification + if self.a_dtype not in [Float16, BFloat16]: + raise OpError( + self, + "expects the 'ab_dtype' Op parameter to be one of Float16 or BFloat16", + ) + assert self.b_dtype == self.a_dtype, "a_dtype and b_dtype must be the same" + # Accumulator data type verification + if self.acc_dtype not in [Float16, Float32]: + raise OpError( + self, + "expects the 'acc_dtype' Op parameter to be one of Float16 or Float32", + ) + # Instruction shape verification + instruction_k = 16 + if rank(self.shape_mnk) == 2: + object.__setattr__(self, "shape_mnk", (*self.shape_mnk, instruction_k)) + if self.shape_mnk[2] != instruction_k: + raise OpError( + self, + f"expects the instruction extent in the K-mode to be {instruction_k}, " + f"but got {self.shape_mnk[2]}", + ) + + def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaF16BF16Trait": + shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip) + ty = _cute_nvgpu_ir.MmaAtomSM100UMMAType.get( + shape_mnk.type.attribute, + self.cta_group.value, + self.a_major_mode._to_ir(), + self.b_major_mode._to_ir(), + self.a_dtype.mlir_type, + self.b_dtype.mlir_type, + self.acc_dtype.mlir_type, + self.a_src._to_ir(), + 0, + ) + return MmaF16BF16Trait( + _cute_nvgpu_ir.make_sm100_mma( + ty, + Boolean(False).ir_value(loc=loc, ip=ip), + Boolean(False).ir_value(loc=loc, ip=ip), + Boolean(False).ir_value(loc=loc, ip=ip), + loc=loc, + ip=ip, + ) + ) + + +class MmaF16BF16Trait(MmaTrait): + pass + + +# +# I8 MMA +# + + +@dataclass(frozen=True) 
+class MmaI8Op(MmaOp): + """ + I8 tcgen05 MMA Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.kind::i8`` qualifier. + """ + + descriptive_name = "tcgen05 I8 MMA Operation" + + def __init__( + self, + ab_dtype: Type[Numeric], + instruction_shape: Shape, + cta_group: CtaGroup, + a_src: OperandSource, + a_major_mode: OperandMajorMode, + b_major_mode: OperandMajorMode, + ) -> None: + super().__init__( + ab_dtype, + ab_dtype, + Int32, + instruction_shape, + cta_group, + a_src, + a_major_mode, + b_major_mode, + ) + self._verify() + + def _verify(self) -> None: + # Input data type verification + if self.a_dtype not in [Int8, Uint8]: + raise OpError( + self, + "expects the 'ab_dtype' Op parameter to be one of Int8 or Uint8", + ) + assert self.b_dtype == self.a_dtype, "a_dtype and b_dtype must be the same" + # Instruction shape verification + instruction_k = 32 + if rank(self.shape_mnk) == 2: + object.__setattr__(self, "shape_mnk", (*self.shape_mnk, instruction_k)) + if self.shape_mnk[2] != instruction_k: + raise OpError( + self, + f"expects the instruction extent in the K-mode to be {instruction_k}, " + f"but got {self.shape_mnk[2]}", + ) + + def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaI8Trait": + shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip) + ty = _cute_nvgpu_ir.MmaAtomSM100UMMAType.get( + shape_mnk.type.attribute, + self.cta_group.value, + self.a_major_mode._to_ir(), + self.b_major_mode._to_ir(), + (T.si8() if self.a_dtype.signed else T.ui8()), + (T.si8() if self.b_dtype.signed else T.ui8()), + T.si32(), + self.a_src._to_ir(), + 0, + ) + return MmaI8Trait( + _cute_nvgpu_ir.make_sm100_mma( + ty, + Boolean(False).ir_value(loc=loc, ip=ip), + Boolean(False).ir_value(loc=loc, ip=ip), + Boolean(False).ir_value(loc=loc, ip=ip), + loc=loc, + ip=ip, + ) + ) + + +class MmaI8Trait(MmaTrait): + pass + + +# +# F8F6F4 MMA +# + + +@dataclass(frozen=True) +class MmaFP8Op(MmaOp): + """ + F8 tcgen05 MMA Operation. 
+ + See the `PTX documentation `__. + """ + + descriptive_name = "tcgen05 F8 MMA Operation" + + def __init__( + self, + ab_dtype: Type[Numeric], + acc_dtype: Type[Numeric], + instruction_shape: Shape, + cta_group: CtaGroup, + a_src: OperandSource, + a_major_mode: OperandMajorMode, + b_major_mode: OperandMajorMode, + ) -> None: + + super().__init__( + ab_dtype, + ab_dtype, + acc_dtype, + instruction_shape, + cta_group, + a_src, + a_major_mode, + b_major_mode, + ) + self._verify() + + def _verify(self) -> None: + # Input data type verification + if self.a_dtype not in [Float8E5M2, Float8E4M3FN]: + raise OpError( + self, + "expects the 'ab_dtype' Op parameter to be one of Float8E5M2 or Float8E4M3FN", + ) + assert self.b_dtype == self.a_dtype, "a_dtype and b_dtype must be the same" + # Accumulator data type verification + if self.acc_dtype not in [Float16, Float32]: + raise OpError( + self, + "expects the 'acc_dtype' Op parameter to be one of Float16 or Float32", + ) + # Instruction shape verification + instruction_k = 32 + if rank(self.shape_mnk) == 2: + object.__setattr__(self, "shape_mnk", (*self.shape_mnk, instruction_k)) + if self.shape_mnk[2] != instruction_k: + raise OpError( + self, + f"expects the instruction extent in the K-mode to be {instruction_k}, " + f"but got {self.shape_mnk[2]}", + ) + + def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaFP8Trait": + shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip) + ty = _cute_nvgpu_ir.MmaAtomSM100UMMAType.get( + shape_mnk.type.attribute, + self.cta_group.value, + self.a_major_mode._to_ir(), + self.b_major_mode._to_ir(), + self.a_dtype.mlir_type, + self.b_dtype.mlir_type, + self.acc_dtype.mlir_type, + self.a_src._to_ir(), + 0, + ) + return MmaFP8Trait( + _cute_nvgpu_ir.make_sm100_mma( + ty, + Boolean(False).ir_value(loc=loc, ip=ip), + Boolean(False).ir_value(loc=loc, ip=ip), + Boolean(False).ir_value(loc=loc, ip=ip), + loc=loc, + ip=ip, + ) + ) + + +class MmaFP8Trait(MmaTrait): + pass + + 
+#################################################################################################### +# +# SMEM layout atoms +# +#################################################################################################### + + +class SmemLayoutAtomKind(enum.Enum): + """ + Enum class for the kinds of SMEM layout atoms for SM100. + + Given a swizzle kind, an SMEM layout atom is the compact layout of smallest size that can be + used to construct an SMEM layout using blocked product for operand A or B such that the + resulting layout is legal for both TMA and UMMA. + + Note that there are other ways of creating legal layouts for operand A and B. + """ + + MN_INTER = enum.auto() + MN_SW32 = enum.auto() + MN_SW64 = enum.auto() + MN_SW128 = enum.auto() + MN_SW128_32B = enum.auto() + K_INTER = enum.auto() + K_SW32 = enum.auto() + K_SW64 = enum.auto() + K_SW128 = enum.auto() diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/warp/__init__.py b/python/CuTeDSL/cutlass/cute/nvgpu/warp/__init__.py new file mode 100644 index 00000000..c2b3f7cf --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/warp/__init__.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +from .copy import * +from .mma import * + + +# __all__ is required here for documentation generation +__all__ = [ + # mma.py + "MmaF16BF16Op", + # copy.py + "LdMatrix8x8x16bOp", + "LdMatrix16x16x8bOp", + "StMatrix8x8x16bOp", + "StMatrix16x8x8bOp", +] diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/warp/copy.py b/python/CuTeDSL/cutlass/cute/nvgpu/warp/copy.py new file mode 100644 index 00000000..a6ad4ca8 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/warp/copy.py @@ -0,0 +1,189 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from dataclasses import dataclass +from typing import Type + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir import ir + +from ..common import OpError +from ...core import CopyOp, Trait, _pack_shape +from ...typing import Numeric + + +@dataclass(frozen=True) +class BaseOp(CopyOp): + transpose: bool = False + num_matrices: int = 1 + + def __post_init__(self) -> None: + if not isinstance(self.transpose, bool): + raise OpError( + self, + "expects the 'transpose' Op parameter to be a bool instance", + ) + + def __str__(self) -> str: + res = ( + f"{self.__class__.__name__[:-2]} Copy Operation" + + f"\n number of matrices = {self.num_matrices}" + ) + if self.transpose: + res += f"\n transposed" + return res + + +@dataclass(frozen=True) +class LdMatrix8x8x16bOp(BaseOp): + """ + 8x8 ``ldmatrix`` Operation. + + See the `PTX documentation `__. 
+ This operation corresponds to the ``.m8n8`` qualifier. + """ + + def __post_init__(self) -> None: + super().__post_init__() + if self.num_matrices not in [1, 2, 4]: + raise OpError( + self, + "expects the 'num_matrices' Op parameter to be one of [1,2,4]", + ) + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "LdMatrix8x8x16bTrait": + mode = _pack_shape((8, 8), loc=loc, ip=ip) + ty = _cute_nvgpu_ir.CopyAtomLdsmType.get( + copy_internal_type.mlir_type, + mode.type.attribute, + _cute_nvgpu_ir.LdsmSzPattern.u16, + self.num_matrices, + ir.UnitAttr.get() if self.transpose else None, + ) + return LdMatrix8x8x16bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class LdMatrix8x8x16bTrait(Trait): + pass + + +@dataclass(frozen=True) +class LdMatrix16x16x8bOp(BaseOp): + """ + 16x16 8-bit ``ldmatrix`` Operation. + + See the `PTX documentation `__. + This operation corresponds to the ``.m16n16`` and the ``.b16`` qualifiers. + """ + + def __init__(self, num_matrices: int) -> None: + super().__init__(transpose=True, num_matrices=num_matrices) + self._verify() + + def _verify(self): + assert self.transpose, "transpose must be True" + if self.num_matrices not in [1, 2]: + raise OpError( + self, + "expects the 'num_matrices' Op parameter to be one of [1,2]", + ) + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "LdMatrix16x16x8bTrait": + mode = _pack_shape((16, 16), loc=loc, ip=ip) + ty = _cute_nvgpu_ir.CopyAtomLdsmType.get( + copy_internal_type.mlir_type, + mode.type.attribute, + _cute_nvgpu_ir.LdsmSzPattern.u8, + self.num_matrices, + ir.UnitAttr.get(), + ) + return LdMatrix16x16x8bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class LdMatrix16x16x8bTrait(Trait): + pass + + +@dataclass(frozen=True) +class StMatrix8x8x16bOp(BaseOp): + """ + 8x8 ``stmatrix`` Operation. + + See the `PTX documentation `__. + This operation corresponds to the ``m8n8`` qualifier. 
+ """ + + def __post_init__(self) -> None: + super().__post_init__() + if self.num_matrices not in [1, 2, 4]: + raise OpError( + self, + "expects the 'num_matrices' Op parameter to be one of [1,2,4]", + ) + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "StMatrix8x8x16bTrait": + mode = _pack_shape((8, 8), loc=loc, ip=ip) + ty = _cute_nvgpu_ir.CopyAtomStsmType.get( + copy_internal_type.mlir_type, + mode.type.attribute, + self.num_matrices, + ir.UnitAttr.get() if self.transpose else None, + ) + return StMatrix8x8x16bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class StMatrix8x8x16bTrait(Trait): + pass + + +@dataclass(frozen=True) +class StMatrix16x8x8bOp(BaseOp): + """ + 16x8 ``stmatrix`` Operation. + + See the `PTX documentation `__. + This operation corresponds to the ``m16n8`` qualifier. + """ + + def __init__(self, num_matrices: int) -> None: + super().__init__(transpose=True, num_matrices=num_matrices) + self._verify() + + def _verify(self): + if self.num_matrices not in [1, 2, 4]: + assert self.transpose, "transpose must be True" + raise OpError( + self, + "expects the 'num_matrices' Op parameter to be one of [1,2,4]", + ) + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "StMatrix16x8x8bTrait": + mode = _pack_shape((16, 8), loc=loc, ip=ip) + ty = _cute_nvgpu_ir.CopyAtomStsmType.get( + copy_internal_type.mlir_type, + mode.type.attribute, + self.num_matrices, + ir.UnitAttr.get(), + ) + return StMatrix16x8x8bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class StMatrix16x8x8bTrait(Trait): + pass diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/warp/mma.py b/python/CuTeDSL/cutlass/cute/nvgpu/warp/mma.py new file mode 100644 index 00000000..d7fe3b3b --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/warp/mma.py @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from dataclasses import dataclass +from typing import Type + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir + +from ..common import OpError +from ...core import MmaOp, Trait, _pack_shape +from ...typing import Shape, Float16, BFloat16, Float32, Numeric + + +@dataclass(frozen=True) +class MmaF16BF16Op(MmaOp): + """ + F16/BF16 tcgen05 MMA Operation. + + See the `PTX documentation `__. + This Operation covers the instructions using the ``.f16`` or ``.bf16`` qualifiers for the input operands. + """ + + ab_dtype: Type[Numeric] + acc_dtype: Type[Numeric] + shape_mnk: Shape + + def __post_init__(self) -> None: + if self.ab_dtype not in [Float16, BFloat16]: + raise OpError( + self, + "expects the 'ab_dtype' Op parameter to be one of Float16 or BFloat16", + ) + if self.acc_dtype not in [Float16, Float32]: + raise OpError( + self, + "expects the 'acc_dtype' Op parameter to be one of Float16 or Float32", + ) + if (self.ab_dtype == BFloat16) and (self.acc_dtype != Float32): + raise OpError( + self, + "expects the 'acc_dtype' Op parameter to be Float32 when 'ab_dtype' is BFloat16", + ) + if self.shape_mnk not in [(16, 8, 8), (16, 8, 16)]: + raise OpError( + self, + "expects the 'shape_mnk' Op parameter to be one of (16,8,8) or (16,8,16)", + ) + + def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaF16BF16Trait": + shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip) + ty = _cute_nvgpu_ir.MmaAtomSM80Type.get( + shape_mnk.type.attribute, + self.ab_dtype.mlir_type, + 
self.ab_dtype.mlir_type, + self.acc_dtype.mlir_type, + ) + return MmaF16BF16Trait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + def __str__(self) -> str: + return ( + "warp-level F16/BF16 MMA Operation" + + f"\n A/B data type = {self.ab_dtype}" + + f"\n Accumulator data type = {self.acc_dtype}" + + f"\n Instruction shape MNK = {self.shape_mnk}" + ) + + +class MmaF16BF16Trait(Trait): + pass diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/__init__.py b/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/__init__.py new file mode 100644 index 00000000..49a40165 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/__init__.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from .mma import * +from .helpers import * + +# __all__ is required here for documentation generation +__all__ = [ + # mma.py + "OperandMajorMode", + "OperandSource", + "Field", + "MmaF16BF16Op", + "MmaF8Op", + "SmemLayoutAtomKind", + # helpers.py + "make_smem_layout_atom", + "fence", + "commit_group", + "wait_group", +] diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/helpers.py b/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/helpers.py new file mode 100644 index 00000000..f6284134 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/helpers.py @@ -0,0 +1,109 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from typing import Type + +from cutlass.cutlass_dsl import dsl_user_op + +from cutlass._mlir.dialects import nvvm + +from ...typing import Numeric, NumericMeta +from ... import core +from .mma import SmemLayoutAtomKind + + +@dsl_user_op +def make_smem_layout_atom( + kind: SmemLayoutAtomKind, element_type: Type[Numeric], *, loc=None, ip=None +) -> core.ComposedLayout: + """ + Makes a SMEM layout Atom. + + This function creates a composed layout in unit of elements consistent with the requested layout + Atom kind and element data type. + + :param kind: The kind of layout Atom + :type kind: SmemLayoutAtomKind + :param element_type: The element data type to construct the layout for + :type element_type: Type[Numeric] + :return: The SMEM layout atom + :rtype: core.ComposedLayout + """ + if not isinstance(element_type, NumericMeta): + raise TypeError(f"element_type must be a Numeric, but got {element_type}") + + if kind in (SmemLayoutAtomKind.MN_INTER, SmemLayoutAtomKind.K_INTER): + num_contiguous_bits = 128 + sw = core.make_swizzle(0, 4, 3) + elif kind in (SmemLayoutAtomKind.MN_SW32, SmemLayoutAtomKind.K_SW32): + num_contiguous_bits = 256 + sw = core.make_swizzle(1, 4, 3) + elif kind in (SmemLayoutAtomKind.MN_SW64, SmemLayoutAtomKind.K_SW64): + num_contiguous_bits = 512 + sw = core.make_swizzle(2, 4, 3) + elif kind in (SmemLayoutAtomKind.MN_SW128, SmemLayoutAtomKind.K_SW128): + num_contiguous_bits = 1024 + sw = core.make_swizzle(3, 4, 3) + else: + raise ValueError("unrecognized SMEM layout atom kind") + num_contiguous_elems = num_contiguous_bits 
// element_type.width + + if kind in ( + SmemLayoutAtomKind.MN_INTER, + SmemLayoutAtomKind.MN_SW32, + SmemLayoutAtomKind.MN_SW64, + SmemLayoutAtomKind.MN_SW128, + ): + # M/N-major layout + return core.make_composed_layout( + sw, + 0, + core.make_layout( + (num_contiguous_elems, 8), stride=(1, num_contiguous_elems) + ), + loc=loc, + ip=ip, + ) + else: + # K-major layout + return core.make_composed_layout( + sw, + 0, + core.make_layout( + (8, num_contiguous_elems), stride=(num_contiguous_elems, 1) + ), + loc=loc, + ip=ip, + ) + + +@dsl_user_op +def fence(*, loc=None, ip=None) -> None: + """ + See the `PTX documentation `__. + """ + nvvm.wgmma_fence_aligned(loc=loc, ip=ip) + + +@dsl_user_op +def commit_group(*, loc=None, ip=None) -> None: + """ + See the `PTX documentation `__. + """ + nvvm.wgmma_commit_group_sync_aligned(loc=loc, ip=ip) + + +@dsl_user_op +def wait_group(group, *, loc=None, ip=None) -> None: + """ + See the `PTX documentation `__. + """ + nvvm.wgmma_wait_group_sync_aligned(group, loc=loc, ip=ip) diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/mma.py b/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/mma.py new file mode 100644 index 00000000..b3749574 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/mma.py @@ -0,0 +1,380 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +import enum +from dataclasses import dataclass +from typing import Type + +from cutlass.cutlass_dsl import CuTeDSL + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir import ir + +from ..common import OpError +from ...core import MmaOp, Trait, _pack_shape, rank, depth +from ...typing import ( + Shape, + Float16, + BFloat16, + Float32, + Boolean, + Float8E5M2, + Float8E4M3FN, + Numeric, +) + + +#################################################################################################### +# +# MMA Ops and Traits +# +#################################################################################################### + + +class OperandMajorMode(enum.Enum): + """ + An enumeration for the majorness of the input operands of the MMA. + """ + + MN = _cute_ir.MajorMode.mn + K = _cute_ir.MajorMode.k + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + @classmethod + def _missing_(cls, value): + if isinstance(value, str): + value = value.upper() + if value == "MN": + return OperandMajorMode.MN + elif value == "K": + return OperandMajorMode.K + + def _to_ir(self) -> _cute_ir.MajorMode: + return self.value + + +class OperandSource(enum.Enum): + """ + An enumeration for the source memory location of the A input operand of the MMA. + """ + + RMEM = _cute_ir.MmaFragKind.rmem + SMEM = _cute_ir.MmaFragKind.smem_desc + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + def _to_ir(self) -> _cute_ir.MmaFragKind: + return self.value + + +class Field(enum.Enum): + """ + An enumeration for the fields of the MMA Atom that can be modified at runtime. 
+ """ + + ACCUMULATE = "accum_c" + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + def _to_ir_field_name(self) -> str: + return self.value + + +@dataclass(frozen=True) +class MmaOp(MmaOp): + a_dtype: Type[Numeric] + b_dtype: Type[Numeric] + acc_dtype: Type[Numeric] + shape_mnk: Shape + a_src: OperandSource + a_major_mode: OperandMajorMode + b_major_mode: OperandMajorMode + + admissible_archs = ["sm_90a"] + + def __post_init__(self) -> None: + # Verify arch + arch = CuTeDSL._get_dsl().envar.arch + if arch not in self.admissible_archs: + raise OpError( + self, + f"expects arch to be one of {self.admissible_archs}, but got {arch}", + suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture", + ) + # Verify that the user provided enum values + if not isinstance(self.a_src, OperandSource): + raise OpError( + self, + "expects the 'a_src' Op parameter to be a warpgroup.OperandSource instance", + ) + if not isinstance(self.a_major_mode, OperandMajorMode): + raise OpError( + self, + "expects the 'a_major_mode' Op parameter to be a warpgroup.OperandMajorMode instance", + ) + if not isinstance(self.b_major_mode, OperandMajorMode): + raise OpError( + self, + "expects the 'b_major_mode' Op parameter to be a warpgroup.OperandMajorMode instance", + ) + # Verify instruction shape + if (rank(self.shape_mnk) not in [2, 3]) or (depth(self.shape_mnk) != 1): + raise OpError( + self, + f"expected a flat rank 2 or 3 tuple for the 'shape_mnk' Op parameter, " + f"but got {self.shape_mnk}", + ) + m, n = self.shape_mnk[0], self.shape_mnk[1] + if m != 64: + raise OpError(self, f"expects the M-mode to be 64, but got {m}") + if (n < 8) or (n > 256) or (n % 8 != 0): + raise OpError( + self, + f"expects the N-mode to satisfy 8 <= N <= 256 and N % 8 == 0. 
but got {n}", + ) + + def __str__(self) -> str: + return ( + self.__class__.descriptive_name # type: ignore + + f"\n A data type = {self.a_dtype}" + + f"\n B data type = {self.b_dtype}" + + f"\n Accumulator data type = {self.acc_dtype}" + + f"\n A source location = {self.a_src}" + + f"\n A major mode = {self.a_major_mode}" + + f"\n B major mode = {self.b_major_mode}" + + f"\n Instruction shape MNK = {self.shape_mnk}" + ) + + +class MmaTrait(Trait): + admissible_fields = [Field.ACCUMULATE] + + def set(self, field, value, *, loc=None, ip=None) -> None: + if field not in self.admissible_fields: + raise ValueError( + f"invalid field, must be {Field.ACCUMULATE}, but got {field}" + ) + field_name = f"#cute_nvgpu.atom_mma_field_sm90<{field._to_ir_field_name()}>" + attr = ir.Attribute.parse(field_name) + self.value = _cute_nvgpu_ir.atom_set_value( + self.value, attr, Boolean(value).ir_value(loc=loc, ip=ip), loc=loc, ip=ip + ) + + +@dataclass(frozen=True) +class MmaF16BF16Op(MmaOp): + """ + F16/BF16 warpgroup MMA Operation. + + See the `PTX documentation `__. + This Operation covers the instructions using the ``.f16`` or ``.bf16`` qualifiers for the input operands. 
+ """ + + descriptive_name = "warpgroup F16/BF16 MMA Operation" + + def __init__( + self, + ab_dtype: Type[Numeric], + acc_dtype: Type[Numeric], + instruction_shape: Shape, + a_src: OperandSource, + a_major_mode: OperandMajorMode, + b_major_mode: OperandMajorMode, + ) -> None: + super().__init__( + ab_dtype, + ab_dtype, + acc_dtype, + instruction_shape, + a_src, + a_major_mode, + b_major_mode, + ) + self._verify() + + def _verify(self) -> None: + # Input data type verification + if self.a_dtype not in [Float16, BFloat16]: + raise OpError( + self, + "expects the 'ab_dtype' Op parameter to be one of Float16 or BFloat16", + ) + assert self.b_dtype == self.a_dtype, "a_dtype and b_dtype must be the same" + # Accumulator data type verification + if self.acc_dtype not in [Float16, Float32]: + raise OpError( + self, + "expects the 'acc_dtype' Op parameter to be one of Float16 or Float32", + ) + if (self.a_dtype == BFloat16) and (self.acc_dtype != Float32): + raise OpError( + self, + "expects the 'acc_dtype' Op parameter to be Float32 when 'ab_dtype' is BFloat16", + ) + # Verify the instruction shape + instruction_k = 16 + if rank(self.shape_mnk) == 2: + object.__setattr__(self, "shape_mnk", (*self.shape_mnk, instruction_k)) + if self.shape_mnk[2] != instruction_k: + raise OpError( + self, + f"expects the instruction extent in the K-mode to be {instruction_k}, " + f"but got {self.shape_mnk[2]}", + ) + + def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaF16BF16Trait": + shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip) + ty = _cute_nvgpu_ir.MmaAtomSM90Type.get( + shape_mnk.type.attribute, + self.a_major_mode._to_ir(), + self.b_major_mode._to_ir(), + self.a_dtype.mlir_type, + self.b_dtype.mlir_type, + self.acc_dtype.mlir_type, + self.a_src._to_ir(), + ) + return MmaF16BF16Trait( + _cute_nvgpu_ir.make_sm90_mma( + ty, + Boolean(False).ir_value(loc=loc, ip=ip), + loc=loc, + ip=ip, + ) + ) + + +class MmaF16BF16Trait(MmaTrait): + pass + + +@dataclass(frozen=True) 
+class MmaF8Op(MmaOp): + """ + F8 warpgroup MMA Operation. + + See the `PTX documentation `__. + This Operation covers the instructions using the ``.e4m3`` or ``.e5m2`` qualifiers for the input operands. + """ + + descriptive_name = "warpgroup F8 MMA Operation" + + def __init__( + self, + a_dtype: Type[Numeric], + b_dtype: Type[Numeric], + acc_dtype: Type[Numeric], + instruction_shape: Shape, + a_src: OperandSource, + a_major_mode: OperandMajorMode, + b_major_mode: OperandMajorMode, + ) -> None: + super().__init__( + a_dtype, + b_dtype, + acc_dtype, + instruction_shape, + a_src, + a_major_mode, + b_major_mode, + ) + self._verify() + + def _verify(self): + # Input data type verification + if self.a_dtype not in [Float8E5M2, Float8E4M3FN]: + raise OpError( + self, + "expects the 'a_dtype' Op parameter to be one of Float8E5M2 or Float8E4M3FN", + ) + if self.b_dtype not in [Float8E5M2, Float8E4M3FN]: + raise OpError( + self, + "expects the 'b_dtype' Op parameter to be one of Float8E5M2 or Float8E4M3FN", + ) + # Accumulator data type verification + if self.acc_dtype != Float32: + raise OpError( + self, + "expects the 'acc_dtype' Op parameter to be Float32", + ) + # Verify the instruction shape + instruction_k = 32 + if rank(self.shape_mnk) == 2: + object.__setattr__(self, "shape_mnk", (*self.shape_mnk, instruction_k)) + if self.shape_mnk[2] != instruction_k: + raise OpError( + self, + f"expects the instruction extent in the K-mode to be {instruction_k}, " + f"but got {self.shape_mnk[2]}", + ) + + def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaF8Trait": + shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip) + ty = _cute_nvgpu_ir.MmaAtomSM90Type.get( + shape_mnk.type.attribute, + self.a_major_mode._to_ir(), + self.b_major_mode._to_ir(), + self.a_dtype.mlir_type, + self.b_dtype.mlir_type, + self.acc_dtype.mlir_type, + self.a_src._to_ir(), + ) + return MmaF8Trait( + _cute_nvgpu_ir.make_sm90_mma( + ty, Boolean(False).ir_value(loc=loc, ip=ip), loc=loc, 
ip=ip + ) + ) + + +class MmaF8Trait(MmaTrait): + pass + + +#################################################################################################### +# +# SMEM layout atoms +# +#################################################################################################### + + +class SmemLayoutAtomKind(enum.Enum): + """ + Enum class for the kinds of SMEM layout atoms for SM90. + + Given a swizzle kind, an SMEM layout atom is the compact layout of smallest size that can + be used to construct an SMEM layout using blocked product for operand A or B such that the + resulting layout is legal for both TMA and UMMA. + + Note that there are other ways of creating legal layouts for operand A and B. + """ + + MN_INTER = enum.auto() + MN_SW32 = enum.auto() + MN_SW64 = enum.auto() + MN_SW128 = enum.auto() + K_INTER = enum.auto() + K_SW32 = enum.auto() + K_SW64 = enum.auto() + K_SW128 = enum.auto() diff --git a/python/CuTeDSL/cutlass/cute/runtime.py b/python/CuTeDSL/cutlass/cute/runtime.py new file mode 100644 index 00000000..47e67b88 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/runtime.py @@ -0,0 +1,515 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +import ctypes +from functools import lru_cache +import itertools +import operator +from time import time +from typing import Union + +# MLIR modules imports +from cutlass._mlir import ir +import cutlass._mlir.dialects.cute as _cute_ir + +from cutlass.cutlass_dsl import TensorFormat, JitArgAdapterRegistry + +# Local modules imports +from .typing import ( + AddressSpace, + Tensor, + Type, + Pointer, + Boolean, + Numeric, + Float4E2M1FN, + Int64, + Int32, + Int16, + Int8, + Uint64, + Uint32, + Uint16, + Uint8, + Float64, + Float32, + Float16, + BFloat16, + Float8E5M2, +) +from .core import find, _Tensor as CoreTensor + + +class _Pointer(Pointer): + """Runtime representation of a pointer that can inter-operate with various data structures, + including numpy arrays and device memory. + + :param pointer: The pointer to the data + :type pointer: int or pointer-like object + :param dtype: Data type of the elements pointed to + :type dtype: Type + :param mem_space: Memory space where the pointer resides, defaults to generic + :type mem_space: _cute_ir.AddressSpace, optional + :param assumed_align: Assumed alignment of input pointer in bytes, defaults to None + :type assumed_align: int, optional + + :ivar _pointer: The underlying pointer + :ivar _dtype: Data type of the elements + :ivar _addr_space: Memory space of the pointer + :ivar _assumed_align: Alignment of the pointer in bytes + :ivar _desc: C-type descriptor for the pointer + :ivar _c_pointer: C-compatible pointer representation + """ + + def __init__( + self, + pointer, + dtype, + mem_space: _cute_ir.AddressSpace = _cute_ir.AddressSpace.generic, + assumed_align=None, + ): + self._pointer = pointer + self._dtype = dtype + self._addr_space = mem_space + + is_in_device = mem_space == _cute_ir.AddressSpace.gmem + if assumed_align is None: + if is_in_device: + self._assumed_align = 32 + else: + self._assumed_align = dtype.width // 8 + else: + self._assumed_align = assumed_align + + class 
PtrDescriptor(ctypes.Structure): + """A ctype descriptor for CuTe memref ptr""" + + _fields_ = [("ptr", ctypes.c_void_p)] + + def __str__(self): + return f"0x{self.ptr:016x}" + + self._desc = PtrDescriptor(int(self._pointer)) + self._c_pointer = ctypes.cast(ctypes.pointer(self._desc), ctypes.c_void_p) + assert ( + self._desc.ptr % self._assumed_align == 0 + ), f"pointer must be {self._assumed_align} bytes aligned" + + def size_in_bytes(self) -> int: + return ctypes.sizeof(self._desc) + + def __get_mlir_types__(self): + return [self.mlir_type] + + def __c_pointers__(self): + return [self._c_pointer] + + def __new_from_mlir_values__(self, values): + assert len(values) == 1 + return values[0] + + # Move mlir Type out of __init__ to decouple with mlir Context + @property + def mlir_type(self) -> ir.Type: + return _cute_ir.PtrType.get( + self._dtype.mlir_type, self._addr_space, self._assumed_align + ) + + @property + def element_type(self) -> Type[Numeric]: + return self._dtype + + @property + def memspace(self): + return self._addr_space + + def verify(self, expected_py_type): + if expected_py_type is Pointer: + return True + elif isinstance(expected_py_type, ir.Value) and expected_py_type.ty is Pointer: + return True + + return False + + def __str__(self) -> str: + return f"Ptr<0x{self._desc.ptr:016x}@{self._addr_space}>" + + def __repr__(self): + return self.__str__() + + +class _Tensor(Tensor): + def __init__( + self, + tensor, + assumed_align=None, + ): + # If tensor is already a DLPack object, use it directly + if hasattr(tensor, "__dlpack_device__") and not hasattr(tensor, "__dlpack__"): + self._dlpack_data = tensor + else: + self._dlpack_data = tensor.__dlpack__() + self._dltensor_wrapper = None + self._assumed_align = assumed_align + self._is_dynamic = False + self._memref_desc = None + self._dtype = None + + @property + def __class__(self) -> Type[Tensor]: + # Cheat to let `type(_Tensor())` to return cute.Tensor + return Tensor + + @staticmethod + def 
lazily_load_dltensor(func): + """Decorator to lazily load the DLTensorWrapper. + + This decorator loads the DLTensorWrapper when needed, + avoiding overhead in the critical path of calling JIT functions. + """ + + def wrapper(self, *args, **kwargs): + if self._dltensor_wrapper is None: + self._dltensor_wrapper = _cute_ir.DLTensorWrapper(self._dlpack_data) + return func(self, *args, **kwargs) + + return wrapper + + @lazily_load_dltensor + def mark_layout_dynamic(self, leading_dim: int | None = None): + """Marks the tensor layout as dynamic based on the leading dimension. + + :param leading_dim: The leading dimension of the layout, defaults to None + :type leading_dim: int, optional + + When ``leading_dim`` is None, automatically deduces the leading dimension from the tensor layout. + The layout can be deduced only when exactly one dimension has a stride of 1. Raises an error + if the layout cannot be automatically deduced. + + When ``leading_dim`` is explicitly specified, marks the layout as dynamic while setting the + stride at ``leading_dim`` to 1. Also validates that the specified ``leading_dim`` is consistent + with the existing layout by checking that the corresponding stride of that dimension is 1. + + Limitation: only support flat layout for now. Will work on supporting nested layout in the future. + + :return: The tensor with dynamic layout + :rtype: _Tensor + """ + self._dltensor_wrapper.mark_layout_dynamic(leading_dim) + return self + + @lazily_load_dltensor + def mark_compact_shape_dynamic( + self, + mode: int, + stride_order: tuple[int, ...] | None = None, + divisibility: int = 1, + ): + """Marks the tensor shape as dynamic and propagates dynamic and divisibility information to the corresponding strides. + + :param mode: The mode of the compact shape, defaults to 0 + :type mode: int + :param stride_order: Consistent with `torch.Tensor.dim_order`. Defaults to None. 
+ Indicates the order of the modes (dimensions) if the current layout were converted to row-major order. + It starts from the outermost to the innermost dimension. + :type stride_order: tuple[int, ...], optional + :param divisibility: The divisibility constraint for the compact shape, defaults to 1 + :type divisibility: int, optional + :return: The tensor with dynamic compact shape + :rtype: _Tensor + + If ``stride_order`` is not provided, the stride ordering will be automatically deduced from the layout. + Automatic deduction is only possible when exactly one dimension has a stride of 1 (compact layout). + An error is raised if automatic deduction fails. + + If ``stride_order`` is explicitly specified, it does the consistency check with the layout. + + For example: + - Layout: (4,2):(1,4) has stride_order: (1,0) indicates the innermost dimension is 0(`4:1`), the outermost dimension is 1(`2:4`) + - Layout: (5,3,2,4):(3,1,15,30) has stride_order: (3,2,0,1) indicates the innermost dimension is 1(`3:1`), the outermost dimension is 3(`4:30`). + + Using `torch.Tensor.dim_order()` to get the stride order of the torch tensor. + .. code-block:: python + a = torch.empty(3, 4) + t = cute.runtime.from_dlpack(a) + t = t.mark_compact_shape_dynamic(mode=0, stride_order=a.dim_order()) + """ + self._dltensor_wrapper.mark_compact_shape_dynamic( + mode, stride_order, divisibility + ) + return self + + @property + @lazily_load_dltensor + def element_type(self) -> Type[Numeric]: + if self._dtype is None: + self._dtype = self._dltensor_wrapper.dtype + return self._dtype + + @element_type.setter + def element_type(self, new_type): + """Set the element type of the tensor. + + :warning: This API is added for narrow precision before we have a clean `recast_tensor` story. + + :note: It is only used for the case that frameworks don't natively support narrow precision but we get tensor + from frameworks with storage type like uint8. + + **Example**: + + .. 
code-block:: python + + # Create a tensor from a numpy array + import numpy as np + from cutlass.cute import from_dlpack + + # Create a tensor with Float32 elements + a = np.zeros(shape, dtype=np.uint8) + tensor = from_dlpack(a) + + # Change the element type to Float4E2M1FN even storage type is uint8 + tensor.element_type = cutlass.Float4E2M1FN + + src = from_dlpack(... data tensor ...) + # convert and initialize narrow precision tensor + cute.testing.convert(src, tensor) + """ + self._dtype = new_type + + @property + @lazily_load_dltensor + def memspace(self): + return self._dltensor_wrapper.address_space + + @property + @lazily_load_dltensor + def size_in_bytes(self) -> int: + return self._dltensor_wrapper.size_in_bytes() + + @property + @lazily_load_dltensor + def mlir_type(self) -> ir.Type: + return self._dltensor_wrapper.get_type( + self.element_type.mlir_type, self._assumed_align + ) + + @lazily_load_dltensor + def __str__(self) -> str: + return f"Tensor<0x{self._dltensor_wrapper.str}>" + + def __repr__(self): + return self.__str__() + + def __setitem__(self, crd, value): + raise TypeError(f"runtime._Tensor is not indexable") + + def __getitem__(self, crd): + raise TypeError(f"runtime._Tensor is not indexable") + + @property + @lazily_load_dltensor + def iterator(self): + return _Pointer( + self._dltensor_wrapper.data_ptr, + self.element_type, + self.memspace, + self._assumed_align, + ) + + @property + def layout(self): + raise NotImplementedError( + f"layout property is not supported in runtime, support in future" + ) + + @property + @lazily_load_dltensor + def shape(self): + return self._dltensor_wrapper.shape + + @property + @lazily_load_dltensor + def stride(self): + strides = self._dltensor_wrapper.stride + if strides is None: + strides = itertools.accumulate( + reversed(self.shape), func=operator.mul, initial=1 + ) + strides = tuple(reversed(list(strides)[:-1])) + + return strides + + @property + @lru_cache(maxsize=128, typed=True) + def 
leading_dim(self): + """Get the leading dimension of this Tensor. + + :return: The leading dimension index or indices + :rtype: int or tuple or None + + The return value depends on the tensor's stride pattern: + + * If a single leading dimension is found, returns an integer index + * If nested leading dimensions are found, returns a tuple of indices + * If no leading dimension is found, returns None + """ + return find(1, self.stride, exclude_when=(1, self.shape)) + + def fill(self, value: Numeric): + raise TypeError(f"fill function is not supported in runtime") + + @property + @lazily_load_dltensor + def data_ptr(self): + return self._dltensor_wrapper.data_ptr + + @lazily_load_dltensor + def __c_pointers__(self): + self._memref_desc = self._dltensor_wrapper.build_memref_desc( + self._assumed_align + ) + return [_cute_ir.pycapsule_get_pointer(self._memref_desc)] + + def __get_mlir_types__(self): + return [self.mlir_type] + + def __new_from_mlir_values__(self, values): + assert len(values) == 1 + assert isinstance(values[0], CoreTensor) + return CoreTensor(values[0].value, self._dtype) + + +def from_dlpack( + tensor_dlpack, + assumed_align=None, +) -> Tensor: + """Convert from tensor object supporting __dlpack__() to a CuTe Tensor. + + :param tensor_dlpack: Tensor object that supports the DLPack protocol + :type tensor_dlpack: object + :param assumed_align: Assumed alignment of the tensor (bytes), defaults to None, + if None, will use the element size bytes as the assumed alignment. + :type assumed_align: int, optional + :return: A CuTe Tensor object + :rtype: Tensor + + Examples: + .. 
code-block:: python + + import torch + from cutlass.cute.runtime import from_dlpack + x = torch.randn(100, 100) + y = from_dlpack(x) + y.shape + # (100, 100) + type(y) + # + """ + return _Tensor( + tensor_dlpack, + assumed_align=assumed_align, + ) + + +def make_ptr( + dtype: Type[Numeric], + value: Union[int, ctypes._Pointer], + mem_space: AddressSpace = AddressSpace.generic, + assumed_align=None, +) -> Pointer: + """Create a pointer from a memory address + + :param dtype: Data type of the pointer elements + :type dtype: Type[Numeric] + :param value: Memory address as integer or ctypes pointer + :type value: Union[int, ctypes._Pointer] + :param mem_space: Memory address space, defaults to AddressSpace.generic + :type mem_space: AddressSpace, optional + :param align_bytes: Alignment in bytes, defaults to None + :type align_bytes: int, optional + :return: A pointer object + :rtype: Pointer + + .. code-block:: python + + import numpy as np + import ctypes + + from cutlass import Float32 + from cutlass.cute.runtime import make_ptr + + # Create a numpy array + a = np.random.randn(16, 32).astype(np.float32) + + # Get pointer address as integer + ptr_address = a.ctypes.data_as(ctypes.POINTER(ctypes.c_float)) + + # Create pointer from address + y = make_ptr(cutlass.Float32, ptr_address) + + # Check properties + print(y.element_type) + print(type(y)) # + """ + # check if value is int or ctypes.POINTER + if isinstance(value, int): + address_value = value + elif isinstance(value, ctypes._Pointer): + # get address value + address_value = ctypes.cast(value, ctypes.c_void_p).value + assert address_value is not None, "Pointer address is None" + else: + raise TypeError( + f"Expect int or ctypes.POINTER for value but got {type(value)=}" + ) + + return _Pointer(address_value, dtype, mem_space, assumed_align=assumed_align) + + +class TensorAdapter: + """ + Convert a DLPack protocol supported tensor/array to a cute tensor. 
+ """ + + # Need reference these capsules to avoid being garbage collected + tensor_capsules = [] + + def __init__(self, arg): + self._arg = from_dlpack(arg).mark_layout_dynamic() + self.tensor_capsules.append(self._arg) + + def __new_from_mlir_values__(self, values): + return self._arg.__new_from_mlir_values__(values) + + def __c_pointers__(self): + return self._arg.__c_pointers__() + + def __get_mlir_types__(self): + return self._arg.__get_mlir_types__() + + +# ------------------------------------------------------------------------- +# Try to register_jit_arg_adapter for TensorAdapter +# ------------------------------------------------------------------------- + +try: # Register for numpy.ndarray + import numpy + + JitArgAdapterRegistry.register_jit_arg_adapter(numpy.ndarray)(TensorAdapter) +except ImportError: + pass # silent attempt, suppress error + +try: # Register for torch.Tensor + import torch + + JitArgAdapterRegistry.register_jit_arg_adapter(torch.Tensor)(TensorAdapter) +except ImportError: + pass # silent attempt, suppress error diff --git a/python/CuTeDSL/cutlass/cute/testing.py b/python/CuTeDSL/cutlass/cute/testing.py new file mode 100644 index 00000000..90fb1fb2 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/testing.py @@ -0,0 +1,285 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +import random +import numpy as np +import functools +import hashlib + +from cutlass.cutlass_dsl import ( + const, + T, + CuTeDSL, + BaseDSL, + t, + Constexpr, + detect_gpu_arch, +) + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.ir as ir +from cutlass._mlir.dialects import nvvm, cf, vector, builtin + +from cutlass.cute import core +from cutlass.cute import nvgpu +from typing import Type +from inspect import isclass + + +def assert_(cond, msg=None): + if isinstance(cond, ir.Value): + if ir.VectorType.isinstance(cond.type): + assert ( + cond.type.element_type == T.bool() + ), f"only expects vector type with boolean elements, but got {cond.type}" + cond_val = vector.multi_reduction( + vector.CombiningKind.AND, cond, const(True), range(cond.type.rank) + ) + else: + cond_val = cond + else: + cond_val = const(cond, t.Boolean) + + cf.assert_(cond_val, msg if msg else "") + + +def _maybe_recast_tensor_from_f4(src: core.Tensor, tv_layout: core.Layout): + if src.element_type.width == 4: + tv_layout = core.recast_layout(8, 4, tv_layout) + src = core.recast_tensor(src, dtype=t.Int8) + return src, tv_layout + + +def _maybe_recast_to_f4(input: core.TensorSSA, dtype: Type[core.Numeric]): + """Conditionally recasts the tensor to 4-bit type if the destination type is 4-bit. + + :param input: The input tensor to recast. + :param dtype: The target numeric type to potentially recast to. + :raises TypeError: If dtype is not a subclass of Numeric. + :return: A new tensor recast to 4-bit if dtype is 4-bit, otherwise returns self unchanged. 
+ """ + if not isclass(dtype) or not issubclass(dtype, core.Numeric): + raise TypeError(f"dst_ty must be a type of Numeric, but got {dtype}") + + if dtype.width == 4: + recast_shape = core.recast_layout(4, 8, core.make_layout(input.shape)).shape + i4_vec = vector.bitcast( + T.vector(input.type.shape[0] * 2, T.i(4)), input.maybe_downcast() + ) + res_vect = builtin.unrealized_conversion_cast( + [T.vector(i4_vec.type.shape[0], dtype.mlir_type)], [i4_vec] + ) + return core.TensorSSA(res_vect, recast_shape, dtype) + return input + + +def _maybe_recast_from_f4(input: core.TensorSSA, src_dtype: Type[core.Numeric]): + """Conditionally recasts the tensor from 4-bit type if the source type is 4-bit. + + :param input: The input tensor to recast. + :param src_dtype: The source numeric type to potentially recast from. + :raises TypeError: If src_dtype is not a subclass of Numeric. + :return: A new tensor recast from 4-bit if src_dtype is 4-bit, otherwise returns self unchanged. + """ + if not isclass(src_dtype) or not issubclass(src_dtype, core.Numeric): + raise TypeError(f"src_ty must be a type of Numeric, but got {src_dtype}") + + if src_dtype.width == 4: + recast_shape = core.recast_layout(8, 4, core.make_layout(input.shape)).shape + i4_vec = builtin.unrealized_conversion_cast( + [T.vector(input.type.shape[0], T.i(4))], [input.maybe_downcast()] + ) + res_vect = vector.bitcast(T.vector(i4_vec.type.shape[0] // 2, T.i8()), i4_vec) + return core.TensorSSA(res_vect, recast_shape, core.Int8) + return input + + +@CuTeDSL.kernel +def _convert_kernel( + gSrc: core.Tensor, + gDst: core.Tensor, + cSrc: core.Tensor, + src_tv_layout: core.Layout, + dst_tv_layout: core.Layout, + src_shape: core.Shape, + src_ty, + dst_ty, +): + tidx = nvvm.read_ptx_sreg_tid_x(T.i32()) + bidx = nvvm.read_ptx_sreg_ctaid_x(T.i32()) + + cta_coord = (None, bidx) + # logical idx -> address + ctaSrc = gSrc[cta_coord] # (...,TileV,...) + ctaDst = gDst[cta_coord] # (...,TileV,...) 
+ ctaCSrc = cSrc[cta_coord] # (...,TileV,...) + # print(f"ctaSrc = {ctaSrc.type}") + + # compose with CTA TV layout + # tid, vid -> address + tidfrgSrc = core.composition(ctaSrc, src_tv_layout) # (T,V) + tidfrgDst = core.composition(ctaDst, dst_tv_layout) # (T,V) + tidfrgCSrc = core.composition(ctaCSrc, src_tv_layout) # (T,V) + # print(f"tidfrgSrc = {tidfrgSrc.type}") + + # slice for threads + thr_coord = (tidx, None) + thrSrc = tidfrgSrc[thr_coord] # (V) + thrDst = tidfrgDst[thr_coord] # (V) + thrCSrc = tidfrgCSrc[thr_coord] # (V) + # print(f"thrSrc = {thrSrc.type}") + + # predicate + if core.elem_less(thrCSrc[0], src_shape): + # allocate fragments for gmem->rmem + frgSrc = core.make_fragment( + core.get(src_tv_layout, mode=[1]), gSrc.element_type + ) # (V) + frgDst = core.make_fragment( + core.get(dst_tv_layout, mode=[1]), gDst.element_type + ) # (V) + # print(f"frgSrc = {frgSrc.type}") + + # Move data to reg address space + copy_atom_load = core.make_copy_atom(nvgpu.CopyUniversalOp(), gSrc.element_type) + core.copy(copy_atom_load, thrSrc, frgSrc) + + vec_src = frgSrc.load() + vec_src = _maybe_recast_to_f4(vec_src, src_ty) + vec_dst = vec_src.to(dst_ty) + vec_dst = _maybe_recast_from_f4(vec_dst, dst_ty) + frgDst.store(vec_dst) + + # Copy the results back to c + copy_atom_stg = core.make_copy_atom(nvgpu.CopyUniversalOp(), gDst.element_type) + core.copy(copy_atom_stg, frgDst, thrDst) + + +@CuTeDSL.jit(preprocess=False) +def _convert( + src: core.Tensor, + dst: core.Tensor, + leading_mode: Constexpr, + elem_per_copy: Constexpr, +): + + # Step 1. figure proper tv_layout + src_ty = src.element_type + dst_ty = dst.element_type + + tv_layout = core.make_layout((128, elem_per_copy), stride=(elem_per_copy, 1)) + + # Step 2. 
maybe recast from f4 tensor + src, src_tv_layout = _maybe_recast_tensor_from_f4(src, tv_layout) + dst, dst_tv_layout = _maybe_recast_tensor_from_f4(dst, tv_layout) + src_shape = src.shape + # predicate tensor + idA = core.make_identity_tensor(src.shape) + + # Step 3. select a proper tiling pattern as (...,TileV, ...) + src_cta_tiler = [ + 1, + ] * core.rank(src.layout) + src_cta_tiler[leading_mode] = core.size(src_tv_layout) # (...,TileV,...) + dst_cta_tiler = [ + 1, + ] * core.rank(dst.layout) + dst_cta_tiler[leading_mode] = core.size(dst_tv_layout) # (...,TileV,...) + + # Step 4. partition input and output tensor by cta tiler. + gS = core.zipped_divide( + src, tuple(src_cta_tiler) + ) # ((...,TileV,...),(...,RestV,...)) + cS = core.zipped_divide( + idA, tuple(src_cta_tiler) + ) # ((...,TileV,...),(...,RestV,...)) + gD = core.zipped_divide( + dst, tuple(dst_cta_tiler) + ) # ((...,TileV,...),(...,RestV,...)) + # print(f"{gS.type=}") + + _convert_kernel( + gS, + gD, + cS, + src_tv_layout, + dst_tv_layout, + src_shape, + src_ty, + dst_ty, + ).launch( + grid=[core.size(gS, mode=[1]), 1, 1], + block=[core.size(src_tv_layout, mode=[0]), 1, 1], + ) + + +# Converts from src tensor to dst tensor, their logical shape are required to be the same. +# And when src or dst dtype is narrow precision(Float4E2M1FN/Float8E8M0FNU/Float8E4M3FN), the shape of +# their leading dimension should be 4(fp8)/8(fp4) element align. (nvgpu.cvt_fptrunc/cvt_fpext +# needs 32-bits aligned input/output) +def convert(src: core.Tensor, dst: core.Tensor): + assert len(src.shape) == len( + dst.shape + ), "Shape of src and dst tensors should be the same rank." 
+ # find leading mode + leading_mode = np.argmin([np.min(s) for s in src.stride]) + + elem_per_copy = 2 + + if src.element_type.width == 4 or dst.element_type.width == 4: + elem_per_copy = 8 + elif src.element_type.width == 8 or dst.element_type.width == 8: + elem_per_copy = 4 + assert ( + src.shape[leading_mode] % elem_per_copy == 0 + and dst.shape[leading_mode] % elem_per_copy == 0 + ) + _convert(src, dst, leading_mode, elem_per_copy) + + +######################################### +# Testing utilities +######################################### + + +def sample_pytest(rand_cfg=None): + """ + Decorator to randomly sample pytest parametrized tests. + rand_cfg: Tuple[int, float] - (random_seed, sample_ratio) + Sampling is disabled when: + - A specific test is selected (via -k or direct test path) + - Not running under pytest + """ + import functools + import os + import random + import pytest + import sys + + seed, sample_ratio = rand_cfg + random.seed(seed) + + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if rand_cfg is not None and "PYTEST_CURRENT_TEST" in os.environ: + # Check if test was explicitly selected like ::test_name[param1-param2-...] + if "-k" in sys.argv or any(".py::" in arg for arg in sys.argv): + # Test was explicitly selected, don't skip + return func(*args, **kwargs) + + if random.uniform(0.0, 1.0) > sample_ratio: + pytest.skip(f"Randomly skipped (sampling ratio: {sample_ratio})") + return func(*args, **kwargs) + + return wrapper + + return decorator diff --git a/python/CuTeDSL/cutlass/cute/typing.py b/python/CuTeDSL/cutlass/cute/typing.py new file mode 100644 index 00000000..48ac76c4 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/typing.py @@ -0,0 +1,193 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from abc import ABC, abstractmethod +from typing import ForwardRef, Tuple, Union, Any, Type, List + +from cutlass.base_dsl.typing import * + +from cutlass._mlir import ir +import cutlass._mlir.extras.types as T +from cutlass._mlir.dialects.cute import AddressSpace + + +Int = Union[int, Integer] + + +ScaledBasis = ForwardRef("ScaledBasis") + + +IntTuple = Union[Int, Tuple["IntTuple", ...]] +Shape = Union[Int, Tuple["Shape", ...]] +Stride = Union[Int, ScaledBasis, Tuple["Stride", ...]] +Coord = Union[Int, None, Tuple["Coord", ...]] + + +class Layout(ir.Value): + def __init__(self, op_result): + super().__init__(op_result) + + def __str__(self): ... + + def get_hier_coord(self, idx) -> Coord: + """Return the (hierarchical) ND logical coordinate corresponding to the linear index""" + ... + + @property + def shape(self, *, loc=None, ip=None) -> Shape: ... + + @property + def stride(self, *, loc=None, ip=None) -> Stride: ... + + +Tile = Union[Int, None, Layout, Tuple["Tile", ...]] + +# XTuple is super set of above types +XTuple = Union[IntTuple, Shape, Stride, Coord, Tile] + +Tiler = Union[Shape, Layout, Tile] + + +class Pointer: + """ + Abstract base class for CuTe jit function and runtime _Pointer + """ + + def __extract_mlir_values__(self): + # Doesn't matter just return a value + return [self] + + +class Tensor(ABC): + """ + Abstract base class for CuTe jit function and runtime _Tensor + + A CuTe Tensor is iterator with layout + + :Examples: + + Create tensor from torch.tensor with Host Runtime: + + .. 
code-block:: python + + >>> import torch + >>> from cutlass.cute.runtime import from_dlpack + >>> mA = from_dlpack(torch.tensor([1, 3, 5], dtype=torch.int32)) + >>> mA.shape + (3,) + >>> mA.stride + (1,) + >>> mA.layout + (3,):(1,) + + Define JIT function: + + .. code-block:: python + + @cute.jit + def add(a: Tensor, b: Tensor, res: Tensor): ... + + Call JIT function from python: + + .. code-block:: python + + >>> import torch + >>> a = torch.tensor([1, 3, 5], dtype=torch.int32) + >>> b = torch.tensor([2, 4, 6], dtype=torch.int32) + >>> c = torch.zeros([3], dtype=torch.int32) + >>> mA = from_dlpack(a) + >>> mB = from_dlpack(b) + >>> mC = from_dlpack(c) + >>> add(mA, mB, mC) + >>> c + tensor([3, 7, 11], dtype=torch.int32) + """ + + def __str__(self): ... + + @abstractmethod + def __getitem__(self, idx) -> Union["Tensor", ir.Value, IntTuple]: ... + + @abstractmethod + def __setitem__(self, idx, value): ... + + @property + @abstractmethod + def element_type(self) -> Union[Type[Numeric], Type[IntTuple]]: ... + + @element_type.setter + def element_type(self, new_type): ... + + @property + @abstractmethod + def memspace(self) -> AddressSpace: ... + + @property + @abstractmethod + def iterator(self): ... + + @property + def layout(self) -> Union[Layout, "ComposedLayout"]: ... + + @property + def shape(self) -> Shape: ... + + def load(self, *, loc=None, ip=None) -> "TensorSSA": ... + + def store(self, data: "TensorSSA", *, loc=None, ip=None): ... + + def mark_layout_dynamic(self, leading_dim: int|None = None) -> "Tensor": ... + + def mark_compact_shape_dynamic( + self, mode: int, stride_order: tuple[int, ...]|None = None, divisibility: int = 1 + ) -> "Tensor": ... + + @abstractmethod + def fill(self, value: Numeric) -> None: ... 
+ + +__all__ = [ + "Coord", + "Numeric", + "Integer", + "Boolean", + "Int8", + "Int16", + "Int32", + "Int64", + "Uint8", + "Uint16", + "Uint32", + "Uint64", + "Float", + "Float16", + "BFloat16", + "TFloat32", + "Float32", + "Float64", + "Float8E5M2", + "Float8E4M3FN", + "Float8E4M3B11FNUZ", + "Float8E4M3", + "Float8E8M0FNU", + "Float4E2M1FN", + "Float6E2M3FN", + "Float6E3M2FN", + "IntTuple", + "Layout", + "Pointer", + "Shape", + "Stride", + "Tensor", + "Tile", + "Tiler", + "XTuple", +] diff --git a/python/CuTeDSL/cutlass/impl_utils.py b/python/CuTeDSL/cutlass/impl_utils.py new file mode 100644 index 00000000..0bb9b520 --- /dev/null +++ b/python/CuTeDSL/cutlass/impl_utils.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ + +def check_value_in( + value, possible_values: list, value_description: str, prefix="" +) -> None: + if value not in possible_values: + err_msg = prefix + if err_msg != "": + err_msg += ": " + err_msg += f"invalid {value_description}, got {value}, must be one of {possible_values}" + raise ValueError(err_msg) + + +def check_type_in(ty, possible_types: list, type_description: str, prefix="") -> None: + if not isinstance(ty, type): + ty = type(ty) + if ty not in possible_types: + err_msg = prefix + if err_msg != "": + err_msg += ": " + err_msg += f"invalid type for {type_description}, got {ty}, must be one of {possible_types}" + raise TypeError(err_msg) diff --git a/python/CuTeDSL/cutlass/torch.py b/python/CuTeDSL/cutlass/torch.py new file mode 100644 index 00000000..0126fb04 --- /dev/null +++ b/python/CuTeDSL/cutlass/torch.py @@ -0,0 +1,169 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +from dataclasses import dataclass +from enum import Enum +from typing import Optional, Type, Union + +from cutlass.cute.typing import ( + Numeric, + Boolean, + Float, + Integer, + TFloat32, + Float8E4M3B11FNUZ, + Float8E4M3FN, + Float8E5M2, + Float8E8M0FNU, + Float4E2M1FN, + Tensor, +) +from cutlass.cute.runtime import from_dlpack +import cutlass.cute as cute +import torch + + +def dtype(ty: Type[Numeric]): + """ + Return the corresponding torch.dtype per the given DSL type + """ + torch_dtype = getattr(torch, ty.__name__.lower(), None) + + torch_type_map = { + Boolean: torch.bool, + # TFloat32 is just alias of float32 + TFloat32: torch.float32, + Float8E5M2: torch.float8_e5m2, + Float8E4M3FN: torch.float8_e4m3fn, + Float8E4M3B11FNUZ: torch.float8_e4m3fnuz, + } + if torch_dtype is None: + torch_dtype = torch_type_map.get(ty) + + if torch_dtype is None: + raise TypeError(f"{ty} is not supported by torch") + return torch_dtype + + +@dataclass +class ScalarInitConfig: + """Configuration for scalar initialization""" + + value: float = 0.0 + + +@dataclass +class RandomInitConfig: + """Configuration for random initialization""" + + min_val: int = -2 + max_val: int = 2 + + +@dataclass +class GaussianInitConfig: + """Configuration for Gaussian initialization""" + + mean: float = 0.0 + std: float = 1.0 + scale: float = 1.0 + + +class TensorInitType(Enum): + """Enumeration of tensor initialization types""" + + SKIP = "skip" + SCALAR = "scalar" + RANDOM = "random" + GAUSSIAN = "gaussian" + + +def create_and_permute_torch_tensor( + shape, + dtype: "torch.dtype", + permute_order=None, + init_type: TensorInitType = TensorInitType.RANDOM, + init_config: Optional[ + Union[RandomInitConfig, ScalarInitConfig, GaussianInitConfig] + ] = None, +) -> "torch.Tensor": + """ + Create a torch tensor with specified shape and dtype. 
Optionally permute it and initialize it with specified init type and config + """ + init_dtype = torch.int32 if init_type == TensorInitType.RANDOM else torch.float32 + init_torch_tensor = torch.empty(*shape, dtype=init_dtype) + if init_type == TensorInitType.SKIP: + assert init_config is None + f32_torch_tensor = init_torch_tensor + elif init_type == TensorInitType.SCALAR: + if init_config is None: + init_config = ScalarInitConfig() + else: + if not isinstance(init_config, ScalarInitConfig): + raise ValueError("init_config must be ScalarInitConfig()") + f32_torch_tensor = init_torch_tensor.fill_(init_config.value) + elif init_type == TensorInitType.RANDOM: + if init_config is None: + init_config = RandomInitConfig() + else: + if not isinstance(init_config, RandomInitConfig): + raise ValueError("init_config must be RandomInitConfig()") + f32_torch_tensor = init_torch_tensor.random_( + init_config.min_val, init_config.max_val + ).to(dtype=torch.float32) + elif init_type == TensorInitType.GAUSSIAN: + if init_config is None: + init_config = GaussianInitConfig() + else: + if not isinstance(init_config, GaussianInitConfig): + raise ValueError("init_config must be GaussianInitConfig()") + f32_torch_tensor = init_torch_tensor.normal_(init_config.mean, init_config.std) + f32_torch_tensor = f32_torch_tensor * (1 << init_config.scale) + else: + raise ValueError(f"Invalid init type: {init_type}") + + if permute_order is not None: + f32_torch_tensor = f32_torch_tensor.permute(permute_order) + + dtype_torch_tensor = f32_torch_tensor.to(dtype=dtype) + + return dtype_torch_tensor + + +def convert_cute_tensor( + f32_torch_tensor: "torch.Tensor", + cute_tensor: Tensor, + dtype: Type[Numeric], + is_dynamic_layout: bool = True, +) -> Tensor: + """ + Change the value of the cute tensor to make its value converted from a fp32 torch tensor. + Used for fp8 types tensor creatation now. 
+ """ + # if torch_tensor is on cpu, create a gpu copy + if f32_torch_tensor.device.type == "cpu": + f32_torch_tensor = f32_torch_tensor.cuda() + + # Fp8 type need explicit type conversion + if dtype in { + Float8E5M2, + Float8E4M3FN, + Float8E8M0FNU, + Float4E2M1FN, + }: + fp32_cute_tensor = from_dlpack(f32_torch_tensor) + if is_dynamic_layout: + fp32_cute_tensor = fp32_cute_tensor.mark_layout_dynamic( + f32_torch_tensor.dim_order()[-1] + ) + # Copy and convert from f32 cute tensor to dtype cute tensor + cute.testing.convert(fp32_cute_tensor, cute_tensor) + return cute_tensor diff --git a/python/CuTeDSL/cutlass/utils/README.md b/python/CuTeDSL/cutlass/utils/README.md new file mode 100644 index 00000000..3a583ed4 --- /dev/null +++ b/python/CuTeDSL/cutlass/utils/README.md @@ -0,0 +1,9 @@ +# Utilities + +This folder contains various utilties for kernel authoring. Specifically, the implementation of the +followings can be considered experimental and subject to breaking changes: + +- static persistent tile scheduler defined in [`static_persistent_tile_scheduler.py`](./static_persistent_tile_scheduler.py) +- pipeline abstractions defined in [`pipeline.py`](./pipeline.py) +- grouped GEMM utilties defined [`grouped_gemm_tile_scheduler_helper.py`](./grouped_gemm_tile_scheduler_helper.py) + and [`tensormap_manager.py`](./tensormap_manager.py) diff --git a/python/CuTeDSL/cutlass/utils/__init__.py b/python/CuTeDSL/cutlass/utils/__init__.py new file mode 100644 index 00000000..dc3fdbcd --- /dev/null +++ b/python/CuTeDSL/cutlass/utils/__init__.py @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.

from .static_persistent_tile_scheduler import (
    WorkTileInfo,
    PersistentTileSchedulerParams,
    StaticPersistentTileScheduler,
)

from .pipeline import (
    Agent,
    CooperativeGroup,
    PipelineUserType,
    PipelineState,
    make_pipeline_state,
    PipelineAsync,
    PipelineTmaAsync,
    PipelineTmaUmma,
    PipelineUmmaAsync,
    PipelineTmaStore,
    pipeline_init_wait,
)

from .hardware_info import (
    HardwareInfo,
)

from .blackwell_helpers import (
    compute_epilogue_tile_shape,
    get_smem_store_op,
    get_tmem_load_op,
    get_num_tmem_alloc_cols,
    make_smem_layout_a,
    make_smem_layout_b,
    make_smem_layout_epi,
    make_trivial_tiled_mma,
)

from .hopper_helpers import (
    sm90_get_smem_store_op,
)

from .grouped_gemm_tile_scheduler_helper import (
    GroupSearchResult,
    GroupedGemmGroupSearchState,
    GroupedGemmTileSchedulerHelper,
    create_initial_search_state,
)

from .tensormap_manager import (
    TensorMapUpdateMode,
    TensorMapManager,
)

from .smem_allocator import SmemAllocator

from .layout import LayoutEnum

# CONSISTENCY FIX: __all__ previously listed only a subset of the names
# imported above, so ``from cutlass.utils import *`` silently dropped the
# pipeline abstractions, hardware info, Blackwell/Hopper helpers,
# SmemAllocator and LayoutEnum. List every re-exported public name.
__all__ = [
    # static persistent tile scheduler
    "WorkTileInfo",
    "PersistentTileSchedulerParams",
    "StaticPersistentTileScheduler",
    # pipeline abstractions
    "Agent",
    "CooperativeGroup",
    "PipelineUserType",
    "PipelineState",
    "make_pipeline_state",
    "PipelineAsync",
    "PipelineTmaAsync",
    "PipelineTmaUmma",
    "PipelineUmmaAsync",
    "PipelineTmaStore",
    "pipeline_init_wait",
    # hardware info
    "HardwareInfo",
    # Blackwell helpers
    "compute_epilogue_tile_shape",
    "get_smem_store_op",
    "get_tmem_load_op",
    "get_num_tmem_alloc_cols",
    "make_smem_layout_a",
    "make_smem_layout_b",
    "make_smem_layout_epi",
    "make_trivial_tiled_mma",
    # Hopper helpers
    "sm90_get_smem_store_op",
    # grouped GEMM tile scheduler
    "GroupSearchResult",
    "GroupedGemmGroupSearchState",
    "create_initial_search_state",
    "GroupedGemmTileSchedulerHelper",
    # tensormap management
    "TensorMapUpdateMode",
    "TensorMapManager",
    # misc
    "SmemAllocator",
    "LayoutEnum",
]
from enum import Enum


class SmemCapacity(Enum):
    """Usable shared-memory capacity per SM, in bytes, by compute capability.

    Each value holds back 1 KiB from the architectural maximum — presumably
    reserved by the driver/runtime; confirm per architecture. SM86 and SM89
    share a value, so ``SM89_SMEM_CAPACITY_BYTES`` is an Enum *alias* of
    ``SM86_SMEM_CAPACITY_BYTES`` (attribute access still works for both).
    """

    SM80_SMEM_CAPACITY_BYTES = (164 - 1) * 1024
    SM86_SMEM_CAPACITY_BYTES = (100 - 1) * 1024
    SM89_SMEM_CAPACITY_BYTES = (100 - 1) * 1024


# Map a compute-capability tag ("sm80", ...) to its SMEM capacity in bytes.
# Built from the member names via __members__ (rather than iterating the
# enum) so that value-aliases such as SM89 are still included as keys.
SMEM_CAPACITY = {
    name.split("_")[0].lower(): member.value
    for name, member in SmemCapacity.__members__.items()
}
from enum import Enum
from math import log2, ceil
from typing import List, Type, Union, Tuple

from cutlass.cutlass_dsl import (
    Float16,
    BFloat16,
    TFloat32,
    Float32,
    Uint8,
    Int8,
    Float8E4M3FN,
    Float8E5M2,
    Numeric,
    NumericMeta,
    dsl_user_op,
)
import cutlass.cute as cute
from cutlass.cute.nvgpu.common import CopyUniversalOp
from cutlass.cute.nvgpu.warp import StMatrix8x8x16bOp, StMatrix16x8x8bOp
from cutlass.cute.nvgpu.tcgen05 import (
    MmaF16BF16Op,
    MmaTF32Op,
    MmaI8Op,
    MmaFP8Op,
    OperandSource,
    OperandMajorMode,
    CtaGroup,
    Ld16x64bOp,
    Ld16x128bOp,
    Ld16x256bOp,
    Ld16x32bx2Op,
    Ld32x32bOp,
    Repetition,
    Pack,
    find_tmem_tensor_col_offset,
    SmemLayoutAtomKind,
    make_smem_layout_atom,
    tile_to_mma_shape,
    is_tmem_load,
    get_tmem_copy_properties,
)
from cutlass.utils.layout import LayoutEnum


@dsl_user_op
def compute_epilogue_tile_shape(
    cta_tile_shape: cute.Shape,
    use_2cta_instrs: bool,
    layout_d: LayoutEnum,
    elem_ty_d: Type[Numeric],
    *,
    layout_c: LayoutEnum = None,
    elem_ty_c: Union[Type[Numeric], None] = None,
    loc=None,
    ip=None,
) -> cute.Tile:
    """Attempts to compute a reasonable epilogue tile based on block tile shape or allows the user to provide one.

    :param cta_tile_shape: A tuple or list representing the dimensions of the CTA tile, where
                           cta_tile_shape[0] corresponds to the height (M) and cta_tile_shape[1]
                           corresponds to the width (N) of the tile.
    :type cta_tile_shape: cute.Shape
    :param use_2cta_instrs: A flag indicating whether the configuration is for a 2SM setup.
    :type use_2cta_instrs: bool
    :param layout_d: The layout enum of the output tensor D.
    :type layout_d: LayoutEnum
    :param elem_ty_d: The element type of output tensor D.
    :type elem_ty_d: Type[Numeric]
    :param layout_c: The layout enum of the input tensor C. Defaults to None.
    :type layout_c: LayoutEnum, optional
    :param elem_ty_c: The element type for input tensor C. Defaults to None.
    :type elem_ty_c: Union[Type[Numeric], None], optional

    :return: Returns epilog tiler, which is used in subsequent epilog partitions.
    :rtype: cute.Tile

    :raises ValueError: If the computed tile cute.size does not meet minimum requirements based on CTA dimensions.
    """

    def validate_type(ty, ty_name):
        if not isinstance(ty, NumericMeta):
            raise TypeError(f"{ty_name} must be Numeric, but got {ty}")

    validate_type(elem_ty_d, "elem_ty_d")
    if elem_ty_c is not None:
        validate_type(elem_ty_c, "elem_ty_c")

    cta_m, cta_n = cta_tile_shape[:2]
    # Warp grid shape depends on CTA-M and 2-CTA mode: (2, 2) for the
    # 64-row/2SM case, (4, 1) otherwise.
    (warp_m, warp_n) = (2, 2) if (cta_m == 64 and use_2cta_instrs) else (4, 1)
    # BUGFIX: identity check instead of ``elem_ty_c == None`` — equality goes
    # through the Numeric metaclass ``__eq__`` and is not a plain None test.
    disable_source = elem_ty_c is None
    max_bits = (
        elem_ty_d.width if disable_source else max(elem_ty_c.width, elem_ty_d.width)
    )

    dp_full = 32
    tile_m = min(cta_m, dp_full * warp_m)
    # Heuristic "performant" N extent, depending on whether a source (C) is read.
    if disable_source:
        compute_elts = 8192 if max_bits == 4 else 4096
        n_perf = compute_elts // tile_m
    else:
        if max_bits == 32:
            n_perf = 16 if (cta_m > 64 and cta_n <= 128) else 32
        elif max_bits == 16:
            n_perf = 32 if cta_n <= 128 else 64
        else:
            n_perf = 64

    d_is_m_major = layout_d.is_m_major_c()
    c_is_m_major = True if layout_c is None else layout_c.is_m_major_c()

    # Minimum N extents imposed by D and C element widths / majorness.
    n_min_d = (
        8 * warp_n
        if d_is_m_major
        else (128 * warp_n if elem_ty_d.width == 6 else 128 // elem_ty_d.width * warp_n)
    )
    n_min_c = (
        8 * warp_n
        if (c_is_m_major or disable_source)
        else (128 * warp_n if elem_ty_c.width == 6 else 128 // elem_ty_c.width * warp_n)
    )
    tile_n = min(cta_n, max(n_perf, n_min_c, n_min_d))

    if cta_n < n_min_c or cta_n < n_min_d:
        raise ValueError(f"CTA tile too small: {cta_tile_shape=}")

    # stride by tmem warp layout and return a by-mode tiler
    tile_m_layout = cute.make_layout(tile_m, loc=loc, ip=ip)
    tile_n_layout = cute.make_layout(
        (tile_n // warp_n, warp_n), stride=(1, cta_n // warp_n), loc=loc, ip=ip
    )
    return (tile_m_layout, cute.coalesce(tile_n_layout, loc=loc, ip=ip))
+ + +@dsl_user_op +def get_smem_store_op( + layout_d: LayoutEnum, + elem_ty_d: Type[Numeric], + elem_ty_acc: Type[Numeric], + tiled_tmem_load: cute.TiledCopy, + *, + loc=None, + ip=None, +) -> cute.CopyAtom: + """Selects the largest vectorized smem store atom available subject to + constraint of gmem layout and chosen TMEM_LOAD's thread-value ownership. + + :param layout_d: The layout enum of the output tensor D. + :type layout_d: LayoutEnum + :param elem_ty_d: The element type for output tensor D. + :type elem_ty_d: Type[Numeric] + :param elem_ty_acc: The element type for accumulator. + :type elem_ty_acc: Type[Numeric] + :param tiled_tmem_load: An instance of TiledCopy that represents the tmem load operation. + :type tiled_tmem_load: cute.TiledCopy + + :return: Either SmemStoreMatrix or SimtSyncCopy, based on the input parameters. + :rtype: cute.CopyAtom + """ + + def validate_type(ty, ty_name): + if not isinstance(ty, NumericMeta): + raise TypeError(f"{ty_name} must be a Numeric, but got {ty}") + + validate_type(elem_ty_d, "elem_ty_d") + validate_type(elem_ty_acc, "elem_ty_acc") + + is_m_major = layout_d.is_m_major_c() + is_n_major = layout_d.is_n_major_c() + + if not is_tmem_load(tiled_tmem_load): + return cute.make_copy_atom(CopyUniversalOp(), elem_ty_d, loc=loc, ip=ip) + + num_dp, num_bits, num_rep, pack = get_tmem_copy_properties(tiled_tmem_load) + + use_stmatrix_m8n8_4x = ( + all( + [ + elem_ty_acc.width == 32, + elem_ty_d.width == 32, + is_n_major, + num_dp == 16, + num_bits == 128, + num_rep in (2, 4, 8, 16, 32, 64), + pack == Pack.NONE, + ] + ) + or all( + [ + elem_ty_acc.width == 32, + elem_ty_d.width == 16, + num_dp == 16, + num_bits == 256, + num_rep in (2, 4, 8, 16, 32), + pack == Pack.NONE, + ] + ) + or all( + [ + elem_ty_acc.width == 16, + elem_ty_d.width == 16, + num_dp == 16, + num_bits == 128, + num_rep in (2, 4, 8, 16, 32, 64), + pack == Pack.PACK_16b_IN_32b, + ] + ) + ) + use_stmatrix_m16n8_4x = all( + [ + elem_ty_acc.width == 32, + 
elem_ty_d.width == 8, + is_m_major, + num_dp == 16, + num_bits == 256, + num_rep in (4, 8, 16, 32), + pack == Pack.NONE, + ] + ) + use_stmatrix_m8n8_2x = ( + all( + [ + elem_ty_acc.width == 32, + elem_ty_d.width == 32, + is_n_major, + num_dp == 16, + num_bits == 128, + num_rep == 1, + pack == Pack.NONE, + ] + ) + or all( + [ + elem_ty_acc.width == 32, + elem_ty_d.width == 16, + num_dp == 16, + num_bits == 256, + num_rep == 1, + pack == Pack.NONE, + ] + ) + or all( + [ + elem_ty_acc.width == 16, + elem_ty_d.width == 16, + num_dp == 16, + num_bits == 128, + num_rep == 1, + pack == Pack.PACK_16b_IN_32b, + ] + ) + ) + use_stmatrix_m16n8_2x = all( + [ + elem_ty_acc.width == 32, + elem_ty_d.width == 8, + is_m_major, + num_dp == 16, + num_bits == 256, + num_rep == 2, + pack == Pack.NONE, + ] + ) + use_stmatrix_m16n8_1x = all( + [ + elem_ty_acc.width == 32, + elem_ty_d.width == 8, + is_m_major, + num_dp == 16, + num_bits == 256, + num_rep == 1, + pack == Pack.NONE, + ] + ) + + if use_stmatrix_m8n8_4x: + op = StMatrix8x8x16bOp(is_m_major, 4) + return cute.make_copy_atom(op, elem_ty_d, loc=loc, ip=ip) + elif use_stmatrix_m8n8_2x: + op = StMatrix8x8x16bOp(is_m_major, 2) + return cute.make_copy_atom(op, elem_ty_d, loc=loc, ip=ip) + elif use_stmatrix_m16n8_4x: + op = StMatrix16x8x8bOp(4) + return cute.make_copy_atom(op, elem_ty_d, loc=loc, ip=ip) + elif use_stmatrix_m16n8_2x: + op = StMatrix16x8x8bOp(2) + return cute.make_copy_atom(op, elem_ty_d, loc=loc, ip=ip) + elif use_stmatrix_m16n8_1x: + op = StMatrix16x8x8bOp(1) + return cute.make_copy_atom(op, elem_ty_d, loc=loc, ip=ip) + else: + op = CopyUniversalOp() + return cute.make_copy_atom(op, elem_ty_d, loc=loc, ip=ip) + + +@dsl_user_op +def get_tmem_load_op( + cta_tile_shape: cute.Shape, + layout_d: LayoutEnum, + elem_ty_d: Type[Numeric], + elem_ty_acc: Type[Numeric], + epi_tile: cute.Tile, + use_2cta_instrs: bool, + *, + loc=None, + ip=None, +) -> cute.CopyAtom: + """Finds a performant TMEM_LOAD copy op for the selected 
epilogue + tile (epi_tile), element types, and tcgen05.mma instruction used. + + :param cta_tile_shape: A tuple or list representing the dimensions of the CTA tile. + :type cta_tile_shape: cute.Shape + :param layout_d: The layout enum of the output tensor D. + :type layout_d: LayoutEnum + :param elem_ty_d: The element type for output tensor D. + :type elem_ty_d: Type[Numeric] + :param elem_ty_acc: The element type for accumulation. + :type elem_ty_acc: Type[Numeric] + :param epi_tile: The epilogue tile configuration. + :type epi_tile: cute.Tile + :param use_2cta_instrs: A flag indicating whether the configuration is for 2 SMs. + :type use_2cta_instrs: bool + + :return: An instance of Sm100TmemLoad with the computed configuration. + :rtype: cute.CopyAtom + + :raises ValueError: If the function cannot handle the given combination of accumulation + and dimension types, or if it cannot determine the appropriate configuration based on + the input parameters. + """ + is_m_major = layout_d.is_m_major_c() + + acc_bits = elem_ty_acc.width + d_bits = elem_ty_d.width + + tmem_warp_shape_mn = ( + (2, 2) if (cta_tile_shape[0] == 64 and use_2cta_instrs) else (4, 1) + ) + epilog_tile_shape_mn = cute.product_each( + cute.shape(epi_tile, loc=loc, ip=ip), loc=loc, ip=ip + ) + epilog_warp_tile_shape_mn = cute.shape_div( + epilog_tile_shape_mn, tmem_warp_shape_mn, loc=loc, ip=ip + ) + + num_dp = cute.size(epilog_warp_tile_shape_mn[0], loc=loc, ip=ip) + if num_dp not in {16, 32}: + raise ValueError("Cta tile and 2sm config does not generate correct num dp.") + + num_col_bits = cute.size(epilog_warp_tile_shape_mn[1], loc=loc, ip=ip) * acc_bits + + tmem_dp = 0 + tmem_bit = 0 + tmem_rep = 0 + tmem_pack16b = False + if acc_bits == 32 and d_bits == 32: + if num_dp == 16: + if is_m_major: + tmem_dp = 16 + tmem_bit = 256 + else: + tmem_dp = 16 + tmem_bit = 128 + else: + tmem_dp = 32 + tmem_bit = 32 + elif acc_bits == 32 and d_bits == 16: + if num_dp == 16: + if is_m_major: + tmem_dp = 16 + 
tmem_bit = 256 + else: + tmem_dp = 16 + tmem_bit = 256 + else: + if is_m_major: + tmem_dp = 16 + tmem_bit = 256 + else: + tmem_dp = 32 + tmem_bit = 32 + elif acc_bits == 32 and d_bits == 8: + if num_dp == 16: + if is_m_major: + tmem_dp = 16 + tmem_bit = 256 + else: + tmem_dp = 16 + tmem_bit = 32 + else: + if is_m_major: + tmem_dp = 16 + tmem_bit = 256 + else: + tmem_dp = 32 + tmem_bit = 32 + elif acc_bits == 16 and d_bits == 16: + tmem_pack16b = True + if num_dp == 16: + if is_m_major: + tmem_dp = 16 + tmem_bit = 128 + else: + tmem_dp = 16 + tmem_bit = 128 + else: + if is_m_major: + tmem_dp = 16 + tmem_bit = 128 + else: + tmem_dp = 32 + tmem_bit = 32 + elif acc_bits == 32 and d_bits == 6: + if not num_dp == 32: + raise ValueError("Num dp must be 32.") + tmem_dp = 32 + tmem_bit = 32 + elif acc_bits == 32 and d_bits == 4: + if not num_dp == 32: + raise ValueError("Num dp must be 32.") + tmem_dp = 32 + tmem_bit = 32 + else: + raise ValueError( + f"Can not handle acc/d type combination: {elem_ty_acc=}, {elem_ty_d=}" + ) + + num_bit_div = tmem_bit + if tmem_dp == 16 and tmem_bit == 32: + num_bit_div = 64 + + if (num_col_bits % (num_bit_div * 128) == 0) and ( + (tmem_dp == 16 and tmem_bit == 64) + or (tmem_dp == 16 and tmem_bit == 32) + or (tmem_dp == 32 and tmem_bit == 32) + ): + tmem_rep = 128 + elif (num_col_bits % (num_bit_div * 64) == 0) and ( + (tmem_dp == 16 and tmem_bit == 128) + or (tmem_dp == 16 and tmem_bit == 64) + or (tmem_dp == 16 and tmem_bit == 32) + or (tmem_dp == 32 and tmem_bit == 32) + ): + tmem_rep = 64 + elif num_col_bits % (num_bit_div * 32) == 0: + tmem_rep = 32 + elif num_col_bits % (num_bit_div * 16) == 0: + tmem_rep = 16 + elif num_col_bits % (num_bit_div * 8) == 0: + tmem_rep = 8 + elif num_col_bits % (num_bit_div * 4) == 0: + tmem_rep = 4 + elif num_col_bits % (num_bit_div * 2) == 0: + tmem_rep = 2 + elif num_col_bits % (num_bit_div * 1) == 0: + tmem_rep = 1 + else: + raise ValueError("Can not pick tmem_rep based on cta tile shape and tmem 
atom.") + + if tmem_dp == 16 and tmem_bit == 64: + op = Ld16x64bOp( + Repetition(tmem_rep), Pack.PACK_16b_IN_32b if tmem_pack16b else Pack.NONE + ) + return cute.make_copy_atom(op, elem_ty_acc, loc=loc, ip=ip) + elif tmem_dp == 16 and tmem_bit == 128: + op = Ld16x128bOp( + Repetition(tmem_rep), Pack.PACK_16b_IN_32b if tmem_pack16b else Pack.NONE + ) + return cute.make_copy_atom(op, elem_ty_acc, loc=loc, ip=ip) + elif tmem_dp == 16 and tmem_bit == 256: + op = Ld16x256bOp( + Repetition(tmem_rep), Pack.PACK_16b_IN_32b if tmem_pack16b else Pack.NONE + ) + return cute.make_copy_atom(op, elem_ty_acc, loc=loc, ip=ip) + elif tmem_dp == 16 and tmem_bit == 32: + op = Ld16x32bx2Op( + Repetition(tmem_rep), Pack.PACK_16b_IN_32b if tmem_pack16b else Pack.NONE + ) + return cute.make_copy_atom(op, elem_ty_acc, loc=loc, ip=ip) + + elif tmem_dp == 32 and tmem_bit == 32: + op = Ld32x32bOp( + Repetition(tmem_rep), Pack.PACK_16b_IN_32b if tmem_pack16b else Pack.NONE + ) + return cute.make_copy_atom(op, elem_ty_acc, loc=loc, ip=ip) + else: + raise ValueError() + + +def get_num_tmem_alloc_cols( + tmem_tensors: Union[cute.Tensor, List[cute.Tensor]], rounding=True +) -> int: + """Get the total number of TMEM allocation columns for the given TMEM tensors. + + :param tmem_tensors: The TMEM tensors to get the number of allocation columns for. + :type tmem_tensors: Union[cute.Tensor, List[cute.Tensor]] + :param rounding: Whether to round up the number of allocation columns to the nearest power of 2. + :type rounding: bool + + :return: The total number of TMEM allocation columns. + :rtype: int + + :raises ValueError: If the number of TMEM allocation columns exceeds the maximum capacity of 512 or is less than 32. 
+ """ + # Turn tmem_tensors into a list + if isinstance(tmem_tensors, cute.Tensor): + tmem_tensors = [tmem_tensors] + + # For each tensor in tmem_tensors, find the tmem_tensor_col_offset + num_tmem_alloc_cols_per_tensor = [ + find_tmem_tensor_col_offset(t) for t in tmem_tensors + ] + + # Sum up the num_tmem_alloc_cols_per_tensor + num_tmem_alloc_cols = sum(num_tmem_alloc_cols_per_tensor) + + # Round up num_tmem_cols_total to the nearest power of 2 + if rounding: + num_tmem_alloc_cols = 1 << ceil(log2(num_tmem_alloc_cols)) + + # Validate the number of TMEM allocation columns + SM100_TMEM_CAPACITY_COLUMNS = 512 + SM100_TMEM_MIN_ALLOC_COLUMNS = 32 + if ( + num_tmem_alloc_cols > SM100_TMEM_CAPACITY_COLUMNS + or num_tmem_alloc_cols < SM100_TMEM_MIN_ALLOC_COLUMNS + ): + raise ValueError( + f"TMEM allocation columns {num_tmem_alloc_cols} exceeds the maximum capacity of {SM100_TMEM_CAPACITY_COLUMNS} or less than {SM100_TMEM_MIN_ALLOC_COLUMNS}" + ) + return num_tmem_alloc_cols + + +def get_smem_layout_atom_ab( + major_mode: OperandMajorMode, + element_type: Type[Numeric], + smem_shape_mn_k: Tuple[int, int], + *, + loc=None, + ip=None, +) -> SmemLayoutAtomKind: + """Simple heuristics to select the optimal SMEM layout atom based on the + majorness, the data type, and the major mode size. + + :param major_mode: The major mode for the SMEM tensor is K major. + :type major_mode: OperandMajorMode + :param element_type: The element type for the SMEM tensor. + :type element_type: Type[Numeric] + :param smem_shape_mn_k: The shape of the SMEM tensor. 
+ :type smem_shape_mn_k: Tuple[int, int] + + :return: The SMEM layout atom kind + :rtype: SmemLayoutAtomKind + """ + is_k_major = major_mode == OperandMajorMode.K + major_mode_size = smem_shape_mn_k[1] if is_k_major else smem_shape_mn_k[0] + + assert major_mode_size % 8 == 0 + sw128_num_contiguous_bits = 1024 + sw64_num_contiguous_bits = 512 + sw32_num_contiguous_bits = 256 + inter_num_contiguous_bits = 128 + major_mode_size_bits = major_mode_size * element_type.width + assert major_mode_size_bits % inter_num_contiguous_bits == 0 + + if not is_k_major: + if (element_type.width == 32) and ( + major_mode_size_bits % sw128_num_contiguous_bits == 0 + ): + return SmemLayoutAtomKind.MN_SW128_32B + if major_mode_size_bits % sw128_num_contiguous_bits == 0: + return SmemLayoutAtomKind.MN_SW128 + if major_mode_size_bits % sw64_num_contiguous_bits == 0: + return SmemLayoutAtomKind.MN_SW64 + if major_mode_size_bits % sw32_num_contiguous_bits == 0: + return SmemLayoutAtomKind.MN_SW32 + return SmemLayoutAtomKind.MN_INTER + if major_mode_size_bits % sw128_num_contiguous_bits == 0: + return SmemLayoutAtomKind.K_SW128 + if major_mode_size_bits % sw64_num_contiguous_bits == 0: + return SmemLayoutAtomKind.K_SW64 + if major_mode_size_bits % sw32_num_contiguous_bits == 0: + return SmemLayoutAtomKind.K_SW32 + return SmemLayoutAtomKind.K_INTER + + +@dsl_user_op +def make_smem_layout_a( + tiled_mma: cute.TiledMma, + mma_tiler_mnk: cute.Tile, + a_dtype: Type[Numeric], + num_stages: int, + *, + loc=None, + ip=None, +) -> Union[cute.Layout, cute.ComposedLayout]: + """This function helps with: + 1. Get the partitioned shape of the A tensor based on the tiled_mma & MMA tiler. + 2. Select the heuristic SMEM layout atom based on the A tensor's majorness, the data type, and the major mode size. + 3. cute.Tile the SMEM layout atom to the MMA tile shape. + 4. Stage the SMEM layout based on the number of stages. 
+ + :param tiled_mma: The tiled MMA used to partition tensor A + :type tiled_mma: cute.TiledMma + :param mma_tiler_mnk: The MMA tile shape + :type mma_tiler_mnk: cute.cute.Tile + :param a_dtype: The element type for tensor A + :type a_dtype: Type[Numeric] + :param num_stages: The number of pipeline stages for tensor A + :type num_stages: int + + :return: SMEM layout for tensor A + :rtype: Union[cute.Layout, cute.ComposedLayout] + """ + + is_k_major = tiled_mma.op.a_major_mode == OperandMajorMode.K + a_smem_shape = tiled_mma.partition_shape_A( + cute.dice(mma_tiler_mnk, (1, None, 1), loc=loc, ip=ip) + ) + a_smem_shape_mn_k = ( + cute.size(a_smem_shape[0][0], loc=loc, ip=ip) * a_smem_shape[1], + cute.size(a_smem_shape[0][1], loc=loc, ip=ip) * a_smem_shape[2], + ) + a_smem_layout_atom = make_smem_layout_atom( + get_smem_layout_atom_ab( + tiled_mma.op.a_major_mode, + a_dtype, + a_smem_shape_mn_k, + loc=loc, + ip=ip, + ), + a_dtype, + loc=loc, + ip=ip, + ) + a_smem_layout_staged = tile_to_mma_shape( + a_smem_layout_atom, + cute.append(a_smem_shape, num_stages, loc=loc, ip=ip), + order=((1, 0, 2) if not is_k_major else (0, 1, 2)), + loc=loc, + ip=ip, + ) + return a_smem_layout_staged + + +@dsl_user_op +def make_smem_layout_b( + tiled_mma: cute.TiledMma, + mma_tiler_mnk: cute.Tile, + b_dtype: Type[Numeric], + num_stages: int, + *, + loc=None, + ip=None, +) -> Union[cute.Layout, cute.ComposedLayout]: + """This function helps: + 1. Get the partitioned shape of the B tensor based on the tiled_mma & MMA tiler. + 2. Select the heuristic SMEM layout atom based on the B tensor's majorness, the data type, and the major mode size. + 3. cute.Tile the SMEM layout atom to the MMA tile shape. + 4. Stage the SMEM layout based on the number of stages. + + :param tiled_mma: The tiled MMA which is used to partition the B tensor. + :type tiled_mma: cute.TiledMma + :param mma_tiler_mnk: The MMA tile shape. 
+ :type mma_tiler_mnk: cute.cute.Tile + :param b_dtype: The element type for the B tensor. + :type b_dtype: Type[Numeric] + :param num_stages: The stage of the B tensor. + :type num_stages: int + + :return: SMEM layout for the B tensor. + :rtype: Union[cute.Layout, cute.ComposedLayout] + """ + + is_k_major = tiled_mma.op.b_major_mode == OperandMajorMode.K + b_smem_shape = tiled_mma.partition_shape_B( + cute.dice(mma_tiler_mnk, (None, 1, 1), loc=loc, ip=ip) + ) + b_smem_shape_nk = ( + cute.size(b_smem_shape[0][0], loc=loc, ip=ip) * b_smem_shape[1], + cute.size(b_smem_shape[0][1], loc=loc, ip=ip) * b_smem_shape[2], + ) + b_smem_layout_atom = make_smem_layout_atom( + get_smem_layout_atom_ab( + tiled_mma.op.b_major_mode, + b_dtype, + b_smem_shape_nk, + loc=loc, + ip=ip, + ), + b_dtype, + loc=loc, + ip=ip, + ) + b_smem_layout_staged = tile_to_mma_shape( + b_smem_layout_atom, + cute.append(b_smem_shape, num_stages, loc=loc, ip=ip), + order=((1, 0, 2) if not is_k_major else (0, 1, 2)), + loc=loc, + ip=ip, + ) + + return b_smem_layout_staged + +@dsl_user_op +def get_smem_layout_atom_epi( + layout: LayoutEnum, + element_type: Type[Numeric], + epi_tile: cute.Tile, + *, + loc=None, + ip=None, +) -> SmemLayoutAtomKind: + """Simple heuristics to select the optimal SMEM layout atom for epilog tensors. + + :param layout: The layout enum for the SMEM tensor. + :type layout: LayoutEnum + :param element_type: The element type for the SMEM tensor. + :type element_type: Type[Numeric] + :param epi_tile: The epilogue tile shape. 
+ :type epi_tile: cute.Tile + + :return: The SMEM layout atom kind + :rtype: SmemLayoutAtomKind + """ + # Get the max contiguous tile usable by TMA + tma_shape = tuple( + ( + # assumes get<0>(epi_tile) is coalesced and unit stride + cute.coalesce(cute.right_inverse(x, loc=loc, ip=ip), loc=loc, ip=ip).shape + if isinstance(x, cute.Layout) + else x + ) + for x in epi_tile + ) + + if layout.is_m_major_c(): + # ColMajor C/D (M-major) + return get_smem_layout_atom_ab( + OperandMajorMode.MN, element_type, tma_shape, loc=loc, ip=ip + ) + else: + # RowMajor C/D (N-major) + return get_smem_layout_atom_ab( + OperandMajorMode.K, element_type, tma_shape, loc=loc, ip=ip + ) + + +@dsl_user_op +def make_smem_layout_epi( + epi_dtype: Type[Numeric], + epi_layout: LayoutEnum, + epi_tile: cute.Tile, + epi_stage: int, + *, + loc=None, + ip=None, +) -> Union[cute.Layout, cute.ComposedLayout]: + """This function helps: + 1. Select the heuristic SMEM layout atom based on the epilog tile shape, + the epilog tensor's majorness, and the element type. + 2. cute.Tile the SMEM layout atom to the epilog tile shape. + 3. Stage the SMEM layout based on the number of stages. + + :param epi_dtype: The element type for the epilog tensor. + :type epi_dtype: Type[Numeric] + :param epi_layout: The layout enum for the epilog tensor. + :type epi_layout: LayoutEnum + :param epi_tile: The epilogue tile shape. + :type epi_tile: cute.cute.Tile + :param epi_stage: The stage of the epilog tensor. 
+ :type epi_stage: int + + :return: SMEM layout for epilog tensors (usually C & D which are processed in the epilog) + :rtype: Union[cute.Layout, cute.ComposedLayout] + """ + + epilog_shape = cute.product_each( + cute.shape(epi_tile, loc=loc, ip=ip), loc=loc, ip=ip + ) + + c_smem_layout_atom = make_smem_layout_atom( + get_smem_layout_atom_epi( + epi_layout, + epi_dtype, + epi_tile, + loc=loc, + ip=ip, + ), + epi_dtype, + loc=loc, + ip=ip, + ) + epi_smem_layout_staged = cute.tile_to_shape( + c_smem_layout_atom, + cute.append(epilog_shape, epi_stage, loc=loc, ip=ip), + order=((1, 0, 2) if not epi_layout.is_n_major_c() else (0, 1, 2)), + loc=loc, + ip=ip, + ) + + return epi_smem_layout_staged + + +class SmemCapacity(Enum): + SM100_SMEM_CAPACITY_BYTES = (228 - 1) * 1024 + SM120_SMEM_CAPACITY_BYTES = (100 - 1) * 1024 + + +# Dictionary to map compute capability to SMEM capacity +SMEM_CAPACITY = { + "sm100": SmemCapacity.SM100_SMEM_CAPACITY_BYTES.value, + "sm120": SmemCapacity.SM120_SMEM_CAPACITY_BYTES.value, +} + +@dsl_user_op +def make_trivial_tiled_mma( + ab_dtype: Type[Numeric], + a_leading_mode: OperandMajorMode, + b_leading_mode: OperandMajorMode, + acc_dtype: Type[Numeric], + cta_group: CtaGroup, + mma_tiler_mn: Tuple[int, int], + a_source: OperandSource = OperandSource.SMEM, + *, + loc=None, + ip=None, +) -> cute.TiledMma: + """Make a tiled MMA atom with given data type, leading dimension, cta group and mma tile shape. + By default, the MMA atom is created with SMEM operand source for A. + + :param ab_dtype: Data type of operands A and B. + :type ab_dtype: type[Numeric] + :param a_leading_mode: Leading dimension of operand A (1 for K, 0 for M/N). + :type a_leading_mode: tcgen05.OperandMajorMode + :param b_leading_mode: Leading dimension of operand B (1 for K, 0 for M/N). + :type b_leading_mode: tcgen05.OperandMajorMode + :param acc_dtype: Data type of the accumulator. + :type acc_dtype: type[Numeric] + :param cta_group: The CTA group to use. 
+ :type cta_group: tcgen05.CtaGroup + :param mma_tiler_mn: The shape (M, N, K) of the MMA tiler. + :type mma_tiler_mn: Tuple[int, int] + :param a_source: The source of operand A (SMEM by default or TMEM). + :type a_source: OperandSource + + :return: A tiled MMA atom. + :rtype: cute.TiledMma + + :raises TypeError: If the data type is not supported. + """ + + if ab_dtype in {Float16, BFloat16}: + mma_op = MmaF16BF16Op( + ab_dtype, + acc_dtype, + (*mma_tiler_mn, 16), + cta_group, + a_source, + a_leading_mode, + b_leading_mode, + ) + elif ab_dtype in {TFloat32, Float32}: + mma_op = MmaTF32Op( + (*mma_tiler_mn, 8), + cta_group, + a_source, + a_leading_mode, + b_leading_mode, + ) + elif ab_dtype in { + Uint8, + Int8, + }: + mma_op = MmaI8Op( + ab_dtype, + (*mma_tiler_mn, 32), + cta_group, + a_source, + a_leading_mode, + b_leading_mode, + ) + elif ab_dtype in {Float8E4M3FN, Float8E5M2}: + mma_op = MmaFP8Op( + ab_dtype, + acc_dtype, + (*mma_tiler_mn, 32), + cta_group, + a_source, + a_leading_mode, + b_leading_mode, + ) + else: + raise TypeError(f"unsupported ab_dtype, got {ab_dtype}") + + return cute.make_tiled_mma(cute.make_mma_atom(mma_op)) diff --git a/python/CuTeDSL/cutlass/utils/grouped_gemm_tile_scheduler_helper.py b/python/CuTeDSL/cutlass/utils/grouped_gemm_tile_scheduler_helper.py new file mode 100644 index 00000000..a51bae62 --- /dev/null +++ b/python/CuTeDSL/cutlass/utils/grouped_gemm_tile_scheduler_helper.py @@ -0,0 +1,466 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +from typing import List, Tuple + +import cutlass.cute as cute +from cutlass.cutlass_dsl import Int32, extract_mlir_values, new_from_mlir_values +from cutlass._mlir import ir + +from cutlass.utils.static_persistent_tile_scheduler import PersistentTileSchedulerParams + + +class GroupSearchResult: + """ + The result of the group search for grouped gemm. + + :param group_idx: The result group index + :type group_idx: Int32 + :param cta_tile_idx_m: CTA tile index along M dimension after rasterization + :type cta_tile_idx_m: Int32 + :param cta_tile_idx_n: CTA tile index along N dimension after rasterization + :type cta_tile_idx_n: Int32 + :param problem_shape_m: The M dimension of the gemm problem + :type problem_shape_m: Int32 + :param problem_shape_n: The N dimension of the gemm problem + :type problem_shape_n: Int32 + :param problem_shape_k: The K dimension of the gemm problem + :type problem_shape_k: Int32 + :param cta_tile_count_k: Number of tiles along K dimension + :type cta_tile_count_k: Int32 + """ + + def __init__( + self, + group_idx: Int32, + cta_tile_idx_m: Int32, + cta_tile_idx_n: Int32, + problem_shape_m: Int32, + problem_shape_n: Int32, + problem_shape_k: Int32, + cta_tile_count_k: Int32, + ) -> None: + self.group_idx = group_idx + self.cta_tile_idx_m = cta_tile_idx_m + self.cta_tile_idx_n = cta_tile_idx_n + self.problem_shape_m = problem_shape_m + self.problem_shape_n = problem_shape_n + self.problem_shape_k = problem_shape_k + self.cta_tile_count_k = cta_tile_count_k + + def __extract_mlir_values__(self) -> List[ir.Value]: + values = extract_mlir_values(self.group_idx) + values.extend(extract_mlir_values(self.cta_tile_idx_m)) + values.extend(extract_mlir_values(self.cta_tile_idx_n)) + values.extend(extract_mlir_values(self.problem_shape_m)) + values.extend(extract_mlir_values(self.problem_shape_n)) + values.extend(extract_mlir_values(self.problem_shape_k)) + values.extend(extract_mlir_values(self.cta_tile_count_k)) + return values + + def 
__new_from_mlir_values__(self, values: List[ir.Value]) -> "GroupSearchResult": + assert len(values) == 7 + return GroupSearchResult(*tuple(values)) + + +class GroupedGemmGroupSearchState: + """ + The state of group index search for grouped gemm. + + The state will be initialized once and updated in every round of group index search. + + :param start_group_idx: The group idx to start the search with + :type start_group_idx: Int32 + :param tile_count_prev_group: Number of tiles before the matched group + :type tile_count_prev_group: Int32 + :param tile_count_searched: Number of tiles we have searched. When the matched group is found, + it records the number of tiles including the matched group + :type tile_count_searched: Int32 + """ + + def __init__( + self, + start_group_idx: Int32, + tile_count_prev_group: Int32, + tile_count_searched: Int32, + ) -> None: + self.start_group_idx = start_group_idx + self.tile_count_prev_group = tile_count_prev_group + self.tile_count_searched = tile_count_searched + + def __extract_mlir_values__(self) -> List[ir.Value]: + values = extract_mlir_values(self.start_group_idx) + values.extend(extract_mlir_values(self.tile_count_prev_group)) + values.extend(extract_mlir_values(self.tile_count_searched)) + return values + + def __new_from_mlir_values__( + self, values: List[ir.Value] + ) -> "GroupedGemmGroupSearchState": + start_group_idx = new_from_mlir_values(self.start_group_idx, [values[0]]) + tile_count_prev_group = new_from_mlir_values( + self.tile_count_prev_group, [values[1]] + ) + tile_count_searched = new_from_mlir_values( + self.tile_count_searched, [values[2]] + ) + return GroupedGemmGroupSearchState( + start_group_idx, tile_count_prev_group, tile_count_searched + ) + + +def create_initial_search_state() -> GroupedGemmGroupSearchState: + """ + Create an initial search state for grouped gemm. 
+ + :return: A new search state with initial values + :rtype: GroupedGemmGroupSearchState + """ + return GroupedGemmGroupSearchState( + start_group_idx=Int32(0), + tile_count_prev_group=Int32(0), + tile_count_searched=Int32(0), + ) + + +class GroupedGemmTileSchedulerHelper: + """ + A helper to translate the raw block index (x, y, z) from tile scheduler to real CTA tile index for grouped gemm. + + :param group_count: Number of groups in current grouped gemm problem + :type group_count: int + :param tile_sched_params: Parameter used to create the tile scheduler this helper works with + :type tile_sched_params: PersistentTileSchedulerParams + :param cluster_tile_shape_mnk: The shape of cluster tile as (m, n, k) + :type cluster_tile_shape_mnk: tuple[int, int, int] + :param search_state: The initial search state + :type search_state: GroupedGemmGroupSearchState + """ + + def __init__( + self, + group_count: int, + tile_sched_params: PersistentTileSchedulerParams, + cluster_tile_shape_mnk: tuple[int, int, int], + search_state: GroupedGemmGroupSearchState, + ) -> None: + self.tile_sched_params = tile_sched_params + self.group_count = group_count + self.lane_idx = cute.arch.lane_idx() + self.cluster_tile_shape_mnk = cluster_tile_shape_mnk + self.search_state = search_state + + def __extract_mlir_values__(self) -> List[ir.Value]: + values = extract_mlir_values(self.tile_sched_params) + values.extend(extract_mlir_values(self.search_state)) + return values + + def __new_from_mlir_values__( + self, values: List[ir.Value] + ) -> "GroupedGemmTileSchedulerHelper": + tile_sched_params = new_from_mlir_values(self.tile_sched_params, values) + search_state = new_from_mlir_values(self.search_state, values[1:]) + return GroupedGemmTileSchedulerHelper( + self.group_count, + tile_sched_params, + self.cluster_tile_shape_mnk, + search_state, + ) + + def delinearize_z( + self, + cta_tile_coord: tuple, + problem_shape_mnkl: cute.Tensor, + ) -> GroupSearchResult: + """ + Delinearize the 
linear z index and return GroupSearchResult. + + This function should be used by warps that need to know the CTA tile index on M and N dimensions. + + :param cta_tile_coord: The raw CTA coordinate from tile scheduler + :type cta_tile_coord: tuple of Int32 + :param problem_shape_mnkl: Tensor containing gemm problem size (M, N, K, L) for each group + :type problem_shape_mnkl: cute.Tensor + :return: The search result containing group index and tile coordinates + :rtype: GroupSearchResult + """ + # delinear the z coord + linear_idx = cta_tile_coord[2] + group_idx, problem_mnkl = self._group_search_and_load_problem_shape( + linear_idx, + problem_shape_mnkl, + self.search_state.start_group_idx, + self.search_state.tile_count_prev_group, + ) + # linear index local to current group + cluster_tile_idx_in_current_group = ( + linear_idx - self.search_state.tile_count_prev_group + ) + cluster_count_m, cluster_count_n, cluster_count_k = cute.ceil_div( + (problem_mnkl[0], problem_mnkl[1], problem_mnkl[2]), + ( + self.cluster_tile_shape_mnk[0], + self.cluster_tile_shape_mnk[1], + self.cluster_tile_shape_mnk[2], + ), + ) + # decompose to get indices on M and N + cta_tile_idx_m, cta_tile_idx_n = self._compute_cta_tile_coord( + cluster_tile_idx_in_current_group, + cta_tile_coord, + cluster_count_m, + cluster_count_n, + ) + return GroupSearchResult( + group_idx, + cta_tile_idx_m, + cta_tile_idx_n, + problem_mnkl[0], + problem_mnkl[1], + problem_mnkl[2], + cluster_count_k, + ) + + def search_cluster_tile_count_k( + self, + cta_tile_coord: tuple, + problem_shape_mnkl: cute.Tensor, + ) -> Tuple[Int32, Int32]: + """ + Search the matched group for given linear index and compute the number of tiles along K dimension for the matched group. + + This function should be used by warps that are only interested in the number of tiles along K dimension. 
+ + :param cta_tile_coord: The raw CTA coordinate from tile scheduler + :type cta_tile_coord: tuple of Int32 + :param problem_shape_mnkl: Tensor containing gemm problem size (M, N, K, L) for all groups + :type problem_shape_mnkl: cute.Tensor + :return: A tuple containing cluster count along K dimension and the group index + :rtype: Tuple[Int32, Int32] + """ + group_idx, problem_mnk = self._group_search_and_load_problem_shape( + cta_tile_coord[2], + problem_shape_mnkl, + self.search_state.start_group_idx, + self.search_state.tile_count_prev_group, + ) + cluster_count_k = ( + problem_mnk[2] + self.cluster_tile_shape_mnk[2] - 1 + ) // self.cluster_tile_shape_mnk[2] + return cluster_count_k, group_idx + + @cute.jit + def _prefix_sum(self, value_per_thread: Int32) -> Int32: + """ + Perform prefix sum within a full warp. + + :param value_per_thread: The value for this thread to contribute to the prefix sum + :type value_per_thread: Int32 + :return: The prefix sum result for this thread + :rtype: Int32 + """ + clamp_value = 0 + idx = 1 + sum_per_thread = value_per_thread + while idx < cute.arch.WARP_SIZE: + value = cute.arch.shuffle_sync_up( + sum_per_thread, idx, mask_and_clamp=clamp_value + ) + if self.lane_idx >= idx: + sum_per_thread += value + idx = idx << 1 + return sum_per_thread + + def _get_problem_for_group( + self, problem_shape_mnkl: cute.Tensor, group_idx: Int32 + ) -> cute.Tensor: + """ + Load gemm problem (m,n,k,l) for the specified group from global memory to register. 
+ + :param problem_shape_mnkl: Tensor in global memory with layout (group_count, 4):(4, 1) + :type problem_shape_mnkl: cute.Tensor + :param group_idx: The index of the group to load + :type group_idx: Int32 + :return: The problem shape tensor for the specified group + :rtype: cute.Tensor + """ + cur_problem_mnkl = cute.make_fragment( + cute.make_layout(4), problem_shape_mnkl.element_type + ) + cute.autovec_copy(problem_shape_mnkl[(group_idx, None)], cur_problem_mnkl) + return cur_problem_mnkl + + def _get_cluster_tile_count_mn(self, problem_shape: cute.Tensor) -> Int32: + """ + Compute total cluster count. + + :param problem_shape: Tensor containing problem shape (m, n, k, l) + :type problem_shape: cute.Tensor + :return: The total cluster tile count for M and N dimensions + :rtype: Int32 + """ + cur_ntile_m = ( + problem_shape[0] + self.cluster_tile_shape_mnk[0] - 1 + ) // self.cluster_tile_shape_mnk[0] + cur_ntile_n = ( + problem_shape[1] + self.cluster_tile_shape_mnk[1] - 1 + ) // self.cluster_tile_shape_mnk[1] + cur_ntile_mn = cur_ntile_m * cur_ntile_n + return cur_ntile_mn + + def _compute_cta_tile_coord( + self, + cluster_tile_idx: Int32, + cta_tile_coord_in_cluster: tuple, + cluster_tile_count_m: Int32, + cluster_tile_count_n: Int32, + ) -> tuple: + """ + Compute CTA tile indices along M and N dimensions based on the linear index within a group. + + It uses the AlongM mode to decompose the linear index onto M and N dimensions. 
+ + :param cluster_tile_idx: The linear index within a group + :type cluster_tile_idx: Int32 + :param cta_tile_coord_in_cluster: CTA indices along M and N dimensions within a cluster + :type cta_tile_coord_in_cluster: tuple of Int32 + :param cluster_tile_count_m: The number of clusters along M dimension of the matched group + :type cluster_tile_count_m: Int32 + :param cluster_tile_count_n: The number of clusters along N dimension of the matched group + :type cluster_tile_count_n: Int32 + :return: A tuple containing CTA tile indices along M and N dimensions + :rtype: tuple of (Int32, Int32) + """ + cluster_layout_mn = cute.make_layout( + (cluster_tile_count_m, cluster_tile_count_n) + ) + (mi, ni) = cluster_layout_mn.get_hier_coord(cluster_tile_idx) + cta_tile_idx_m = ( + mi * self.tile_sched_params.cluster_shape_mn[0] + + cta_tile_coord_in_cluster[0] + ) + cta_tile_idx_n = ( + ni * self.tile_sched_params.cluster_shape_mn[1] + + cta_tile_coord_in_cluster[1] + ) + return (cta_tile_idx_m, cta_tile_idx_n) + + @cute.jit + def _group_search( + self, + linear_idx: Int32, + problem_shape_mnkl: cute.Tensor, + init_group_idx: Int32, + init_tile_count_searched: Int32, + ) -> GroupedGemmGroupSearchState: + """ + Search which group the linear index belongs to. 
+ + :param linear_idx: The linear index to be decomposed + :type linear_idx: Int32 + :param problem_shape_mnkl: Tensor containing gemm problem size (M, N, K, L) for all groups + :type problem_shape_mnkl: cute.Tensor + :param init_group_idx: The group idx to start the search with + :type init_group_idx: Int32 + :param init_tile_count_searched: The number of tiles we have searched + :type init_tile_count_searched: Int32 + :return: The updated search state + :rtype: GroupedGemmGroupSearchState + """ + c_0 = Int32(0).ir_value() + last_lane_idx = cute.arch.WARP_SIZE - 1 + + tile_count_searched = init_tile_count_searched + start_group_idx = init_group_idx + not_found = linear_idx >= tile_count_searched + tile_count_prev_group = self.search_state.tile_count_prev_group + while not_found: + # get group to search for current lane + cur_group_idx = start_group_idx + self.lane_idx + # check if the group to be checked is out of range + inside_group_bound = cur_group_idx < self.group_count + cur_ntile_mn = c_0 + if inside_group_bound: + # get problem size of current group + cur_problem_mnkl = self._get_problem_for_group( + problem_shape_mnkl, cur_group_idx + ) + cur_ntile_mn = self._get_cluster_tile_count_mn(cur_problem_mnkl) + # compute tile count from beginning to current group(included) + total_cluster_tile_count_ps_per_thread = self._prefix_sum(cur_ntile_mn) + cluster_tile_count_end_per_thread = ( + total_cluster_tile_count_ps_per_thread + tile_count_searched + ) + + group_not_in_window = linear_idx >= cluster_tile_count_end_per_thread + hitted_group_idx_in_search_window = cute.arch.popc( + cute.arch.vote_ballot_sync(group_not_in_window) + ) + not_found = hitted_group_idx_in_search_window == cute.arch.WARP_SIZE + start_group_idx = hitted_group_idx_in_search_window + start_group_idx + hit_the_1st_problem_in_search_window = ( + hitted_group_idx_in_search_window == c_0 + ) + tile_count_prev_group = tile_count_searched + if hit_the_1st_problem_in_search_window == False: + 
tile_count_prev_group = cute.arch.shuffle_sync( + cluster_tile_count_end_per_thread, + hitted_group_idx_in_search_window - 1, + ) + + # If no matched group, then get new_cluster_tile_count_end from last lane + # Otherwise, get new_cluster_tile_count_end from the hitted group + lane_idx_for_cluster_tile_count_end = hitted_group_idx_in_search_window + if not_found: + lane_idx_for_cluster_tile_count_end = last_lane_idx + tile_count_searched = cute.arch.shuffle_sync( + cluster_tile_count_end_per_thread, + lane_idx_for_cluster_tile_count_end, + ) + + return GroupedGemmGroupSearchState( + start_group_idx, + tile_count_prev_group, + tile_count_searched, + ) + + def _group_search_and_load_problem_shape( + self, + linear_idx: Int32, + problem_shape_mnkl: cute.Tensor, + start_group_idx: Int32, + tile_count_searched: Int32, + ) -> Tuple[Int32, cute.Tensor]: + """ + Perform group search and load problem shape for the matched group. + + :param linear_idx: The linear index to be decomposed + :type linear_idx: Int32 + :param problem_shape_mnkl: Tensor containing gemm problem size (M, N, K, L) for all groups + :type problem_shape_mnkl: cute.Tensor + :param start_group_idx: The group idx to start the search with + :type start_group_idx: Int32 + :param tile_count_searched: The number of tiles we have searched + :type tile_count_searched: Int32 + :return: A tuple containing the final group index and the problem shape tensor + :rtype: Tuple[Int32, cute.Tensor] + """ + self.search_state = self._group_search( + linear_idx, + problem_shape_mnkl, + start_group_idx, + tile_count_searched, + ) + # get final group search state + final_group_idx = self.search_state.start_group_idx + # let's revisit if it's better to broadcast problem_shape_mnk in group_search + problem_mnkl = self._get_problem_for_group(problem_shape_mnkl, final_group_idx) + return final_group_idx, problem_mnkl diff --git a/python/CuTeDSL/cutlass/utils/hardware_info.py b/python/CuTeDSL/cutlass/utils/hardware_info.py new file 
mode 100644 index 00000000..e86fcbef --- /dev/null +++ b/python/CuTeDSL/cutlass/utils/hardware_info.py @@ -0,0 +1,174 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from cuda.bindings import driver, nvrtc + +import cutlass.cute as cute + +""" +This class is used to get the hardware info of given GPU device. +It provides methods to get the max active clusters for given cluster size. + +Prerequisite: +- CUDA driver is initialized via `driver.cuInit` or other CUDA APIs. +- CUDA context is created via `driver.cuCtxCreate` or other CUDA APIs. + +""" + + +class HardwareInfo: + """ + device_id: CUDA device ID to get the hardware info. 
+ """ + + def __init__(self, device_id: int = 0): + count = self._checkCudaErrors(driver.cuDeviceGetCount()) + if device_id >= count: + raise ValueError( + f"Device ID {device_id} is out of range for device count {count}" + ) + self.device_id = device_id + self.device = self._checkCudaErrors(driver.cuDeviceGet(device_id)) + self.context = self._checkCudaErrors(driver.cuCtxGetCurrent()) + self.driver_version = self._checkCudaErrors(driver.cuDriverGetVersion()) + + # Getting the max active clusters for a given cluster size + def get_max_active_clusters(self, cluster_size: int) -> int: + self._get_device_function() + if self._cuda_driver_version_lt(11, 8): + raise RuntimeError( + "CUDA Driver version < 11.8, cannot get _max_active_clusters" + ) + if cluster_size <= 0 or cluster_size > 32: + raise ValueError( + f"Cluster size must be between 1 and 32, {cluster_size} is not supported" + ) + + max_shared_memory_per_block = self._checkCudaErrors( + driver.cuDeviceGetAttribute( + driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, + self.device, + ) + ) + self._checkCudaErrors( + driver.cuFuncSetAttribute( + self.kernel, + driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + max_shared_memory_per_block, + ) + ) + max_dynamic_shared_memory = self._checkCudaErrors( + driver.cuOccupancyAvailableDynamicSMemPerBlock( + self.kernel, 1, 1 # numBlocks # blockSize + ) + ) + max_active_blocks = self._checkCudaErrors( + driver.cuOccupancyMaxActiveBlocksPerMultiprocessor( + self.kernel, 1, max_dynamic_shared_memory # blockSize, + ) + ) + # allow non-portable cluster size to support detection of non-portable cluster size + self._checkCudaErrors( + driver.cuFuncSetAttribute( + self.kernel, + driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, + 1, + ) + ) + # prepare launch configuration + launch_config = driver.CUlaunchConfig() + launch_config.blockDimX = 128 + launch_config.blockDimY = 1 + 
launch_config.blockDimZ = 1 + launch_config.sharedMemBytes = max_dynamic_shared_memory + launch_config.numAttrs = 1 + # max possible cluster size is 32 + cluster_dims_attr = driver.CUlaunchAttribute() + cluster_dims_attr.id = ( + driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION + ) + value = driver.CUlaunchAttributeValue() + value.clusterDim.x = cluster_size + value.clusterDim.y = 1 + value.clusterDim.z = 1 + cluster_dims_attr.value = value + launch_config.attrs = [cluster_dims_attr] + launch_config.gridDimX = cluster_size + launch_config.gridDimY = max_active_blocks + launch_config.gridDimZ = 1 + + num_clusters = self._checkCudaErrors( + driver.cuOccupancyMaxActiveClusters(self.kernel, launch_config) + ) + return num_clusters + + def get_l2_cache_size_in_bytes(self) -> int: + return self._checkCudaErrors( + driver.cuDeviceGetAttribute( + driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, + self.device, + ) + ) + + def get_device_multiprocessor_count(self) -> int: + return self._checkCudaErrors( + driver.cuDeviceGetAttribute( + driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, + self.device, + ) + ) + + def _checkCudaErrors(self, result) -> None: + if result[0].value: + raise RuntimeError( + "CUDA error code={}({})".format( + result[0].value, self._cudaGetErrorEnum(result[0]) + ) + ) + # CUDA APIs always return the status as the first element of the result tuple + if len(result) == 1: + return None + elif len(result) == 2: + return result[1] + else: + return result[1:] + + def _cudaGetErrorEnum(self, error) -> str: + if isinstance(error, driver.CUresult): + err, name = driver.cuGetErrorName(error) + return name if err == driver.CUresult.CUDA_SUCCESS else "" + elif isinstance(error, nvrtc.nvrtcResult): + return nvrtc.nvrtcGetErrorString(error)[1] + else: + raise RuntimeError("Unknown error type: {}".format(error)) + + def _cuda_driver_version_ge(self, major: int, minor: int) -> bool: + return self.driver_version >= (major 
* 1000 + 10 * minor)
+
+    def _cuda_driver_version_lt(self, major: int, minor: int) -> bool:
+        return not self._cuda_driver_version_ge(major, minor)
+
+    @cute.kernel
+    def _empty_kernel(self):
+        return
+
+    @cute.jit
+    def _host_function(self):
+        self._empty_kernel().launch(
+            grid=[1, 1, 1],
+            block=[1, 1, 1],
+        )
+
+    # get an empty kernel to compute occupancy
+    def _get_device_function(self) -> None:
+        self.compiled_kernel = cute.compile(self._host_function)
+        self.module = next(iter(self.compiled_kernel.cuda_modules.modules)).cuda_module
+        self.kernel = next(iter(self.compiled_kernel.cuda_modules.modules)).kernel_ptr
diff --git a/python/CuTeDSL/cutlass/utils/hopper_helpers.py b/python/CuTeDSL/cutlass/utils/hopper_helpers.py
new file mode 100644
index 00000000..d29daf50
--- /dev/null
+++ b/python/CuTeDSL/cutlass/utils/hopper_helpers.py
@@ -0,0 +1,195 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+#
+# Use of this software is governed by the terms and conditions of the
+# NVIDIA End User License Agreement (EULA), available at:
+# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
+#
+# Any use, reproduction, disclosure, or distribution of this software
+# and related documentation outside the scope permitted by the EULA
+# is strictly prohibited.
+ +from typing import Type, Tuple +from enum import Enum + +from cutlass.utils.layout import LayoutEnum +from cutlass.cutlass_dsl import ( + Float16, + BFloat16, + Float8E5M2, + Float8E4M3FN, + Numeric, + NumericMeta, + dsl_user_op, +) + +import cutlass +import cutlass.cute as cute +from cutlass.cute.nvgpu.common import CopyUniversalOp +from cutlass.cute.nvgpu.warp import StMatrix8x8x16bOp +from cutlass.cute.nvgpu.warpgroup import ( + MmaF16BF16Op, + MmaF8Op, + OperandMajorMode, + OperandSource, +) + +@dsl_user_op +def sm90_get_smem_store_op( + layout_d: LayoutEnum, + elem_ty_d: Type[Numeric], + elem_ty_acc: Type[Numeric], + *, + loc=None, + ip=None, +) -> cute.CopyAtom: + """ + Selects the largest vectorized smem store atom available subject to constraint of gmem layout. + + Parameters: + ----------- + layout_d : LayoutEnum + The layout enum of the output tensor D. + + elem_ty_d : Type[Numeric] + The element type for output tensor D. + + elem_ty_acc : Type[Numeric] + The element type for accumulator. + + Returns: + -------- + Either SmemStoreMatrix or SimtSyncCopy, based on the input parameters. 
+ """ + + def validate_type(ty, ty_name): + if not isinstance(ty, NumericMeta): + raise TypeError(f"{ty_name} must be a Numeric, but got {ty}") + + validate_type(elem_ty_d, "elem_ty_d") + validate_type(elem_ty_acc, "elem_ty_acc") + + is_m_major = layout_d.is_m_major_c() + + if elem_ty_d.width == 16: + return cute.make_copy_atom( + StMatrix8x8x16bOp(is_m_major, 4), elem_ty_d, loc=loc, ip=ip + ) + else: + return cute.make_copy_atom(CopyUniversalOp(), elem_ty_d, loc=loc, ip=ip) + + +class SmemCapacity(Enum): + SM90_SMEM_CAPACITY_BYTES = (228 - 1) * 1024 + + +# Dictionary to map compute capability to SMEM capacity +SMEM_CAPACITY = { + "sm90": SmemCapacity.SM90_SMEM_CAPACITY_BYTES.value, +} + +def make_trivial_tiled_mma( + a_dtype: Type[Numeric], + b_dtype: Type[Numeric], + a_leading_mode: OperandMajorMode, + b_leading_mode: OperandMajorMode, + acc_dtype: Type[Numeric], + atom_layout_mnk: Tuple[int, int, int], + tiler_mn: Tuple[int, int], +) -> cute.TiledMma: + """Make a tiled MMA atom with given data type, leading dimension, cta group and mma tile shape. + By default, the MMA atom is created with SMEM operand source for A. + + :param a_dtype: Data type of operand A. + :type a_dtype: type[Numeric] + :param b_dtype: Data type of operand B. + :type b_dtype: type[Numeric] + :param a_leading_mode: Leading dimension of operand A (1 for K, 0 for M/N). + :type a_leading_mode: warpgroup.OperandMajorMode + :param b_leading_mode: Leading dimension of operand B (1 for K, 0 for M/N). + :type b_leading_mode: warpgroup.OperandMajorMode + :param acc_dtype: Data type of the accumulator. + :type acc_dtype: type[Numeric] + :param atom_layout_mnk: A integer tuple describing the tiling of Atom across threads. + :type atom_layout_mnk: Tuple[int, int, int] + :param tiler_mn: The shape (M, N) of the cta tiler. + :type tiler_mn: Tuple[int, int] + + :return: A tiled MMA atom. + :rtype: cute.TiledMma + + :raises TypeError: If the data type is not supported. 
+ """ + + if a_dtype in {Float16, BFloat16}: + if cutlass.const_expr(a_dtype != b_dtype): + raise TypeError(f"Type mismatch: {a_dtype} != {b_dtype}") + if cutlass.const_expr(a_dtype.width != b_dtype.width): + raise TypeError(f"Type width mismatch: {a_dtype.width} != {b_dtype.width}") + + mma_op = MmaF16BF16Op( + a_dtype, + acc_dtype, + (*tiler_mn, 16), + OperandSource.SMEM, + a_leading_mode, + b_leading_mode, + ) + elif a_dtype in {Float8E4M3FN, Float8E5M2} and b_dtype in { + Float8E4M3FN, + Float8E5M2, + }: + mma_op = MmaF8Op( + a_dtype, + b_dtype, + acc_dtype, + (*tiler_mn, 32), + OperandSource.SMEM, + a_leading_mode, + b_leading_mode, + ) + else: + raise TypeError(f"unsupported a_dtype and b_dtype, got {a_dtype} and {b_dtype}") + + return cute.make_tiled_mma(cute.make_mma_atom(mma_op), atom_layout_mnk) + +def get_smem_layout_atom( + layout: LayoutEnum, + element_type: Type[Numeric], + major_mode_size: int, + *, + loc=None, + ip=None, +): + """Select the optimal shared memory layout atom based on parameters. 
+ + :param layout: Layout enum of the tensor + :type layout: LayoutEnum + :param element_type: Data type of the elements + :type element_type: type[cutlass.Numeric] + :param major_mode_size: Size of the major mode dimension + :type major_mode_size: int + + :return: Selected shared memory layout atom kind + :rtype: cute.nvgpu.warpgroup.SmemLayoutAtomKind + """ + assert major_mode_size % 8 == 0 + sw128_num_contiguous_bits = 1024 + sw64_num_contiguous_bits = 512 + sw32_num_contiguous_bits = 256 + major_mode_size_bits = major_mode_size * element_type.width + if layout.sm90_mma_major_mode() == OperandMajorMode.MN: + if major_mode_size_bits % sw128_num_contiguous_bits == 0: + return cute.nvgpu.warpgroup.SmemLayoutAtomKind.MN_SW128 + if major_mode_size_bits % sw64_num_contiguous_bits == 0: + return cute.nvgpu.warpgroup.SmemLayoutAtomKind.MN_SW64 + if major_mode_size_bits % sw32_num_contiguous_bits == 0: + return cute.nvgpu.warpgroup.SmemLayoutAtomKind.MN_SW32 + return cute.nvgpu.warpgroup.SmemLayoutAtomKind.MN_INTER + if major_mode_size_bits % sw128_num_contiguous_bits == 0: + return cute.nvgpu.warpgroup.SmemLayoutAtomKind.K_SW128 + if major_mode_size_bits % sw64_num_contiguous_bits == 0: + return cute.nvgpu.warpgroup.SmemLayoutAtomKind.K_SW64 + if major_mode_size_bits % sw32_num_contiguous_bits == 0: + return cute.nvgpu.warpgroup.SmemLayoutAtomKind.K_SW32 + return cute.nvgpu.warpgroup.SmemLayoutAtomKind.K_INTER diff --git a/python/CuTeDSL/cutlass/utils/layout.py b/python/CuTeDSL/cutlass/utils/layout.py new file mode 100644 index 00000000..a1261d4d --- /dev/null +++ b/python/CuTeDSL/cutlass/utils/layout.py @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from enum import Enum + +import cutlass.cute as cute +from cutlass.cute.nvgpu import warpgroup +from cutlass.cute.nvgpu import tcgen05 + + +class LayoutEnum(Enum): + ROW_MAJOR = "row_major" + COL_MAJOR = "col_major" + + def mma_major_mode(self): + return ( + tcgen05.OperandMajorMode.K + if self == LayoutEnum.ROW_MAJOR + else tcgen05.OperandMajorMode.MN + ) + + def sm90_mma_major_mode(self): + return ( + warpgroup.OperandMajorMode.K + if self == LayoutEnum.ROW_MAJOR + else warpgroup.OperandMajorMode.MN + ) + + def is_k_major_a(self): + return self == LayoutEnum.ROW_MAJOR + + def is_m_major_a(self): + return self == LayoutEnum.COL_MAJOR + + def is_k_major_b(self): + return self == LayoutEnum.COL_MAJOR + + def is_n_major_b(self): + return self == LayoutEnum.ROW_MAJOR + + def is_n_major_c(self): + return self == LayoutEnum.ROW_MAJOR + + def is_m_major_c(self): + return self == LayoutEnum.COL_MAJOR + + @staticmethod + def from_tensor(tensor: cute.Tensor) -> "LayoutEnum": + ret = None + if tensor.leading_dim == 1: + ret = LayoutEnum.ROW_MAJOR + elif tensor.leading_dim == 0: + ret = LayoutEnum.COL_MAJOR + else: + raise ValueError(f"Invalid leading dimension: {tensor.leading_dim}") + + return ret + + +__all__ = ["LayoutEnum"] diff --git a/python/CuTeDSL/cutlass/utils/pipeline.py b/python/CuTeDSL/cutlass/utils/pipeline.py new file mode 100644 index 00000000..a339a3e3 --- /dev/null +++ b/python/CuTeDSL/cutlass/utils/pipeline.py @@ -0,0 +1,984 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +import enum +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Optional + +from cutlass.cutlass_dsl import Boolean, Int32, Int64, T, if_generate, and_, or_ + +import cutlass._mlir.dialects.cute as _cute_ir + +import cutlass.cute as cute + + +############################################################################## +# Agent class +############################################################################## + + +class Agent(enum.Enum): + """ + Agent indicates what is participating in the pipeline synchronization. + """ + # Arbitrary grouping of N threads + Thread = enum.auto() + # Same as AsyncThread, but includes all threads in the block + ThreadBlock = enum.auto() + # Same as AsyncThread, but includes all threads in the cluster + ThreadBlockCluster = enum.auto() + + +class CooperativeGroup: + """ + CooperativeGroup contains size and alignment restrictions for an Agent. + """ + def __init__(self, agent: Agent, size: int = 1, alignment: int = 1): + if agent is Agent.Thread: + assert size > 0 + if size == 32: + assert ( + size == alignment + ), "Error: Alignment does not match number of threads in a warp." + elif size == 128: + assert ( + size == alignment + ), "Error: Alignment does not match number of threads in a warpgroup." + elif agent is Agent.ThreadBlock: + assert False, "Error: Not yet supported." + elif agent is Agent.ThreadBlockCluster: + assert False, "Error: Not yet supported." 
+ else: + # Should never reach this state + size = 0 + + if size <= 0: + raise ValueError( + "Error: The number of threads in a CooperativeGroup must be more than 0." + ) + + # Size indicates how many threads are participating in this CooperativeGroup + self.size = size + # Agent indicates the type of thread group + self.agent = agent + + +class _PipelineOp(enum.Enum): + """ + PipelineOp assigns an operation to an agent corresponding to a specific hardware feature. + """ + # async-threads + AsyncThread = enum.auto() + # Blackwell (SM100a) MMA instruction + TCGen05Mma = enum.auto() + # Tensor Memory Accelerator load + TmaLoad = enum.auto() + # TMA Store consuming smem produced by AsyncThread + TmaStore = enum.auto() + + +def _get_pipeline_op(type_str): + return _PipelineOp(type_str) + + +############################################################################## +# SyncObjectArray class +############################################################################## + + +class SyncObjectArray(ABC): + """ + SyncObjectArray is an abstract base class for different types of hardware synchronizations (e.g. smem barriers, named barriers, fences) + """ + + @abstractmethod + def wait(self): + pass + + @abstractmethod + def arrive(self): + pass + + @abstractmethod + def get_barrier(self): + pass + + +class MbarrierArray(SyncObjectArray): + """ + MbarrierArray implements an abstraction for an array of smem barriers. 
+ """ + + def __init__( + self, + barrier_storage: cute.Pointer, + num_stages: int, + agent: tuple[_PipelineOp, CooperativeGroup], + tx_count: int = 0, + ): + self.barrier_storage = barrier_storage + self.tx_count = tx_count + self.num_stages = num_stages + self.op_type, self.cg = agent + self.arrive_count = self.cg.size + + if self.num_stages <= 0: + raise ValueError("Error: Mbarrier stage count must be greater than 0.") + if self.arrive_count <= 0: + raise ValueError("Error: Mbarrier arrive count must be greater than 0.") + if self.op_type is _PipelineOp.TmaLoad and self.tx_count <= 0: + raise ValueError( + "Error: Mbarrier tx count must be greater than 0 for TMA ops." + ) + + # Using a tensor to store mbarrier i64 ptrs + self.mbarrier_array = cute.make_fragment(cute.make_layout(num_stages), Int64) + for i in range(num_stages): + self.mbarrier_array[i] = _cute_ir.ptrtoint( + T.i64(), (self.barrier_storage + i).value + ) + + # Mbarrier initialization in constructor + self.mbarrier_init() + + # Mbarrier initialization + def mbarrier_init(self): + """ + Initializes an array of mbarriers using warp 0. + """ + def then_body(): + for index in range(self.num_stages): + cute.arch.mbarrier_init_arrive_cnt( + _mbarrier_i64_to_ptr(self.mbarrier_array[index]), self.arrive_count + ) + + warp_idx = cute.arch.warp_idx() + warp_idx = cute.arch.make_warp_uniform(warp_idx) + + if_generate(warp_idx == 0, then_body) + + def arrive(self, index: int, dst: int): + """ + Select the arrive corresponding to this MbarrierArray's PipelineOp + :param index: Index of the mbarrier in the array to arrive on + :type index: int + :param dst: Destination parameter for selective arrival, which can be either a mask or destination cta rank. When None, both TCGen05Mma and AsyncThread will arrive on their local mbarrier. + - For TCGen05Mma, dst serves as a multicast mask (e.g., 0b1011 allows arrive signal to be multicast to CTAs in the cluster with rank = 0, 1, and 3). 
+ - For AsyncThread, dst serves as a destination cta rank (e.g., 3 means threads will arrive on the mbarrier with rank = 3 in the cluster). + :type dst: int | None + """ + if self.op_type is _PipelineOp.AsyncThread: + self.arrive_mbarrier(index, dst) + elif self.op_type is _PipelineOp.TCGen05Mma: + self.arrive_tcgen05mma(index, dst) + elif self.op_type in [_PipelineOp.TmaLoad]: + self.arrive_and_expect_tx(index, self.tx_count) + else: + print(_get_pipeline_op(self.op_type)) + assert False, "Error: MbarrierArray is not supported for this PipelineOp." + + def arrive_mbarrier(self, index: int, dst_rank: int): + if dst_rank is None: + cute.arch.mbarrier_arrive(_mbarrier_i64_to_ptr(self.mbarrier_array[index])) + else: + cute.arch.mbarrier_arrive( + _mbarrier_i64_to_ptr(self.mbarrier_array[index]), dst_rank + ) + + def arrive_tcgen05mma(self, index: int, mask: int): + if mask is None: + with cute.arch.elect_one(): + cute.nvgpu.tcgen05.commit( + _mbarrier_i64_to_ptr(self.mbarrier_array[index]) + ) + else: + with cute.arch.elect_one(): + cute.nvgpu.tcgen05.commit( + _mbarrier_i64_to_ptr(self.mbarrier_array[index]), + mask, + cute.nvgpu.tcgen05.CtaGroup.TWO, + ) + + def arrive_and_expect_tx(self, index: int, tx_count: int): + with cute.arch.elect_one(): + cute.arch.mbarrier_init_tx_bytes( + _mbarrier_i64_to_ptr(self.mbarrier_array[index]), tx_count + ) + + def try_wait(self, index: int, phase: int): + return cute.arch.mbarrier_try_wait( + _mbarrier_i64_to_ptr(self.mbarrier_array[index]), phase + ) + + def wait(self, index: int, phase: int): + cute.arch.mbarrier_wait(_mbarrier_i64_to_ptr(self.mbarrier_array[index]), phase) + + def get_barrier(self, index: int) -> cute.Pointer: + return _mbarrier_i64_to_ptr(self.mbarrier_array[index]) + + +class TmaStoreFence(SyncObjectArray): + """ + TmaStoreFence is used for a multi-stage epilogue buffer. 
+ """ + + def __init__( + self, + num_stages: int = 0, + ): + if num_stages <= 0: + raise ValueError("Mbarrier stage count must be greater than 0.") + + self.num_stages = num_stages + + def arrive(self): + cute.arch.cp_async_bulk_commit_group() + + def wait(self): + cute.arch.cp_async_bulk_wait_group(self.num_stages - 1, read=True) + + # TmaStoreFence doesn't have mbarriers + def get_barrier(self): + assert ( + False + ), "Error: TmaStoreFence doesn't use mbarriers and cannot return a barrier." + + def tail(self): + cute.arch.cp_async_bulk_wait_group(0, read=True) + + +############################################################################## +# PipelineState class +############################################################################## + + +class PipelineUserType(enum.Enum): + Producer = enum.auto() + Consumer = enum.auto() + + +class PipelineState: + """ + Pipeline state contains an index and phase bit corresponding to the current position in the circular buffer. + """ + + def __init__(self, stages: int, count, index, phase): + self._stages = stages + self._count = count + self._index = index + self._phase = phase + + def clone(self) -> "PipelineState": + return PipelineState(self.stages, self._count, self.index, self.phase) + + @property + def index(self) -> Int32: + return self._index + + @property + def count(self) -> Int32: + return self._count + + @property + def stages(self) -> int: + return self._stages + + @property + def phase(self) -> Int32: + return self._phase + + def reset_count(self): + self._count = Int32(0) + + def advance(self): + self._index += 1 + self._count += 1 + + def then_body(index, phase): + new_index = Int32(0) + new_phase = phase ^ 1 + return new_index, new_phase + + def else_body(index, phase): + return index, phase + + self._index, self._phase = if_generate( + self._index == self.stages, + then_body, + else_body, + [self.index, self.phase], + [Int32, Int32], + ) + + def reverse(self): + self._index -= 1 + self._count -= 1 
+ + def then_body(index, phase): + new_index = Int32(self.stages - 1) + new_phase = phase ^ 1 + return new_index, new_phase + + def else_body(index, phase): + return index, phase + + self._index, self._phase = if_generate( + self._index == -1, + then_body, + else_body, + [self.index, self.phase], + [Int32, Int32], + ) + + def __get_mlir_types__(self): + return [self._count.type, self._index.type, self._phase.type] + + def __extract_mlir_values__(self): + count = self._count + index = self._index + phase = self._phase + return [count.ir_value(), index.ir_value(), phase.ir_value()] + + # This can be overridden by derived classes + def __new_from_mlir_values__(self, values): + return PipelineState( + self.stages, Int32(values[0]), Int32(values[1]), Int32(values[2]) + ) + + +def make_pipeline_state(type: PipelineUserType, stages: int): + """ + Creates a pipeline state. Producers are assumed to start with an empty buffer and have a flipped phase bit of 1. + """ + if type is PipelineUserType.Producer: + return PipelineState( + stages, + Int32(0), + Int32(0), + Int32(1), + ) + elif type is PipelineUserType.Consumer: + return PipelineState( + stages, + Int32(0), + Int32(0), + Int32(0), + ) + else: + assert ( + False + ), "Error: invalid PipelineUserType specified for make_pipeline_state." + + +############################################################################## +# Pipeline classes +############################################################################## + + +@dataclass(frozen=True) +class PipelineAsync: + """ + PipelineAsync is a generic pipeline class where both the producer and consumer are + AsyncThreads. It also serves as a base class for specialized pipeline classes. 
+ """ + sync_object_array_full: SyncObjectArray + sync_object_array_empty: SyncObjectArray + num_stages: Int32 + producer_mask: Int32 + consumer_mask: Int32 + + @staticmethod + def _make_sync_object_array( + barrier_storage: cute.Pointer, + num_stages: Int32, + agent: tuple[_PipelineOp, CooperativeGroup], + tx_count: int = 0, + ) -> SyncObjectArray: + """ + Returns a SyncObjectArray corresponding to an agent's PipelineOp. + """ + if agent[0] in [ + _PipelineOp.AsyncThread, + _PipelineOp.TmaLoad, + _PipelineOp.TCGen05Mma, + ]: + return MbarrierArray( + barrier_storage=barrier_storage, + num_stages=num_stages, + agent=agent, + tx_count=tx_count, + ) + elif agent[0] is _PipelineOp.TmaStore: + # Path taken for AsyncTmaStore + return TmaStoreFence(num_stages=num_stages) + else: + assert False, "Error: Invalid PipelineOp specified." + + @staticmethod + def create( + barrier_storage: cute.Pointer, + num_stages: Int32, + producer_group: CooperativeGroup, + consumer_group: CooperativeGroup, + producer_mask: Int32 = None, + consumer_mask: Int32 = None, + ): + """ + This helper function computes any necessary attributes and returns an instance of PipelineAsync. 
+ :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers + :type barrier_storage: cute.Pointer + :param num_stages: Number of buffer stages for this pipeline + :type num_stages: Int32 + :param producer_group: CooperativeGroup for the producer agent + :type producer_group: CooperativeGroup + :param consumer_group: CooperativeGroup for the consumer agent + :type consumer_group: CooperativeGroup + :param producer_mask: Mask for signaling arrives for the producer agent + :type producer_mask: Int32 | None + :param consumer_mask: Mask for signaling arrives for the consumer agent + :type consumer_mask: Int32 | None + """ + producer_type = _PipelineOp.AsyncThread + consumer_type = _PipelineOp.AsyncThread + + producer = (producer_type, producer_group) + consumer = (consumer_type, consumer_group) + + sync_object_array_full = PipelineAsync._make_sync_object_array( + barrier_storage.align(min_align=8), num_stages, producer + ) + sync_object_array_empty = PipelineAsync._make_sync_object_array( + barrier_storage.align(min_align=8) + num_stages, num_stages, consumer + ) + + pipeline_init_wait() + + return PipelineAsync( + sync_object_array_full, + sync_object_array_empty, + num_stages, + producer_mask, + consumer_mask, + ) + + def producer_acquire( + self, state: PipelineState, try_acquire_token: Optional[Boolean] = None + ): + if_generate( + try_acquire_token is None or try_acquire_token == 0, + lambda: self.sync_object_array_empty.wait(state.index, state.phase), + ) + + def producer_try_acquire(self, state: PipelineState): + return self.sync_object_array_empty.try_wait(state.index, state.phase) + + def producer_commit(self, state: PipelineState): + self.sync_object_array_full.arrive(state.index, self.producer_mask) + + def consumer_wait( + self, state: PipelineState, try_wait_token: Optional[Boolean] = None + ): + if_generate( + try_wait_token is None or try_wait_token == 0, + lambda: self.sync_object_array_full.wait(state.index, state.phase), + ) + + 
def consumer_try_wait(self, state: PipelineState): + return self.sync_object_array_full.try_wait(state.index, state.phase) + + def consumer_release(self, state: PipelineState): + self.sync_object_array_empty.arrive(state.index, self.consumer_mask) + + def producer_get_barrier(self, state: PipelineState) -> cute.Pointer: + return self.sync_object_array_full.get_barrier(state.index) + + def producer_tail(self, state: PipelineState): + """ + Make sure the last used buffer empty signal is visible to producer. + Producer tail is usually executed by producer before exit, to avoid dangling + mbarrier arrive signals after kernel exit. + + :param state: The pipeline state that points to next useful buffer + :type state: PipelineState + """ + # Assume state contains that next useful buffer + # So we only need to advance to num_stages - 1 times to last used buffer + for i in range(self.num_stages - 1): + state.advance() + self.producer_acquire(state) + + +@dataclass(frozen=True) +class PipelineTmaAsync(PipelineAsync): + """ + PipelineTmaAsync is used for TMA producers and AsyncThread consumers (e.g. Hopper mainloops). 
+ """ + is_signalling_thread: bool + + @staticmethod + def init_empty_barrier_arrive_signal(cta_layout_vmnk: cute.Layout): + """ + Initialize the empty barrier arrive signal + This function returns the destination cta rank and a boolean indicating if the signalling thread is the same as the current thread + """ + # Logic to optimally schedule Empty Arrives + cluster_shape_mnk = cta_layout_vmnk.shape + tidx, _, _ = cute.arch.thread_idx() + cta_rank_in_cluster = cute.arch.make_warp_uniform( + cute.arch.block_idx_in_cluster() + ) + + is_signalling_thread = tidx < cute.size(cluster_shape_mnk) + dst_rank = tidx % cute.size(cluster_shape_mnk) + m = cluster_shape_mnk[0] + + # Check if same row + is_same_row_l = dst_rank % m + is_same_row_r = cta_rank_in_cluster % m + is_same_row = is_same_row_l == is_same_row_r + + # Check if same column + is_same_col_l = dst_rank // m + is_same_col_r = cta_rank_in_cluster // m + + is_same_col = is_same_col_l == is_same_col_r + + is_same_row_or_col = or_(is_same_row, is_same_col) + is_signalling_thread_final = and_(is_signalling_thread, is_same_row_or_col) + + return dst_rank, is_signalling_thread_final + + @staticmethod + def create( + barrier_storage: cute.Pointer, + num_stages: Int32, + producer_group: CooperativeGroup, + consumer_group: CooperativeGroup, + tx_count: int, + cta_layout_vmnk: Optional[cute.Layout] = None, + ): + """ + This helper function computes any necessary attributes and returns an instance of PipelineTmaAsync. 
+ :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers + :type barrier_storage: cute.Pointer + :param num_stages: Number of buffer stages for this pipeline + :type num_stages: Int32 + :param producer_group: CooperativeGroup for the producer agent + :type producer_group: CooperativeGroup + :param consumer_group: CooperativeGroup for the consumer agent + :type consumer_group: CooperativeGroup + :param tx_count: Number of bytes expected to be written to the transaction barrier for one stage + :type tx_count: int + :param cta_layout_vmnk: Layout of the cluster shape + :type cta_layout_vmnk: cute.Layout | None + """ + producer_type = _PipelineOp.TmaLoad + consumer_type = _PipelineOp.AsyncThread + + producer = (producer_type, producer_group) + consumer = (consumer_type, consumer_group) + + sync_object_array_full = PipelineAsync._make_sync_object_array( + barrier_storage.align(min_align=8), num_stages, producer, tx_count + ) + sync_object_array_empty = PipelineAsync._make_sync_object_array( + barrier_storage.align(min_align=8) + num_stages, num_stages, consumer + ) + + dst_rank, is_signalling_thread = ( + PipelineTmaAsync.init_empty_barrier_arrive_signal(cta_layout_vmnk) + ) + if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1: + dst_rank = None + else: + dst_rank = dst_rank + + is_signalling_thread = is_signalling_thread + producer_mask = None + + pipeline_init_wait(cta_layout_vmnk) + + return PipelineTmaAsync( + sync_object_array_full, + sync_object_array_empty, + num_stages, + producer_mask, + dst_rank, + is_signalling_thread, + ) + + def producer_acquire( + self, state: PipelineState, try_acquire_token: Optional[Boolean] = None + ): + """ + TMA producer commit conditionally waits on buffer empty and sets the transaction barrier for leader threadblocks. 
+ """ + if_generate( + try_acquire_token is None or try_acquire_token == 0, + lambda: self.sync_object_array_empty.wait(state.index, state.phase), + ) + self.sync_object_array_full.arrive(state.index, self.producer_mask) + + + def producer_commit(self, state: PipelineState): + """ + TMA producer commit is a NOP. The transaction barrier signals the commit upon completion of the TMA. + """ + pass + + def consumer_release(self, state: PipelineState): + """ + TMA consumer release conditionally signals the empty buffer to the producer. + """ + if_generate( + self.is_signalling_thread, + lambda: self.sync_object_array_empty.arrive( + state.index, self.consumer_mask + ), + ) + +@dataclass(frozen=True) +class PipelineTmaUmma(PipelineAsync): + """ + PipelineTmaUmma is used for TMA producers and UMMA consumers (e.g. Blackwell mainloops). + """ + is_leader_cta: bool + + @staticmethod + def _compute_mcast_arrival_mask(cta_layout_vmnk: cute.Layout): + """ + Computes a mask for signaling arrivals to multicasting threadblocks. 
+ """ + cta_rank_in_cluster = cute.arch.make_warp_uniform( + cute.arch.block_idx_in_cluster() + ) + cta_in_cluster_coord_vmnk = cta_layout_vmnk.get_flat_coord(cta_rank_in_cluster) + + tma_mcast_mask_a = cute.nvgpu.cpasync.create_tma_multicast_mask( + cta_layout_vmnk, cta_in_cluster_coord_vmnk, mcast_mode=2 + ) + tma_mcast_mask_b = cute.nvgpu.cpasync.create_tma_multicast_mask( + cta_layout_vmnk, cta_in_cluster_coord_vmnk, mcast_mode=1 + ) + + block_in_cluster_coord_vmnk_peer = ( + cta_in_cluster_coord_vmnk[0] ^ 1, + *cta_in_cluster_coord_vmnk[1:], + ) + tma_mcast_mask_a_peer = cute.nvgpu.cpasync.create_tma_multicast_mask( + cta_layout_vmnk, block_in_cluster_coord_vmnk_peer, mcast_mode=2 + ) + tma_mcast_mask_b_peer = cute.nvgpu.cpasync.create_tma_multicast_mask( + cta_layout_vmnk, block_in_cluster_coord_vmnk_peer, mcast_mode=1 + ) + return ( + tma_mcast_mask_a + | tma_mcast_mask_b + | tma_mcast_mask_a_peer + | tma_mcast_mask_b_peer + ) + + @staticmethod + def _compute_is_leader_cta(cta_layout_vmnk: cute.Layout): + """ + Computes leader threadblocks for 2CTA kernels. For 1CTA, all threadblocks are leaders. + """ + bidx, bidy, _ = cute.arch.block_idx() + + mma_coord_vmnk = ( + bidx % cute.size(cta_layout_vmnk, mode=[0]), + bidx // cute.size(cta_layout_vmnk, mode=[0]), + bidy, + None, + ) + return mma_coord_vmnk[0] == 0 + + @staticmethod + def create( + barrier_storage: cute.Pointer, + num_stages: Int32, + producer_group: CooperativeGroup, + consumer_group: CooperativeGroup, + tx_count: int, + cta_layout_vmnk: Optional[cute.Layout] = None, + ): + """ + This helper function computes any necessary attributes and returns an instance of PipelineTmaUmma. 
+ :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers + :type barrier_storage: cute.Pointer + :param num_stages: Number of buffer stages for this pipeline + :type num_stages: Int32 + :param producer_group: CooperativeGroup for the producer agent + :type producer_group: CooperativeGroup + :param consumer_group: CooperativeGroup for the consumer agent + :type consumer_group: CooperativeGroup + :param tx_count: Number of bytes expected to be written to the transaction barrier for one stage + :type tx_count: int + :param cta_layout_vmnk: Layout of the cluster shape + :type cta_layout_vmnk: cute.Layout | None + """ + producer_type = _PipelineOp.TmaLoad + consumer_type = _PipelineOp.TCGen05Mma + + producer = (producer_type, producer_group) + consumer = (consumer_type, consumer_group) + + sync_object_array_full = PipelineAsync._make_sync_object_array( + barrier_storage.align(min_align=8), num_stages, producer, tx_count + ) + sync_object_array_empty = PipelineAsync._make_sync_object_array( + barrier_storage.align(min_align=8) + num_stages, num_stages, consumer + ) + + if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1: + # No mcast mask if not using clusters + producer_mask = None + # All threadblocks are leaders if not using clusters + is_leader_cta = True + else: + producer_mask = PipelineTmaUmma._compute_mcast_arrival_mask(cta_layout_vmnk) + is_leader_cta = PipelineTmaUmma._compute_is_leader_cta(cta_layout_vmnk) + + consumer_mask = producer_mask + + pipeline_init_wait(cta_layout_vmnk) + + return PipelineTmaUmma( + sync_object_array_full, + sync_object_array_empty, + num_stages, + producer_mask, + consumer_mask, + is_leader_cta, + ) + + def producer_acquire( + self, state: PipelineState, try_acquire_token: Optional[Boolean] = None + ): + """ + TMA producer commit conditionally waits on buffer empty and sets the transaction barrier for leader threadblocks. 
+ """ + if_generate( + try_acquire_token is None or try_acquire_token == 0, + lambda: self.sync_object_array_empty.wait(state.index, state.phase), + ) + if_generate( + self.is_leader_cta, + lambda: self.sync_object_array_full.arrive(state.index, self.producer_mask), + ) + + def producer_commit(self, state: PipelineState): + """ + TMA producer commit is a NOP. The transaction barrier signals the commit upon completion of the TMA. + """ + pass + + +@dataclass(frozen=True) +class PipelineUmmaAsync(PipelineAsync): + """ + PipelineTmaUmma is used for UMMA producers and AsyncThread consumers (e.g. Blackwell accumulator pipelines). + """ + + @staticmethod + def _compute_tmem_sync_mask(cta_layout_vmnk: cute.Layout): + """ + Computes a mask to signal completion of tmem buffers for 2CTA kernels. + """ + cta_rank_in_cluster = cute.arch.make_warp_uniform( + cute.arch.block_idx_in_cluster() + ) + cta_in_cluster_coord_vmnk = cta_layout_vmnk.get_flat_coord(cta_rank_in_cluster) + return cute.make_layout_image_mask( + cta_layout_vmnk, cta_in_cluster_coord_vmnk, mode=0 + ) + + @staticmethod + def _compute_peer_cta_rank(): + """ + Computes a mask to signal release of tmem buffers for 2CTA kernels. + """ + cta_rank_in_cluster = cute.arch.make_warp_uniform( + cute.arch.block_idx_in_cluster() + ) + return cta_rank_in_cluster // 2 * 2 + + @staticmethod + def create( + barrier_storage: cute.Pointer, + num_stages: Int32, + producer_group: CooperativeGroup, + consumer_group: CooperativeGroup, + cta_layout_vmnk: Optional[cute.Layout] = None, + ): + """ + This helper function computes any necessary attributes and returns an instance of PipelineUmmaAsync. 
+ :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers + :type barrier_storage: cute.Pointer + :param num_stages: Number of buffer stages for this pipeline + :type num_stages: Int32 + :param producer_group: CooperativeGroup for the producer agent + :type producer_group: CooperativeGroup + :param consumer_group: CooperativeGroup for the consumer agent + :type consumer_group: CooperativeGroup + :param cta_layout_vmnk: Layout of the cluster shape + :type cta_layout_vmnk: cute.Layout | None + """ + producer_type = _PipelineOp.TCGen05Mma + consumer_type = _PipelineOp.AsyncThread + + producer = (producer_type, producer_group) + consumer = (consumer_type, consumer_group) + + sync_object_array_full = PipelineAsync._make_sync_object_array( + barrier_storage.align(min_align=8), num_stages, producer + ) + sync_object_array_empty = PipelineAsync._make_sync_object_array( + barrier_storage.align(min_align=8) + num_stages, num_stages, consumer + ) + + if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1: + # Set mask to None if not using clusters (i.e. 1CTA kernels) + producer_mask = None + else: + producer_mask = PipelineUmmaAsync._compute_tmem_sync_mask(cta_layout_vmnk) + + if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, mode=[0]) == 1: + # Set mask to None if not using 2CTA intructions + consumer_mask = None + else: + consumer_mask = PipelineUmmaAsync._compute_peer_cta_rank() + + pipeline_init_wait(cta_layout_vmnk) + + return PipelineUmmaAsync( + sync_object_array_full, + sync_object_array_empty, + num_stages, + producer_mask, + consumer_mask, + ) + + def producer_tail(self, state: PipelineState): + """ + Make sure the last used buffer empty signal is visible to producer. + Producer tail is usually executed by producer before exit, to avoid dangling + mbarrier arrive signals after kernel exit. 
+ + :param state: The pipeline state that points to next useful buffer + :type state: PipelineState + """ + cta_rank_in_cluster = cute.arch.make_warp_uniform( + cute.arch.block_idx_in_cluster() + ) + is_leader_cta = cta_rank_in_cluster % 2 == 0 + + def then_body(): + # Assume state contains that next useful buffer + # So we only need to advance to num_stages - 1 times to last used buffer + for i in range(self.num_stages - 1): + state.advance() + self.producer_acquire(state) + + if_generate(is_leader_cta, then_body) + + +@dataclass(frozen=True) +class PipelineTmaStore(PipelineAsync): + """ + PipelineTmaStore is used for synchronizing TMA stores in the epilogue. It does not use mbarriers. + """ + + @staticmethod + def create( + num_stages: Int32, + producer_group: CooperativeGroup, + ): + """ + This helper function computes any necessary attributes and returns an instance of PipelineTmaStore. + :param num_stages: Number of buffer stages for this pipeline + :type num_stages: Int32 + :param producer_group: CooperativeGroup for the producer agent + :type producer_group: CooperativeGroup + """ + producer_type = _PipelineOp.TmaStore + + producer = (producer_type, producer_group) + + sync_object_array_full = PipelineAsync._make_sync_object_array( + None, num_stages, producer + ) + + return PipelineTmaStore(sync_object_array_full, None, num_stages, None, None) + + def producer_acquire(self): + self.sync_object_array_full.wait() + + def producer_commit(self): + self.sync_object_array_full.arrive() + + def consumer_wait(self): + assert False, "Error: PipelineTmaStore does not have a consumer agent." + + def consumer_release(self): + assert False, "Error: PipelineTmaStore does not have a consumer agent." 
+ + def producer_tail(self): + self.sync_object_array_full.tail() + + +############################################################################## +# Helper functions +############################################################################## + + +def pipeline_init_wait(cta_layout_vmnk: Optional[cute.Layout] = None): + """ + Fences the mbarrier init and syncs the threadblock or cluster + """ + cute.arch.mbarrier_init_fence() + + if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1: + # If not using clusters, sync the threadblock + _sync(Agent.ThreadBlock) + else: + # If using clusters, sync the cluster + _sync(Agent.ThreadBlockCluster) + + +def _sync(group: Agent): + """ + Syncs all threads within an agent. + """ + if group is Agent.Thread: + assert False, "Error: Not supported." + elif group is Agent.ThreadBlock: + cute.arch.sync_threads() + elif group is Agent.ThreadBlockCluster: + cute.arch.cluster_arrive() + cute.arch.cluster_wait() + else: + assert ( + False + ), "Error: No explicit sync instruction exists. Please use barriers (named / mbarrier) instead." + + +def _mbarrier_i64_to_ptr(val: Int64) -> cute.Pointer: + """ + Converts a smem pointer of type Int64 to cute.Pointer with 8B alignment + """ + return cute.make_ptr( + Int64, + val.ir_value(), + mem_space=_cute_ir.AddressSpace.smem, + assumed_align=8, + ) diff --git a/python/CuTeDSL/cutlass/utils/smem_allocator.py b/python/CuTeDSL/cutlass/utils/smem_allocator.py new file mode 100644 index 00000000..3e3a4020 --- /dev/null +++ b/python/CuTeDSL/cutlass/utils/smem_allocator.py @@ -0,0 +1,217 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from typing import Type, Union, overload + +from cutlass.cutlass_dsl import Int8, Numeric, NumericMeta + +import cutlass.cute as cute +from cutlass.cute.arch import get_dyn_smem + + +class SmemAllocator: + """ + A class for managing shared memory allocation on GPU. + + This class manages a chunk of shared memory and provide APIs for sub-allocation + inside the chunk. + + Attributes + ---------- + _base : cute.Pointer as i8 typed dynamic value + The current base address of the shared memory. + + _allocated_bytes: + The bytes allocated in shared memory. + + Methods + ------- + allocate(num_bytes, alignment) + Allocates num_bytes in the shared memory with the given byte alignment. + + allocate_value(value_ty, num_elems) + Allocates num_elems of value_ty values in the shared memory. + + allocate_tensor(value_ty, layout, alignment) + Allocates a tensor in the shared memory with given layout and byte alignment. + + Notes + ----- + This class is responsible for managing the allocation of tensors in shared memory. + """ + + def __init__(self): + """ + Initializes the SmemAllocator instance with dynamic smem base ptr, + which is i8 type and aligned to 1024. + + """ + self._base = get_dyn_smem(Int8, alignment=1024) + self._allocated_bytes = 0 + + @overload + def allocate(self, size_or_type: int, byte_alignment: int): ... + + @overload + def allocate(self, size_or_type: cute.struct, byte_alignment: int): ... 
+ + def allocate(self, size_or_type, byte_alignment: int = 1) -> int: + """ + Allocates a block of memory with the specified size and byte alignment. + + This method adjusts the base cute.Pointer to ensure that the allocated memory + is aligned according to the specified byte alignment. It updates the internal + state to reflect the new base cute.Pointer and the total allocated bytes. + + Parameters + ---------- + size_or_type : int or struct + The number of bytes to allocate or struct class. + byte_alignment : int + The byte alignment requirement for the allocation. Defaults to 1 (no alignment). + + Returns + ---------- + A cute.Pointer to the start of the allocated memory block or struct instance. + + Raises + ---------- + ValueError + If num_bytes is negative or if byte_alignmemt is less than 1. + """ + + if isinstance(size_or_type, cute.struct): + alignment = max(byte_alignment, size_or_type.__alignof__()) + base_ptr = self.allocate(size_or_type.__sizeof__(), alignment) + return size_or_type(base_ptr) + + num_bytes = size_or_type + if num_bytes < 0: + raise ValueError("num_bytes must be non-negative") + if byte_alignment < 1: + raise ValueError("byte_alignment must be at least 1") + + self._base = self._base.align(byte_alignment) + ptr = self._base + self._base += num_bytes + if self._allocated_bytes % byte_alignment != 0: + self._allocated_bytes += ( + byte_alignment - self._allocated_bytes % byte_alignment + ) + self._allocated_bytes += num_bytes + return ptr + + def allocate_array(self, element_type: Type[Numeric], num_elems: int = 1): + """ + Allocates num_elems values of element_type in shared memory. + + This method calls allocate() to return a byte ptr, pointing to start of shared + memory. Then calls cute.recast_ptr() to recast this byte cute.Pointer to element_type. + + Parameters + ---------- + element_type : Type[Numeric] + The type of the values in the tensor. + num_elems : int, optional + The number of elements for each allocation. Defaults to 1. 
+ + Returns + ---------- + A value_type cute.Pointer to the start of the allocated memory block. + + Raises + ---------- + ValueError + If num_elems is less than 1. + """ + if num_elems < 1: + raise ValueError("num_elems must be at least 1") + if not isinstance(element_type, NumericMeta): + raise TypeError( + f"value_ty must be a type of Numeric, but got {element_type}" + ) + + ptr = self.allocate( + element_type.width // 8 * num_elems, element_type.width // 8 + ) + + return cute.recast_ptr(ptr, dtype=element_type) + + def allocate_tensor( + self, + element_type: Type[Numeric], + layout: Union[int, cute.Layout, cute.ComposedLayout], + byte_alignment: int = 1, + swizzle: cute.Swizzle = None, + ): + """ + Allocates a tensor in the shared memory with value type, layout and byte alignment. + + Parameters + ---------- + element_type : Type[Numeric] + The type of the values in the tensor. + layout : int | DynamicInt | cute.Layout | cute.ComposedLayout + The layout of the tensor. + byte_alignment : int, optional + The byte alignment requirement for the allocation. Defaults to 1 (no alignment). + swizzle : cute.Swizzle + A swizzle for the iterator (for position-dependent swizzling). + + Returns + ------- + tensor : cute.Tensor + The allocated tensor with specified value type, layout and byte alignment. + + Notes + ----- + The base address is updated to point to the next available memory location. 
+ """ + if not isinstance(element_type, NumericMeta): + raise TypeError( + f"value_ty must be a type of Numeric, but got {element_type}" + ) + + if ( + isinstance(layout, cute.ComposedLayout) + and isinstance(layout.inner, cute.Swizzle) + ) and (swizzle is not None): + raise TypeError( + f"iterator swizzle with swizzle layout is currently not supported" + ) + + if isinstance(layout, int): + layout = cute.make_layout(layout) + + profile = layout(0) + if isinstance(profile, tuple): + raise TypeError( + f"cannot allocate a shared memory tensor with a non-integer iterator" + ) + + if not cute.is_static(layout.type): + raise NotImplementedError(f"dynamic layout is not supported: {layout.type}") + + # At least align the allocation to the natural alignment given by the element type + if element_type.width // 8 > byte_alignment: + byte_alignment = element_type.width // 8 + + # Relevant only for sub-byte data types: verify that the entire allocation is byte-aligned + cosize_in_bits = cute.cosize(layout) * element_type.width + assert isinstance(cosize_in_bits, int) + if cosize_in_bits % 8 != 0: + raise ValueError("invalid allocation that is not byte-aligned") + + num_bytes = cosize_in_bits // 8 + ptr = self.allocate(num_bytes, byte_alignment) + ptr = cute.recast_ptr(ptr, swizzle, dtype=element_type) + res = cute.make_tensor(ptr, layout) + return res diff --git a/python/CuTeDSL/cutlass/utils/static_persistent_tile_scheduler.py b/python/CuTeDSL/cutlass/utils/static_persistent_tile_scheduler.py new file mode 100644 index 00000000..1a4d13de --- /dev/null +++ b/python/CuTeDSL/cutlass/utils/static_persistent_tile_scheduler.py @@ -0,0 +1,384 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.

from typing import Tuple

from cutlass.cutlass_dsl import (
    Boolean,
    Integer,
    Int32,
    min,
    extract_mlir_values,
    new_from_mlir_values,
    dsl_user_op,
)
from cutlass._mlir import ir
import cutlass.cute as cute

##############################################################################
# Static persistent tile scheduler
##############################################################################


class WorkTileInfo:
    """A class to represent information about a work tile.

    :ivar tile_idx: The index of the tile.
    :type tile_idx: cute.Coord
    :ivar is_valid_tile: Whether the tile is valid.
    :type is_valid_tile: Boolean
    """

    def __init__(self, tile_idx: cute.Coord, is_valid_tile: Boolean):
        self._tile_idx = tile_idx
        self._is_valid_tile = Boolean(is_valid_tile)

    def __extract_mlir_values__(self) -> list[ir.Value]:
        # Flattened as: 3 coordinate values followed by the validity flag.
        values = extract_mlir_values(self.tile_idx)
        values.extend(extract_mlir_values(self.is_valid_tile))
        return values

    def __new_from_mlir_values__(self, values: list[ir.Value]) -> "WorkTileInfo":
        # Mirror of __extract_mlir_values__: expects 3 coord values + 1 flag.
        assert len(values) == 4
        new_tile_idx = new_from_mlir_values(self._tile_idx, values[:-1])
        new_is_valid_tile = new_from_mlir_values(self._is_valid_tile, [values[-1]])
        return WorkTileInfo(new_tile_idx, new_is_valid_tile)

    @property
    def is_valid_tile(self) -> Boolean:
        """Check whether the latest tile returned by the scheduler is valid. Any
        scheduling requests after all tasks have completed will return an invalid tile.

        :return: The validity of the tile.
        :rtype: Boolean
        """
        return self._is_valid_tile

    @property
    def tile_idx(self) -> cute.Coord:
        """
        Get the index of the tile.

        :return: The index of the tile.
        :rtype: cute.Coord
        """
        return self._tile_idx


class PersistentTileSchedulerParams:
    """A class to represent parameters for a persistent tile scheduler.

    This class is designed to manage and compute the layout of clusters and tiles
    in a batched gemm problem.

    :ivar cluster_shape_mn: Shape of the cluster in (m, n) dimensions (K dimension cta count must be 1).
    :type cluster_shape_mn: tuple
    :ivar problem_layout_ncluster_mnl: Layout of the problem in terms of
        number of clusters in (m, n, l) dimensions.
    :type problem_layout_ncluster_mnl: cute.Layout
    """

    def __init__(
        self,
        problem_shape_ntile_mnl: cute.Shape,
        cluster_shape_mnk: cute.Shape,
        *,
        loc=None,
        ip=None,
    ):
        """
        Initializes the PersistentTileSchedulerParams with the given parameters.

        :param problem_shape_ntile_mnl: The shape of the problem in terms of
            number of CTA (Cooperative Thread Array) in (m, n, l) dimensions.
        :type problem_shape_ntile_mnl: cute.Shape
        :param cluster_shape_mnk: The shape of the cluster in (m, n, k) dimensions.
        :type cluster_shape_mnk: cute.Shape

        :raises ValueError: If cluster_shape_k is not 1.
        """

        if cluster_shape_mnk[2] != 1:
            raise ValueError(f"unsupported cluster_shape_k {cluster_shape_mnk[2]}")

        self.problem_shape_ntile_mnl = problem_shape_ntile_mnl
        # cluster_shape_mnk is kept for reconstruction
        self._cluster_shape_mnk = cluster_shape_mnk
        self.cluster_shape_mn = cluster_shape_mnk[:2]
        self._loc = loc

        # By default, we follow m major (col-major) raster order, so make a col-major layout
        self.problem_layout_ncluster_mnl = cute.make_layout(
            cute.ceil_div(
                self.problem_shape_ntile_mnl, cluster_shape_mnk[:2], loc=loc, ip=ip
            ),
            loc=loc,
            ip=ip,
        )

    def __extract_mlir_values__(self):
        # Record how many values each member contributed so that
        # __new_from_mlir_values__ can slice them back out.
        values, self._values_pos = [], []
        for obj in [self.problem_shape_ntile_mnl, self._cluster_shape_mnk]:
            obj_values = extract_mlir_values(obj)
            values += obj_values
            self._values_pos.append(len(obj_values))
        return values

    def __new_from_mlir_values__(self, values):
        obj_list = []
        for obj, n_items in zip(
            [self.problem_shape_ntile_mnl, self._cluster_shape_mnk], self._values_pos
        ):
            obj_list.append(new_from_mlir_values(obj, values[:n_items]))
            values = values[n_items:]
        return PersistentTileSchedulerParams(*(tuple(obj_list)), loc=self._loc)

    @dsl_user_op
    def get_grid_shape(
        self, max_active_clusters: Int32, *, loc=None, ip=None
    ) -> Tuple[Integer, Integer, Integer]:
        """
        Computes the grid shape based on the maximum active clusters allowed.

        :param max_active_clusters: The maximum number of active clusters that
            can run in one wave.
        :type max_active_clusters: Int32

        :return: A tuple containing the grid shape in (m, n, persistent_clusters).
            - m: self.cluster_shape_m.
            - n: self.cluster_shape_n.
            - persistent_clusters: Number of persistent clusters that can run.
        """

        # Total ctas in problem size
        num_ctas_mnl = tuple(
            x * y
            for x, y in zip(
                self.problem_layout_ncluster_mnl.shape, self.cluster_shape_mn
            )
        ) + (self.problem_layout_ncluster_mnl.shape[2],)

        num_ctas_in_problem = cute.size(num_ctas_mnl, loc=loc, ip=ip)

        num_ctas_per_cluster = cute.size(self.cluster_shape_mn, loc=loc, ip=ip)
        # Total ctas that can run in one wave
        num_ctas_per_wave = max_active_clusters * num_ctas_per_cluster

        # Cap the launch at one full wave so CTAs persist across tiles.
        num_persistent_ctas = min(num_ctas_in_problem, num_ctas_per_wave)
        num_persistent_clusters = num_persistent_ctas // num_ctas_per_cluster

        return (*self.cluster_shape_mn, num_persistent_clusters)


class StaticPersistentTileScheduler:
    """A scheduler for static persistent tile execution in CUTLASS/CuTe kernels.

    :ivar params: Tile schedule related params, including cluster shape and problem_layout_ncluster_mnl
    :type params: PersistentTileSchedulerParams
    :ivar num_persistent_clusters: Number of persistent clusters that can be launched
    :type num_persistent_clusters: Int32
    :ivar cta_id_in_cluster: ID of the CTA within its cluster
    :type cta_id_in_cluster: cute.Coord
    :ivar _num_tiles_executed: Counter for executed tiles
    :type _num_tiles_executed: Int32
    :ivar _current_work_linear_idx: Current cluster index
    :type _current_work_linear_idx: Int32
    """

    def __init__(
        self,
        params: PersistentTileSchedulerParams,
        num_persistent_clusters: Int32,
        current_work_linear_idx: Int32,
        cta_id_in_cluster: cute.Coord,
        num_tiles_executed: Int32,
    ):
        """
        Initializes the StaticPersistentTileScheduler with the given parameters.

        :param params: Tile schedule related params, including cluster shape and problem_layout_ncluster_mnl.
        :type params: PersistentTileSchedulerParams
        :param num_persistent_clusters: Number of persistent clusters that can be launched.
        :type num_persistent_clusters: Int32
        :param current_work_linear_idx: Current cluster index.
        :type current_work_linear_idx: Int32
        :param cta_id_in_cluster: ID of the CTA within its cluster.
        :type cta_id_in_cluster: cute.Coord
        :param num_tiles_executed: Counter for executed tiles.
        :type num_tiles_executed: Int32
        """
        self.params = params
        self.num_persistent_clusters = num_persistent_clusters
        self._current_work_linear_idx = current_work_linear_idx
        self.cta_id_in_cluster = cta_id_in_cluster
        self._num_tiles_executed = num_tiles_executed

    def __extract_mlir_values__(self) -> list[ir.Value]:
        # Flattened as: 1 cluster count, 1 linear idx, 3 CTA coords, 1 tile counter.
        values = extract_mlir_values(self.num_persistent_clusters)
        values.extend(extract_mlir_values(self._current_work_linear_idx))
        values.extend(extract_mlir_values(self.cta_id_in_cluster))
        values.extend(extract_mlir_values(self._num_tiles_executed))
        return values

    def __new_from_mlir_values__(
        self, values: list[ir.Value]
    ) -> "StaticPersistentTileScheduler":
        assert len(values) == 6
        new_num_persistent_clusters = new_from_mlir_values(
            self.num_persistent_clusters, [values[0]]
        )
        new_current_work_linear_idx = new_from_mlir_values(
            self._current_work_linear_idx, [values[1]]
        )
        new_cta_id_in_cluster = new_from_mlir_values(
            self.cta_id_in_cluster, values[2:5]
        )
        new_num_tiles_executed = new_from_mlir_values(
            self._num_tiles_executed, [values[5]]
        )
        return StaticPersistentTileScheduler(
            self.params,
            new_num_persistent_clusters,
            new_current_work_linear_idx,
            new_cta_id_in_cluster,
            new_num_tiles_executed,
        )

    # called by host
    @dsl_user_op
    @staticmethod
    def create(
        params: PersistentTileSchedulerParams,
        block_idx: Tuple[Integer, Integer, Integer],
        grid_dim: Tuple[Integer, Integer, Integer],
        *,
        loc=None,
        ip=None,
    ):
        """Initialize the static persistent tile scheduler.

        :param params: Parameters for the persistent
            tile scheduler.
        :type params: PersistentTileSchedulerParams
        :param block_idx: The 3d block index in the format (bidx, bidy, bidz).
        :type block_idx: Tuple[Integer, Integer, Integer]
        :param grid_dim: The 3d grid dimensions for kernel launch.
        :type grid_dim: Tuple[Integer, Integer, Integer]

        :return: A StaticPersistentTileScheduler object.
        :rtype: StaticPersistentTileScheduler
        """
        params = params

        # Calculate the number of persistent clusters by dividing the total grid size
        # by the number of CTAs per cluster
        num_persistent_clusters = cute.size(grid_dim, loc=loc, ip=ip) // cute.size(
            params.cluster_shape_mn, loc=loc, ip=ip
        )

        bidx, bidy, bidz = block_idx

        # Initialize the workload index to the cluster index in the grid
        current_work_linear_idx = Int32(bidz)

        # CTA id in the cluster
        cta_id_in_cluster = (
            Int32(bidx % params.cluster_shape_mn[0]),
            Int32(bidy % params.cluster_shape_mn[1]),
            Int32(0),
        )
        # Initialize number of tiles executed to zero
        num_tiles_executed = Int32(0)
        return StaticPersistentTileScheduler(
            params,
            num_persistent_clusters,
            current_work_linear_idx,
            cta_id_in_cluster,
            num_tiles_executed,
        )

    # called by host
    @staticmethod
    def get_grid_shape(
        params: PersistentTileSchedulerParams,
        max_active_clusters: Int32,
        *,
        loc=None,
        ip=None,
    ) -> Tuple[Integer, Integer, Integer]:
        """Calculates the grid shape to be launched on GPU using problem shape,
        threadblock shape, and active cluster size.

        :param params: Parameters for grid shape calculation.
        :type params: PersistentTileSchedulerParams
        :param max_active_clusters: Maximum active clusters allowed.
        :type max_active_clusters: Int32

        :return: The calculated 3d grid shape.
        :rtype: Tuple[Integer, Integer, Integer]
        """

        return params.get_grid_shape(max_active_clusters, loc=loc, ip=ip)

    # private method
    def _get_current_work_for_linear_idx(
        self, current_work_linear_idx: Int32, *, loc=None, ip=None
    ) -> WorkTileInfo:
        """Compute current tile coord given current_work_linear_idx and cta_id_in_cluster.

        :param current_work_linear_idx: The linear index of the current work.
        :type current_work_linear_idx: Int32

        :return: An object containing information about the current tile coordinates
            and validity status.
        :rtype: WorkTileInfo
        """

        # The tile is valid only while the linear index stays within the problem.
        is_valid = current_work_linear_idx < cute.size(
            self.params.problem_layout_ncluster_mnl, loc=loc, ip=ip
        )

        cur_cluster_coord = self.params.problem_layout_ncluster_mnl.get_hier_coord(
            current_work_linear_idx, loc=loc, ip=ip
        )

        # cur_tile_coord is a tuple of i32 values:
        # cluster coord scaled by cluster shape, plus the CTA offset in the cluster.
        cur_tile_coord = tuple(
            Int32(x) * Int32(z) + Int32(y)
            for x, y, z in zip(
                cur_cluster_coord,
                self.cta_id_in_cluster,
                (*self.params.cluster_shape_mn, Int32(1)),
            )
        )

        return WorkTileInfo(cur_tile_coord, is_valid)

    @dsl_user_op
    def get_current_work(self, *, loc=None, ip=None) -> WorkTileInfo:
        return self._get_current_work_for_linear_idx(
            self._current_work_linear_idx, loc=loc, ip=ip
        )

    @dsl_user_op
    def initial_work_tile_info(self, *, loc=None, ip=None) -> WorkTileInfo:
        return self.get_current_work(loc=loc, ip=ip)

    @dsl_user_op
    def advance_to_next_work(self, *, advance_count: int = 1, loc=None, ip=None):
        # Stride by the number of persistent clusters: each cluster sweeps the
        # problem with that stride so the full cluster grid covers all tiles.
        self._current_work_linear_idx += Int32(advance_count) * Int32(
            self.num_persistent_clusters
        )
        self._num_tiles_executed += Int32(1)

    @property
    def num_tiles_executed(self) -> Int32:
        return self._num_tiles_executed
diff --git a/python/CuTeDSL/cutlass/utils/tensormap_manager.py b/python/CuTeDSL/cutlass/utils/tensormap_manager.py
new file mode 100644
index 00000000..c6369c20
--- /dev/null
+++ b/python/CuTeDSL/cutlass/utils/tensormap_manager.py
@@ -0,0 +1,140 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.

from dataclasses import dataclass
from enum import Enum, auto
from typing import Tuple

from cutlass.cutlass_dsl import const_expr

import cutlass._mlir.dialects.cute as _cute_ir
import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir

import cutlass.cute as cute


class TensorMapUpdateMode(Enum):
    """
    Enum class defining tensor map update modes.

    Modes:
        GMEM: Update tensormap in global memory
        SMEM: Load tensormap from global memory to shared memory,
            update it in shared memory, then store back to global memory
    """

    GMEM = auto()  # Update tensormap in global memory
    SMEM = auto()  # Update tensormap in shared memory


@dataclass(frozen=True)
class TensorMapManager:
    """
    Manages TensorMap operations including initialization and updates.
    Provides utilities to convert a tensormap pointer across different memory spaces.
    """

    tensormap_update_mode: TensorMapUpdateMode
    bytes_per_tensormap: int

    # convert given cute.Pointer or cutlass.Int64 to a cute.Pointer to tensormap.
    # address_space: the address space of the resulting tensormap pointer.
    # It could be generic or gmem
    def get_tensormap_ptr(
        self,
        ptr: cute.Pointer,
        address_space=_cute_ir.AddressSpace.gmem,
    ) -> cute.Pointer:
        if address_space not in [
            _cute_ir.AddressSpace.gmem,
            _cute_ir.AddressSpace.generic,
        ]:
            raise ValueError(f"Invalid address space: {address_space} for tensormap")

        # Round-trip through an integer so we can attach the alignment assumption
        # (bytes_per_tensormap) before casting to a TMA-descriptor pointer type.
        gmem_ptr_i64 = ptr.toint().ir_value()
        gmem_ptr_i64_align_ty = _cute_ir.ConstrainedIntType.get(
            self.bytes_per_tensormap, gmem_ptr_i64.type.width
        )
        gmem_ptr_i64_align = _cute_ir.assume(gmem_ptr_i64_align_ty, gmem_ptr_i64)
        gmem_ptr_ty = _cute_ir.PtrType.get(
            _cute_nvgpu_ir.TmaDescriptorTiledType.get(),
            address_space,
            self.bytes_per_tensormap,
        )
        return _cute_ir.inttoptr(gmem_ptr_ty, gmem_ptr_i64_align)

    # init tensormap pointed by dst_ptr with the one inside copy_atom.
    # dst_ptr should be pointing to a global memory location or a smem location
    # warp_id specifies which warp to perform the initialization
    @cute.jit
    def init_tensormap_from_atom(
        self, copy_atom: cute.CopyAtom, dst_ptr: cute.Pointer, warp_id: int
    ) -> None:
        warp_idx = cute.arch.warp_idx()
        warp_idx = cute.arch.make_warp_uniform(warp_idx)
        if warp_idx == warp_id:
            # Only one thread of the chosen warp performs the copy.
            with cute.arch.elect_one():
                cute.nvgpu.cpasync.copy_tensormap(copy_atom, dst_ptr)
        cute.arch.sync_warp()
        return

    # Perform a fence operation to ensure previous `init_tensormap_from_atom` calls have been completed
    def fence_tensormap_initialization(
        self,
    ) -> None:
        if self.tensormap_update_mode == TensorMapUpdateMode.GMEM:
            cute.arch.fence_acq_rel_cta()
        return

    # Perform a fence operation to ensure previous `update_tensormap` calls have been completed
    def fence_tensormap_update(
        self,
        tensormap_ptr: cute.Pointer,
    ) -> None:
        cute.nvgpu.cpasync.fence_tma_desc_acquire(tensormap_ptr)
        return

    @cute.jit
    def update_tensormap(
        self,
        tensor_gmem: Tuple[cute.Tensor, ...],
        tma_copy_atom: Tuple[cute.CopyAtom, ...],
        tensormap_gmem_ptr: Tuple[cute.Pointer, ...],
        warp_id: int,
        tensormap_smem_ptr: Tuple[cute.Pointer, ...],
    ) -> None:
        warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
        # updates before touching tensormap in global memory
        if warp_idx == warp_id:
            if const_expr(self.tensormap_update_mode == TensorMapUpdateMode.SMEM):
                for copy_atom, tensor, smem_ptr in zip(
                    tma_copy_atom, tensor_gmem, tensormap_smem_ptr
                ):
                    cute.nvgpu.cpasync.update_tma_descriptor(
                        copy_atom, tensor, smem_ptr
                    )
            # wait until it's safe to update tensormap in global memory
            with cute.arch.elect_one():
                cute.arch.cp_async_bulk_commit_group()
                cute.arch.cp_async_bulk_wait_group(0, read=True)
            cute.arch.sync_warp()
            # updates to tensormap in global memory
            if const_expr(self.tensormap_update_mode == TensorMapUpdateMode.SMEM):
                # Copy the updated descriptor from shared memory back to global
                # memory with a release fence.
                for gmem_ptr, smem_ptr in zip(tensormap_gmem_ptr, tensormap_smem_ptr):
                    cute.nvgpu.cpasync.cp_fence_tma_desc_release(gmem_ptr, smem_ptr)
            else:
                # GMEM mode: update the descriptor in place in global memory.
                for copy_atom, tensor, gmem_ptr in zip(
                    tma_copy_atom, tensor_gmem, tensormap_gmem_ptr
                ):
                    cute.nvgpu.cpasync.update_tma_descriptor(
                        copy_atom, tensor, gmem_ptr
                    )
                cute.arch.sync_warp()
                cute.nvgpu.cpasync.fence_tma_desc_release()
diff --git a/python/CuTeDSL/cutlass_dsl/__init__.py b/python/CuTeDSL/cutlass_dsl/__init__.py
new file mode 100644
index 00000000..9c6861c3
--- /dev/null
+++ b/python/CuTeDSL/cutlass_dsl/__init__.py
@@ -0,0 +1,37 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ +from .cutlass import * + +from ..base_dsl.ast_helpers import ( + loop_selector, + if_selector, + if_executor, + while_selector, + while_executor, + range_constexpr, + range_dynamic, + const_expr, + dynamic_expr, + assert_executor, + bool_cast, +) + +from ..base_dsl import * +from ..base_dsl.dsl import extract_mlir_values, new_from_mlir_values +from ..base_dsl.typing import _binary_op_type_promote +from ..base_dsl._mlir_helpers.gpu import * +from ..base_dsl._mlir_helpers.op import dsl_user_op +from ..base_dsl.runtime import * +from ..base_dsl.runtime import cuda as cuda_helpers +from ..base_dsl.compiler import compile +from ..base_dsl.runtime.dlpack_runtime import * +from ..base_dsl.runtime.jit_arg_adapters import * diff --git a/python/CuTeDSL/cutlass_dsl/cutlass.py b/python/CuTeDSL/cutlass_dsl/cutlass.py new file mode 100644 index 00000000..1e2f4d1c --- /dev/null +++ b/python/CuTeDSL/cutlass_dsl/cutlass.py @@ -0,0 +1,1322 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides a DSL for Cutlass Dialects. It also includes utils with +regarding to that dialect. 
+""" + +# Local module imports +from typing import Callable, Union, Type, List, Union, Sequence, ForwardRef +from inspect import isclass +import functools +import pkgutil +from dataclasses import is_dataclass + +from ..base_dsl import * +from ..base_dsl import compiler +from ..base_dsl.dsl import is_dynamic_expression, extract_mlir_values +from ..base_dsl.typing import * +from ..base_dsl.typing import DynamicExpression, get_mlir_types +from ..base_dsl.runtime.jit_arg_adapters import is_arg_spec_constexpr + +from ..base_dsl.ast_helpers import const_expr + +# MLIR Imports +from cutlass._mlir import ir, execution_engine, passmanager +from cutlass._mlir.dialects import arith, func, gpu, scf, cute, gpu as cutlass_gpu +from cutlass._mlir.dialects._ods_common import ( + get_op_result_or_op_results as _get_op_result_or_op_results, +) +from cutlass._mlir.extras import types as T + +# Helpers +from ..base_dsl._mlir_helpers import arith as cutlass_arith +from ..base_dsl._mlir_helpers import lru_cache_ir + +from ..base_dsl.ast_helpers import ( + loop_selector, + executor, + if_selector, + if_executor, + while_selector, + while_executor, + assert_executor, + bool_cast, +) +from ..base_dsl.runtime.dlpack_runtime import ( + get_cute_tensor_c_pointer, + get_tensor_desc_shape_all, + get_tensor_desc_stride_all, + get_tensor_desc_element_type, + get_tensor_desc_is_in_device, + get_tensor_desc_assumed_align, +) + +from .cutlass_ast_decorators import ( + _loop_execute_range_dynamic, + _if_execute_dynamic, + _while_execute_dynamic, +) + +# ============================================================================= +# Set the AST decorator +# ============================================================================= + +# Set the DSL specific functions +executor.set_functions( + is_dynamic_expression, + _loop_execute_range_dynamic, + _if_execute_dynamic, + _while_execute_dynamic, +) + + +# ============================================================================= +# Cutlass DSL 
Base Abstract Class +# ============================================================================= + + +# Return a ctype class that represents the in-memory layout expected +# for a CuTe hierarchical tuple type. +def get_sparse_tuple_ctype(dyn): + # When there is a single dynamic value, the sparse CuTe + # representation is a single integer. + if isinstance(dyn, int): + return ctypes.c_int32 + + # For zero or greater than 1 dynamic values, the tuple + # representation will be a struct with a field for each dynamic + # value. The representation is flattened, even for hierarchical CuTe + # profiles (although we are only dealing with depth 1 inputs here). + class TupleDescriptor(ctypes.Structure): + _fields_ = [(f"x{idx}", ctypes.c_int32) for idx in range(len(dyn))] + + def __str__(self): + return f"struct<{str(self._fields_)}>" + + return TupleDescriptor + + +def is_cute_algebra_type(arg_spec): + # Walk through the arg_spec to check if it's a cute algebra type + _cute_algebra_type_aliases = ( + "Shape", + "Stride", + "Coord", + "Tile", + "IntTuple", + ) + + origin = get_origin(arg_spec) + if origin is Union: + for sub_ty in get_args(arg_spec): + sub_origin = get_origin(sub_ty) + if sub_origin is Tuple or ( + type(sub_origin) is type and issubclass(sub_origin, tuple) + ): + tuple_arg0 = get_args(sub_ty)[0] + if isinstance( + tuple_arg0, ForwardRef + ) and tuple_arg0.__forward_arg__ in (_cute_algebra_type_aliases): + return True + return False + + +class CutlassBaseDSL(BaseDSL): + """This abstract class provides a DSL for Cutlass.""" + + def __init__( + self, + name: str, + compiler_provider: Any, + pass_sm_arch_name: str, + device_compilation_only: bool = False, + preprocess: bool = False, + ): + super().__init__( + name, + compiler_provider, + pass_sm_arch_name, + device_compilation_only, + preprocess, + ) + + def _is_tensor_descriptor(self, maybe_tensor_descriptor) -> bool: + return False + + def _build_gpu_module(self, attrs): + self.gpu_module = 
gpu.GPUModuleOp(ir.StringAttr.get("kernels")) + with ir.InsertionPoint(self.gpu_module.bodyRegion.blocks.append(*[])): + pass + + for attr_name in attrs: + self.gpu_module.attributes[attr_name] = ir.Attribute.parse(attrs[attr_name]) + + def _get_pipeline(self, pipeline): + pipeline = super()._get_pipeline(pipeline) + if pipeline == None: + # cubin format is required to be cubin as we launch cuda module at python level. + return "builtin.module(cute-to-nvvm{cubin-format=bin opt-level=3})" + + return pipeline + + def preprocess_pipeline(self, pipeline, arch) -> str: + pipeline = super().preprocess_pipeline(pipeline, arch) + pipeline = pipeline.rstrip(")") + ",external-kernel-for-gpu-launch)" + return pipeline + + def _enter_gpu_module(self): + return ir.InsertionPoint(self.gpu_module.bodyRegion.blocks[0]) + + def _generate_kernel_attrs(self, config: BaseDSL.LaunchConfig) -> dict: + assert isinstance( + config, BaseDSL.LaunchConfig + ), f"Expect LaunchConfig for @kernel, but got {type(config)}" + + ret = {} + # generate launch bound attr from LaunchConfig + max_threads = ", ".join(map(str, config.block)) + ret["nvvm.reqntid"] = ir.Attribute.parse(f"array") + # min_blocks_per_mp is optional for kernel + min_blocks = config.min_blocks_per_mp + if min_blocks > 0: + ret["nvvm.minctasm"] = ir.Attribute.parse(f"{min_blocks} : i32") + return ret + + @lru_cache(maxsize=1) + def get_version(self): + """ + Get the version of cutlass dsl, used for computing the hash key of the cache. + Including source python files and the shared library. 
+ """ + dsl_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + # get the version hash of the cutlass shared library + version_hash = hashlib.sha256() + # update the version hash of the source python files + for lib in pkgutil.walk_packages([dsl_path], prefix="cutlass."): + try: + with open(lib.module_finder.find_spec(lib.name).origin, "rb") as f: + version_hash.update(f.read()) + except Exception: + raise DSLRuntimeError( + f"Failed to read module file {lib.name}. The file may not exist or may not be readable." + "Please re-install the package." + ) + try: + # update the version hash of the cutlass shared library + with open( + os.path.join(dsl_path, "_mlir/_mlir_libs/libCutlassIRPythonCAPI.so"), + "rb", + ) as f: + while True: + chunk = f.read(1024**2) + if not chunk: + break + version_hash.update(chunk) + except Exception: + raise DSLRuntimeError( + f"Failed to read the shared library file libCutlassIRPythonCAPI.so." + "The file may not exist or may not be readable." + "Please re-install the package." + ) + + return version_hash + + def _kernel_helper(self, funcBody, *args, **kwargs): + class _CutlassIrKernelGenHelper(BaseDSL._KernelGenHelper): + def generate_func_op(self, arg_types, arg_attrs, kernel_name, loc=None): + super().generate_func_op(arg_types, arg_attrs, kernel_name) + self.func_op = func.FuncOp( + kernel_name, ir.FunctionType.get(arg_types, []), loc=loc + ) + if arg_attrs is not None: + log().debug(arg_attrs) + self.func_op.arg_attrs = arg_attrs + return self.func_op + + def generate_func_ret_op(self): + return func.ReturnOp([]) + + def get_func_body_start(self): + assert self.func_op is not None, "Invalid func_op is not expected!" 
# NOTE(review): reconstructed from a whitespace-mangled diff. This span is the
# interior of the CutlassBaseDSL class; the enclosing method and the nested
# _CutlassIrKernelGenHelper class open above this excerpt — confirm exact
# nesting levels against the upstream file.
            # Tail of the kernel-body generator helper: create and return the
            # entry block of the func op built above (outside this excerpt).
            return self.func_op.add_entry_block()

        def generate_launch_op(self, *args, **kwargs):
            """Emit the gpu.launch_func op for a @cute.kernel.

            Expects `kernelSym`, `kernelOperands` and `requiredArgs` kwargs;
            `requiredArgs.config` must be a BaseDSL.LaunchConfig. Returns the
            gpu async token when async dependencies are present, else None.
            """
            # Extract args and do validation
            kernelSym = kwargs.get("kernelSym", None)
            kernelOperands = kwargs.get("kernelOperands", None)
            requiredArgs = kwargs.get("requiredArgs", None)
            assert kernelSym is not None, "kernelSym being None is not expected!"
            assert (
                requiredArgs is not None
            ), "requiredArgs being None is not expected!"
            assert (
                kernelOperands is not None
            ), "kernelOperands being None is not expected!"
            assert isinstance(
                requiredArgs.config, BaseDSL.LaunchConfig
            ), f"Expect LaunchConfig for @kernel, but got {type(requiredArgs.config)}"

            cfg = requiredArgs.config

            # Normalize grid/block (and cluster, if present) sizes to MLIR
            # index values.
            cfg.grid = [to_index(size) for size in cfg.grid]
            cfg.block = [to_index(size) for size in cfg.block]
            if cfg.has_cluster:
                cfg.cluster = [to_index(size) for size in cfg.cluster]

            cfg.smem = const(cfg.smem)

            # A non-empty async_deps list makes the launch asynchronous and
            # produces a gpu async token result.
            if not isinstance(cfg.async_deps, (list, tuple)):
                cfg.async_deps = [cfg.async_deps]
            is_async = len(cfg.async_deps) > 0
            token = gpu.launch_func(
                gpu.AsyncTokenType.get() if is_async else None,
                cfg.async_deps,
                kernelSym,
                *cfg.grid,
                *cfg.block,
                kernelOperands,
                **dict(
                    zip(
                        ("cluster_size_x", "cluster_size_y", "cluster_size_z"),
                        tuple(cfg.cluster),
                    )
                ),
                dynamic_shared_memory_size=cfg.smem,
            )
            return token if is_async else None

        # The decorator returns a launcher that defers kernel generation until
        # .launch()/__call__ is invoked with a launch configuration.
        return KernelLauncher(
            self, _CutlassIrKernelGenHelper, funcBody, *args, **kwargs
        )

    def _get_globals(self):
        # Merge this module's globals with the caller's globals and locals so
        # names visible at the call site resolve during JIT tracing.
        caller_globals = self.frame.f_globals
        caller_locals = self.frame.f_locals
        all_globals = globals().copy()
        all_globals.update(caller_globals)
        all_globals.update(caller_locals)
        return all_globals

    def _preprocess_launch_config_args(self, args, kwargs):
        """Helper to preprocess args and kwargs for LaunchConfig"""
        # `stream=` is user-facing sugar for LaunchConfig's `async_deps`.
        if "stream" in kwargs:
            kwargs["async_deps"] = kwargs.pop("stream")

    def mangle_name(self, function_name, args, args_spec: inspect.FullArgSpec):
        """Mangle the name of the function to avoid conflicts with other functions"""
        # Prefix every JIT symbol so generated names cannot collide with
        # user-defined or base-DSL symbols.
        function_name = "cutlass_" + function_name
        return super().mangle_name(function_name, args, args_spec)

    def _validate_arg(self, arg, arg_index, arg_name, arg_annotation):
        """
        Validates if the arg is really of the annotated type.

        Returns a DSLRuntimeError describing the mismatch (it is returned, not
        raised, so the caller decides how to report it), or None on success.
        """

        if is_arg_spec_constexpr(arg_annotation, arg_name, arg_index, None):
            # Constexpr arguments are not type-checked here.
            pass
        else:
            origin = get_origin(arg_annotation)
            # Handle special case where annotation is Type[X] but arg is an actual type
            if origin is type and isinstance(arg, type):
                # Get the expected base type from Type[X]
                expected_base = get_args(arg_annotation)[0]
                if not issubclass(arg, expected_base):
                    return DSLRuntimeError(
                        f"expects argument #{arg_index+1} ({arg_name}) to be Type[{expected_base}], but got {arg}"
                    )
            # Handle Union types and generic types
            elif origin is Union:
                # For Union types, check if arg matches any of the allowed types
                allowed_types = get_args(arg_annotation)
                if not any(
                    (isinstance(ty, type) and isinstance(arg, ty))
                    or (get_origin(ty) is tuple and isinstance(arg, tuple))
                    for ty in allowed_types
                ):
                    return DSLRuntimeError(
                        f"expects argument #{arg_index+1} ({arg_name}) to be one of {allowed_types}, but got {type(arg)}"
                    )
            elif isinstance(arg_annotation, type):
                # Handle simple type annotations; None is tolerated.
                if not isinstance(arg, arg_annotation) and arg is not None:
                    return DSLRuntimeError(
                        f"expects argument #{arg_index+1} ({arg_name}) to be {arg_annotation}, but got {type(arg)}"
                    )
        # Everything looks good if we are here
        return None

    def _generate_jit_func_args_for_known_types(
        self,
        func,
        arg,
        arg_name,
        arg_spec,
        arg_index,
        *,
        is_host=True,
    ):
        """Extend the base implementation with CuTe-algebra argument lowering.

        Delegates to the base class first; when it yields no types, handles
        cute-algebra values by appending one JIT argument per extracted MLIR
        value. Returns (exec_args, arg_types, arg_attrs), all None when the
        argument kind is unknown.
        """
        jit_arg_type, jit_arg_attr, jit_exec_arg = [], [], []
        default_attr = ir.DictAttr.get({})

        (
            jit_exec_arg,
            jit_arg_type,
            jit_arg_attr,
        ) = super()._generate_jit_func_args_for_known_types(
            func, arg, arg_name, arg_spec, arg_index, is_host=is_host
        )

        if jit_arg_type is not None and len(jit_arg_type) == 0:
            # Handle DSL specific types
            if is_cute_algebra_type(arg_spec):
                dyn_vals = extract_mlir_values(arg)
                if dyn_vals:
                    # Handle dynamic types
                    jit_arg_type.extend([v.type for v in dyn_vals])
                    jit_arg_attr.extend([default_attr] * len(dyn_vals))
                    jit_exec_arg.extend(get_c_pointers(arg) if is_host else dyn_vals)
            # NOTE(review): indentation of this `else` reconstructed as
            # pairing with `is_cute_algebra_type` (unknown kinds return
            # all-None) — confirm against the upstream file.
            else:
                jit_exec_arg = jit_arg_type = jit_arg_attr = None
        return jit_exec_arg, jit_arg_type, jit_arg_attr

    def _generate_execution_arguments_for_known_types(
        self, arg, arg_spec, arg_name, i, fop_args, iv_block_args
    ):
        """Map block arguments back onto DSL values for execution.

        Consumes as many entries of `fop_args` (starting at `iv_block_args`)
        as the argument's MLIR types require, advancing the cursor.
        """
        ir_arg, iv_block_args = super()._generate_execution_arguments_for_known_types(
            arg, arg_spec, arg_name, i, fop_args, iv_block_args
        )
        if not ir_arg:
            # Handling DSL specific types
            if is_cute_algebra_type(arg_spec):
                n_args = len(get_mlir_types(arg))
                blk_args = fop_args[iv_block_args : iv_block_args + n_args]
                ir_arg.append(new_from_mlir_values(arg, blk_args))
                iv_block_args += n_args

        return ir_arg, iv_block_args


# =============================================================================
# Cute DSL Class
# =============================================================================


class CuTeDSL(CutlassBaseDSL):
    """
    This is a concrete DSL subclass for the CuTe dialect.
    """

    def __init__(self):
        name = "CUTE_DSL"
        compiler_provider = compiler.Compiler(passmanager, execution_engine)
        # Pass-manager option used to select the target SM architecture.
        pass_sm_arch_name = "cubin-chip"

        super().__init__(name, compiler_provider, pass_sm_arch_name, preprocess=True)


# =============================================================================
# KernelLauncher
# =============================================================================


class KernelLauncher:
    """
    This class is used to launch a kernel function.
    Usage:
    ```python
    @cute.kernel
    def kernel(arg1, arg2, ...):
        ...

    @cute.jit
    def launch_kernel():
        kernel(arg1, arg2, ...).launch(grid=[1, 1, 1], block=[1, 1, 1], ...)
        # or
        kernel(arg1, arg2, ...)(grid=[1, 1, 1], block=[1, 1, 1], ...)
    ```
    """

    def __init__(
        self,
        dsl: "CutlassBaseDSL",
        kernelGenHelper: BaseDSL._KernelGenHelper,
        funcBody,
        *func_args,
        **func_kwargs,
    ):
        self.dsl = dsl
        self.kernelGenHelper = kernelGenHelper
        self.funcBody = funcBody
        self.func_args = func_args
        self.func_kwargs = func_kwargs

        # Fail fast: validate the call arguments against the kernel signature
        # at construction time rather than at launch time.
        self._check_func_args(funcBody, *func_args, **func_kwargs)

    def _check_func_args(self, funcBody, *func_args, **func_kwargs):
        # Get function signature
        sig = inspect.signature(funcBody)

        # func_args and func_kwargs should match funcBody's signature,
        # no extra or missing arguments.
        try:
            sig.bind(*func_args, **func_kwargs)
        except TypeError as e:
            raise DSLRuntimeError(
                f"Failed to bind arguments to function `{funcBody.__name__}` with signature `{sig}`",
                cause=e,
            )

    def launch(self, *args, **kwargs):
        # Capture the caller's frame so its globals/locals are visible while
        # tracing the kernel body.
        self.dsl.frame = inspect.currentframe().f_back
        self.dsl._preprocess_launch_config_args(args, kwargs)
        config = self.dsl.LaunchConfig(*args, **kwargs)

        kernel_generator = self.dsl.kernel_launcher(
            requiredArgs=["config"],
            unitAttrNames=["gpu.kernel", "cute.kernel"],
            valueAttrDict=self.dsl._generate_kernel_attrs(config),
            kernelGenHelper=self.kernelGenHelper,
        )(self.funcBody)

        ret, name = kernel_generator(*self.func_args, **self.func_kwargs, config=config)
        # Record the generated kernel symbol for later lookup/compilation.
        self.dsl.kernel_symbols.append(name)
        return ret.launch_op_ret

    def __call__(self, *args, **kwargs):
        # Calling the launcher is equivalent to .launch(...).
        return self.launch(*args, **kwargs)


# =============================================================================
# Utils
# =============================================================================
+ """ + if not isinstance(obj_or_cls, type): + # If it's an instance, get its class + obj_or_cls = obj_or_cls.__class__ + + # Must be a dataclass, and __dataclass_params__.frozen must be True + return ( + is_dataclass(obj_or_cls) + and getattr(obj_or_cls, "__dataclass_params__", None) is not None + and obj_or_cls.__dataclass_params__.frozen + ) + + +def pack_from_irvalue( + ir_values: List["ir.Value"], + indices: Dict[int, Tuple[int, int]], + class_types: List[Any], +) -> List[Any]: + """ + Packs MLIR values into a list of mixed values. + """ + log().info("===--- Values Pack (%d)", len(ir_values)) + for idx, packed in enumerate(ir_values): + log().info("[%d]: will-packed: %s", idx, ir_values) + for idx, unpacked in indices.items(): + log().info("[%d]: indices: %s", idx, unpacked) + for idx, c in enumerate(class_types): + log().info("[%d]: obj-types: %s", idx, type(c)) + + mixed_values = [None] * len(indices) + for idx, (start, length) in sorted(indices.items()): + chunk = ir_values[start : start + length] + obj = class_types[idx] + if is_frozen_dataclass(obj): + mixed_values[idx] = obj + elif not isinstance(obj, type) and hasattr(obj, "__new_from_mlir_values__"): + mixed_values[idx] = obj.__new_from_mlir_values__(chunk) + else: + try: + if isinstance(chunk, list) and chunk[0] is None: + mixed_values[idx] = class_types[idx] + else: + mixed_values[idx] = t.as_numeric(chunk[0]) + except DSLRuntimeError as e: + mixed_values[idx] = chunk[0] + + log().info("------------------ ") + for idx, packed in enumerate(mixed_values): + log().info("[%d]: packed: %s", idx, packed) + log().info("------------------ ") + return mixed_values + + +def unpack_to_irvalue( + mixed_values: List[Any], body_name: str +) -> Tuple[List[ir.Value], List[Any], Dict[int, Tuple[int, int]], List[Any]]: + """ + Unpacks mixed values into ir.Value values. 
+ """ + unpacked_values = [] + ir_values = [] + indices = {} + class_types = [] + current_offset = 0 + + log().info("===--- Values UNPack (%d)", len(mixed_values)) + for idx, packed in enumerate(mixed_values): + log().info("[%d]: will-unpacked: [type:%s] %s", idx, type(packed), packed) + for idx, item in enumerate(mixed_values): + class_types.append(item) + try: + if is_frozen_dataclass(item): + extracted_vals = [None] + else: + extracted_vals = extract_mlir_values(item) + # it's consexpr (python value), so we create mlir value for it + if extracted_vals == []: + if item is None: + extracted_vals = [None] + else: + dyn_expr = t.as_numeric(item) + extracted_vals = extract_mlir_values(dyn_expr) + ir_values.extend(extracted_vals) + else: + ir_values.extend(extracted_vals) + + unpacked_values.extend(extracted_vals) + length = len(extracted_vals) + indices[idx] = (current_offset, length) + current_offset += length + except Exception as e: + raise DSLRuntimeError( + f"The '{body_name}' statement encountered a user-defined Python object, which cannot be automatically converted into an dynamic expression (aka MLIR value).", + context={ + item: ( + f"All expressions within '{body_name}' must be dynamic expressions, " + "mixing Python objects and dynamic expressions (aka MLIR values) is not supported. " + "The DSL failed to convert the Python object into MLIR values." + ) + }, + suggestion=( + f"Please ensure '{item}' implements the '{DynamicExpression.__name__}', " + f"so it can be treated as a valid dynamic expression or mark '{body_name}' as a constant expression if conditions are Python objects." 
+ ), + ) from e + + log().info("------------------ ") + for idx, unpacked in enumerate(unpacked_values): + log().info("[%d]: unpacked values: %s", idx, unpacked) + for idx, unpacked in enumerate(ir_values): + log().info("[%d]: unpacked ir_values: %s", idx, unpacked) + for idx, unpacked in indices.items(): + log().info("[%d]: indices: %s", idx, unpacked) + for idx, unpacked in enumerate(class_types): + log().info("[%d]: initial-class-types: %s", idx, unpacked) + log().info("------------------ ") + + return ir_values, unpacked_values, indices, class_types + + +def to_index(value): + """Converts a value to an index, either by casting or coercing to int.""" + if is_dynamic_expression(value): + if isinstance(value, Numeric): + value = value.ir_value() + assert ir.IntegerType.isinstance( + value.type + ), f"expects integer type, but got {value.type}" + res = arith.index_cast(T.index(), value) + else: + res = const(int(value), ty=T.index()) + + return res + + +def _validate_iter_args_structure(iter_args, ir_values): + """ + Validates that iter_args structure contains the same number of atomic values + as there are IR values. 
+ + Args: + iter_args: Original iteration arguments, possibly nested sequences + ir_values: Flattened MLIR values extracted from iter_args + + Returns: + bool: True if the number of atomic values in iter_args matches + the number of values in ir_values + """ + # Handle non-sequence case + if not isinstance(iter_args, (tuple, list, set)): + return not isinstance(ir_values, (tuple, list, set)) or len(ir_values) == 1 + + # If we have a sequence but ir_values isn't one, there's a mismatch + if not isinstance(ir_values, (tuple, list, set)): + return False + + # Count all non-sequence values recursively + def count_values(args): + if not isinstance(args, (tuple, list, set)): + return 1 + else: + return sum(count_values(arg) for arg in args) + + return count_values(iter_args) == len(ir_values) + + + +# ============================================================================= +# DSL implementation of Python Build-in Operators +# ============================================================================= + + +def _minmax(op, *args, loc=None, ip=None): + """Computes the minimum or maximum value from the provided arguments.""" + from ..base_dsl.typing import _binary_op, _binary_op_type_promote + + # AST Traversal doesn't support early exit in if executor + x = None + res = None + if len(args) == 1: + # Handle case for min([a, b, c, d, ..]) + if hasattr(args[0], "__iter__"): + x = op(*tuple(args[0])) + # Handle case for min(a) + else: + x = args[0] + # Handle case for min(a, b, c, ...) 
and min([x, y], [b]) and min(a, (x, y, z)) + elif len(args) > 1: + res, *xs = tuple(args) + for x in xs: + lhs = as_numeric(op(res, loc=loc, ip=ip)) + rhs = as_numeric(op(x, loc=loc, ip=ip)) + emitter = getattr(cutlass_arith, f"_{op.__name__}") + + lhs, rhs, res_type = _binary_op_type_promote(lhs, rhs, promote_bool=True) + + if isinstance(lhs.value, cutlass_arith.ArithValue) and isinstance( + lhs, Integer + ): + lhs_val = lhs.value.with_signedness(lhs.signed) + else: + lhs_val = lhs.value + + if isinstance(rhs.value, cutlass_arith.ArithValue) and isinstance( + rhs, Integer + ): + rhs_val = rhs.value.with_signedness(rhs.signed) + else: + rhs_val = rhs.value + + res = res_type(emitter(lhs_val, rhs_val), loc=loc, ip=ip) + x = res + else: + raise DSLNotImplemented(f"{type(args)} is not supported") + return x + + +def min(*args, loc=None, ip=None): + """Computes the minimum value from the provided arguments. + + This function differs from Python's built-in min() in that the return type + is determined by the static types of the inputs, not their dynamic values. + + :param args: One or more values or iterables to find the minimum of + :type args: tuple + :param loc: Source location for MLIR operation tracking + :type loc: object, optional + :param ip: Insertion point for MLIR operation + :type ip: object, optional + :return: The minimum value among all inputs + :rtype: Numeric + :raises DSLNotImplemented: If the input type is not supported + + Supports multiple calling patterns: + + - min(a): Returns a + - min([a, b, c, ...]): Returns minimum of all elements in the iterable + - min(a, b, c, ...): Returns minimum of all arguments + - min([x, y], [b]): Returns minimum across all elements in all iterables + - min(a, (x, y, z)): Returns minimum across all elements + + Examples: + + .. 
def min(*args, loc=None, ip=None):
    """Computes the minimum value from the provided arguments.

    This function differs from Python's built-in min() in that the return type
    is determined by the static types of the inputs, not their dynamic values.

    :param args: One or more values or iterables to find the minimum of
    :type args: tuple
    :param loc: Source location for MLIR operation tracking
    :type loc: object, optional
    :param ip: Insertion point for MLIR operation
    :type ip: object, optional
    :return: The minimum value among all inputs
    :rtype: Numeric
    :raises DSLNotImplemented: If the input type is not supported

    Supports multiple calling patterns:

    - min(a): Returns a
    - min([a, b, c, ...]): Returns minimum of all elements in the iterable
    - min(a, b, c, ...): Returns minimum of all arguments
    - min([x, y], [b]): Returns minimum across all elements in all iterables
    - min(a, (x, y, z)): Returns minimum across all elements

    Examples:

    .. code-block:: python

        result = min(x, y)            # minimum of two values
        result = min(a, b, c, d)      # minimum of multiple values
        result = min([a, b, c, d])    # minimum of values in a list
        result = min(x, [y, z])       # minimum across mixed arguments

    Difference from Python's built-in min():

    .. code-block:: python

        # In Python, the return type depends on the dynamic values:
        a = 5
        b = 3.14
        result = min(a, b)  # Returns 3.14 (float)

        # In this DSL implementation, the return type is determined statically:
        a = Int32(5)
        b = Float32(3.14)
        result = min(a, b)  # Return type is determined by the type of operands, not values
    """
    return _minmax(min, *args, loc=loc, ip=ip)


def max(*args, loc=None, ip=None):
    """Computes the maximum value from the provided arguments.

    This function differs from Python's built-in max() in that the return type
    is determined by the static types of the inputs, not their dynamic values.

    :param args: One or more values or iterables to find the maximum of
    :type args: tuple
    :param loc: Source location for MLIR operation tracking
    :type loc: object, optional
    :param ip: Insertion point for MLIR operation
    :type ip: object, optional
    :return: The maximum value among all inputs
    :rtype: Numeric
    :raises DSLNotImplemented: If the input type is not supported

    Supports multiple calling patterns:

    - max(a): Returns a
    - max([a, b, c, ...]): Returns maximum of all elements in the iterable
    - max(a, b, c, ...): Returns maximum of all arguments
    - max([x, y], [b]): Returns maximum across all elements in all iterables
    - max(a, (x, y, z)): Returns maximum across all elements

    Examples:

    .. code-block:: python

        result = max(x, y)            # maximum of two values
        result = max(a, b, c, d)      # maximum of multiple values
        result = max([a, b, c, d])    # maximum of values in a list
        result = max(x, [y, z])       # maximum across mixed arguments

    Difference from Python's built-in max():

    .. code-block:: python

        # In Python, the return type depends on the dynamic values:
        a = 5
        b = 3.14
        result = max(a, b)  # Returns 5 (int)

        # In this DSL implementation, the return type is determined statically:
        a = Int32(5)
        b = Float32(3.14)
        result = max(a, b)  # Return type is determined by the type of operands, not values
    """
    return _minmax(max, *args, loc=loc, ip=ip)
code-block:: python + + # In Python, 'and' returns the second operand if the first is truthy, + # otherwise it returns the first operand + a = 5 + b = 3 + result = a and b # Returns 3 + + # In this DSL implementation, the behavior is similar but works with DSL types + a = Int32(5) + b = Int32(3) + result = and_(a, b) # Returns b + """ + if len(args) == 0: + raise ValueError("and_() requires at least one argument") + + if len(args) == 1: + return args[0] + + def and_op(lhs, rhs): + if not isinstance(lhs, (Numeric, cutlass_arith.ArithValue, int, float, bool)): + raise DSLNotImplemented(f"{type(lhs)} is not supported") + elif isinstance(lhs, (int, float, bool)) and isinstance( + rhs, (int, float, bool) + ): + return lhs and rhs + else: + return as_numeric(lhs).__dsl_and__(as_numeric(rhs)) + + return functools.reduce(and_op, args[1:], args[0]) + + +def or_(*args, loc=None, ip=None): + """Logical OR operation for DSL numeric types. + + :param *args: One or more numeric values to OR together + :type *args: Numeric + :param loc: Source location for MLIR operation tracking + :type loc: object, optional + :param ip: Insertion point for MLIR operation + :type ip: object, optional + :return: The result of the logical OR operation + :rtype: Numeric + :raises ValueError: If no arguments are provided + + Supports multiple calling patterns: + + - or_(a): Returns a + - or_(a, b, c, ...): if a is truthy, returns a, otherwise returns or_(b, c, ...) + + Examples: + + .. 
code-block:: python + + # In Python, 'or' returns the first operand if it's truthy, + # otherwise it returns the second operand + a = 5 + b = 3 + result = a or b # Returns 5 + + # In this DSL implementation, the behavior is similar but works with DSL types + a = Int32(5) + b = Int32(3) + result = or_(a, b) # Returns a + """ + if len(args) == 0: + raise ValueError("or_() requires at least one argument") + + if len(args) == 1: + return args[0] + + def or_op(lhs, rhs): + if not isinstance(lhs, (Numeric, cutlass_arith.ArithValue, int, float, bool)): + raise DSLNotImplemented(f"{type(lhs)} is not supported") + elif isinstance(lhs, (int, float, bool)) and isinstance( + rhs, (int, float, bool) + ): + return lhs or rhs + else: + return as_numeric(lhs).__dsl_or__(as_numeric(rhs)) + + return functools.reduce(or_op, args[1:], args[0]) + + +def all_(iterable): + """Logical AND operation for all elements in an iterable. + + Returns True if all elements in the iterable are truthy, otherwise False. + This is the DSL equivalent of Python's built-in all() function. + + :param iterable: An iterable containing values to check + :type iterable: Iterable + :return: True if all elements are truthy, False otherwise + :rtype: Boolean + + Examples: + + .. code-block:: python + + # Check if all values are non-zero + values = [Int32(1), Int32(2), Int32(3)] + result = all_(values) # Returns True + + # Check if all conditions are met + conditions = [a > 0, b < 10, c != 0] + result = all_(conditions) # Returns True if all conditions are met + """ + bool_iterable = [Boolean(i) for i in iterable] + return functools.reduce( + lambda lhs, rhs: lhs.__dsl_and__(rhs) if hasattr(lhs, "__dsl_and__") else lhs, + bool_iterable, + Boolean(True), + ) + + +def any_(iterable): + """Logical OR operation for any element in an iterable. + + Returns True if any element in the iterable is truthy, otherwise False. + This is the DSL equivalent of Python's built-in any() function. 
+ + :param iterable: An iterable containing values to check + :type iterable: Iterable + :return: True if any element is truthy, False otherwise + :rtype: Boolean + + Examples: + + .. code-block:: python + + # Check if any value is non-zero + values = [Int32(0), Int32(0), Int32(3)] + result = any_(values) # Returns True + + # Check if any condition is met + conditions = [a > 10, b < 0, c != 0] + result = any_(conditions) # Returns True if any condition is met + """ + bool_iterable = [Boolean(i) for i in iterable] + return functools.reduce( + lambda lhs, rhs: lhs.__dsl_or__(rhs) if hasattr(lhs, "__dsl_or__") else lhs, + bool_iterable, + Boolean(False), + ) + + +# ============================================================================= +# Conditional Expression +# ============================================================================= + + +def select_(cond, if_value, else_value): + def _as_scalar(value): + if const_expr(isinstance(value, list)): + if const_expr(len(value) == 1): + return value[0] + else: + raise DSLRuntimeError( + "Conditional expression must have exactly one value in all expressions" + ) + return value + + # Non-DSL dynamic cond should be handled before this. 
+ if const_expr(not is_dynamic_expression(cond)): + raise DSLRuntimeError("Conditional expression must be dynamic") + + # Extract MLIR values + cond = extract_mlir_values(cond) + if const_expr(is_dynamic_expression(if_value)): + if_value = extract_mlir_values(if_value) + else: + if_value = const(if_value) + if const_expr(is_dynamic_expression(else_value)): + else_value = extract_mlir_values(else_value) + else: + else_value = const(else_value) + + return arith.SelectOp( + _as_scalar(cond), _as_scalar(if_value), _as_scalar(else_value) + ).result + + +# ============================================================================= +# Terminator +# ============================================================================= + + +def yield_out(args=[], loc=None, ip=None): + """ + Generate a yield operation. It it used to return values from a loop, if-else, or while region. + """ + scf.yield_(extract_mlir_values(args), loc=loc, ip=ip) + + +# ============================================================================= +# For Loop +# ============================================================================= + + +class LoopUnroll(ir.Attribute): + def __init__(self, **kwargs): + valid_keys = set(["count", "full"]) + def to_mlir_attr(val): + if isinstance(val, bool): + return "true" if val else "false" + elif isinstance(val, int): + return f"{val} : i32" + else: + raise DSLNotImplemented(f"{type(val)} is not supported") + + cfg = {key: to_mlir_attr(kwargs[key]) for key in valid_keys if key in kwargs} + if kwargs.get("count", None) == 1: + cfg["disable"] = "true" + + unroll = "<" + ", ".join(f"{key} = {value}" for key, value in cfg.items()) + ">" + + super().__init__( + ir.Attribute.parse(f"#llvm.loop_annotation") + ) + + +def for_generate( + start, + stop=None, + step=None, + iter_args: Optional[Sequence[ir.Value]] = None, + *, + unroll: LoopUnroll = None, + loc=None, + ip=None, +): + """ + scf.for with yield support + """ + + if step is None: + step = 1 + if stop is 
None: + stop = start + start = 0 + start = const(start) + params = [start, stop, step] + for i, p in enumerate(params): + if isinstance(p, int): + p = const(p) + elif isinstance(p, float): + raise DSLRuntimeError(f"{p=} must be int.") + elif isinstance(p, Integer): + p = p.ir_value() + params[i] = p + + start, stop, step = params + + def _createI32Attr(value): + if not isinstance(value, int): + raise DSLRuntimeError(f"value must be int.") + return ir.IntegerAttr.get(ir.IntegerType.get_signless(32), value) + + ir_iter_args = extract_mlir_values(iter_args) if iter_args is not None else None + if not _validate_iter_args_structure(iter_args, ir_iter_args): + raise DSLRuntimeError("iter_args: Elements should be extractable as ir.Value.") + for_op = scf.ForOp(start, stop, step, ir_iter_args, loc=loc, ip=ip) + if unroll is not None: + for_op.attributes["loop_annotation"] = unroll + + iv = for_op.induction_variable + new_results = new_from_mlir_values(iter_args, for_op.results) + new_iter_args = new_from_mlir_values(iter_args, for_op.inner_iter_args) + new_iter_args = () if new_iter_args is None else tuple(new_iter_args) + + with ir.InsertionPoint(for_op.body): + if len(new_iter_args) > 1: + yield iv, new_iter_args, new_results + elif len(new_iter_args) == 1: + yield iv, new_iter_args[0], new_results[0] + else: + yield iv + + +# ============================================================================= +# Logical Operators +# ============================================================================= + + +def not_(lhs: Union[ir.Value, bool], *, loc=None, ip=None): + """ + Logical Not + """ + res = None + # Handle Python bool first to prevent infinite recursion + if const_expr(type(lhs) == bool): + res = lhs ^ True + elif const_expr(hasattr(lhs, "__dsl_not__")): + res = lhs.__dsl_not__(loc=loc, ip=ip) + elif const_expr(is_dynamic_expression(lhs)): + # If lhs is MLIR value, compute not using xor + res = arith.XOrIOp(lhs, const(1, lhs.type)).result + else: + res = 
bool(lhs) ^ True + + return res + + +# ============================================================================= +# If/Else +# ============================================================================= + + +def if_generate( + cond: Boolean, + then_body: Callable, + else_body: Optional[Callable] = None, + input_args: List[DslType] = None, + return_types: List[DslType] = None, + *, + loc=None, + ip=None, +) -> List: + """ + Generate an IfOp with optional else branch and return values. + + Args: + cond: The condition expression + then_body: Function to execute in then branch + else_body: Optional function to execute in else branch + input_args: Arguments to pass to branch bodies + return_types: Expected return types for the operation + loc: Optional location information + ip: Optional insertion point + + Returns: + List of DSL typed results + """ + input_args = input_args or [] + mlir_return_types = [] + + # Validate and collect MLIR return types (if provided). + if return_types is not None: + for t in return_types: + if not isinstance(t, DslType): + raise DSLRuntimeError(f"{t=} must be a DslType.") + mlir_return_types.append(t.mlir_type) + + # Determine whether there's an else branch. + has_else = else_body is not None + + # Create the IfOp. + if_op = scf.IfOp( + Boolean(cond).ir_value(), mlir_return_types, hasElse=has_else, loc=loc, ip=ip + ) + + def _execute_and_yield_out(body, input_args): + yield_vals = body(*input_args) + if return_types is not None: + if not isinstance(yield_vals, Iterable): + # body only return single element + yield_vals = [yield_vals] + + yield_vals = [t(r) for t, r in zip(return_types, yield_vals)] + yield_out(yield_vals) + + # Generate the body for 'then'. + with ir.InsertionPoint(if_op.then_block): + _execute_and_yield_out(then_body, input_args) + + # Generate the body for 'else' if provided. + if has_else: + with ir.InsertionPoint(if_op.else_block): + _execute_and_yield_out(else_body, input_args) + + # Collect MLIR results. 
+ mlir_results = _get_op_result_or_op_results(if_op) + + if not isinstance(mlir_results, list): + mlir_results = [mlir_results] + + # Wrap the results with their DSL types. + if return_types is None: + return [] + + vals = [t(r) for t, r in zip(return_types, mlir_results)] + + if len(vals) == 1: + return vals[0] + + return vals + + +# ============================================================================= +# While Loop +# ============================================================================= + + +class WhileLoopContext: + """ + Context manager for a dynamic while loop. + """ + + def __init__( + self, + inputs: Sequence[Union[ir.Value, Numeric]], + condition: Callable[[Sequence[ir.Value]], ir.Value], + *, + loc=None, + ip=None, + ): + # Keep original inputs and allow recover original type information + self.inputs = inputs + + self.input_ir_values = extract_mlir_values(inputs) + + if not _validate_iter_args_structure(inputs, self.input_ir_values): + raise DSLRuntimeError("inputs: Elements should be extractable as ir.Value.") + + self.condition = condition + self.input_ir_types = [i.type for i in self.input_ir_values] + self.while_op = scf.WhileOp( + self.input_ir_types, self.input_ir_values, loc=loc, ip=ip + ) + + self.before_region = self.while_op.before + self.after_region = self.while_op.after + + self.before_region.blocks.append(*self.input_ir_types) + self.before_block = self.before_region.blocks[0] + + self.after_region.blocks.append(*self.input_ir_types) + self.after_block = self.after_region.blocks[0] + + def __enter__(self): + with ir.InsertionPoint(self.before_block): + args = new_from_mlir_values(self.inputs, self.before_block.arguments) + cond = self.condition(*args) + cond_ir_val = extract_mlir_values(cond) + scf.ConditionOp(cond_ir_val[0], [*self.before_block.arguments]) + self.ipoint_op = ir.InsertionPoint(self.after_block) + self.ipoint_op.__enter__() + return new_from_mlir_values(self.inputs, self.after_block.arguments) + + def 
__exit__(self, exc_type, exc_value, traceback): + self.ipoint_op.__exit__(exc_type, exc_value, traceback) + return True + + @property + def results(self): + return new_from_mlir_values(self.inputs, self.while_op.results_) + + +def while_generate( + inputs: Sequence[Union[ir.Value, Numeric]], + condition: Callable[[Sequence[Union[ir.Value, Numeric]]], Union[ir.Value, Numeric]], + *, + loc=None, + ip=None, +) -> WhileLoopContext: + """ + Generate a WhileLoopContext for a dynamic loop. + """ + return WhileLoopContext(inputs, condition, loc=loc, ip=ip) diff --git a/python/CuTeDSL/cutlass_dsl/cutlass_ast_decorators.py b/python/CuTeDSL/cutlass_dsl/cutlass_ast_decorators.py new file mode 100644 index 00000000..ba7b9d76 --- /dev/null +++ b/python/CuTeDSL/cutlass_dsl/cutlass_ast_decorators.py @@ -0,0 +1,515 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from typing import List, Tuple +from cutlass._mlir import ir +from cutlass._mlir.dialects import scf, arith +from cutlass._mlir.extras import types as T + +from ..base_dsl.dsl import extract_mlir_values, new_from_mlir_values +from ..base_dsl.ast_helpers import * +from ..base_dsl.utils.logger import log +from ..base_dsl import typing as t +from ..base_dsl.typing import Int32, Float32, Boolean, Numeric, get_mlir_types +from . 
import cutlass as cutlass_dsl + +# ============================================================================= +# AST Helpers +# ============================================================================= + + +class LoopUnroll(ir.Attribute): + def __init__(self, **kwargs): + valid_keys = set(["count", "full"]) + def to_mlir_attr(val): + if isinstance(val, bool): + return "true" if val else "false" + elif isinstance(val, int): + return f"{val} : i32" + else: + raise DSLNotImplemented(f"{type(val)} is not supported") + + cfg = {key: to_mlir_attr(kwargs[key]) for key in valid_keys if key in kwargs} + if kwargs.get("count", None) == 1: + cfg["disable"] = "true" + + unroll = "<" + ", ".join(f"{key} = {value}" for key, value in cfg.items()) + ">" + + super().__init__( + ir.Attribute.parse(f"#llvm.loop_annotation") + ) + + +class ScfGenerator: + """ + Encapsulates common scf dialect functionality: pack, unpack, and SCF execution. + """ + + def __init__(self): + pass + + @staticmethod + def fill_none(ir_values, unpacked_values): + i = 0 + for idx, item in enumerate(unpacked_values): + if item is not None: + unpacked_values[idx] = ir_values[i] + i += 1 + + @staticmethod + def _normalize_region_result_to_list(region_result: Any) -> List[Any]: + """ + Convert region_result to a list if it is not already a list + If region_result is a list, return it as is. + If region_result is None, return an empty list. + If region_result is not a list, return a list containing region_result as the only element. 
+ """ + if region_result is None: + region_result_list = [] + elif not isinstance(region_result, list): + region_result_list = [region_result] + else: + region_result_list = region_result + return region_result_list + + @staticmethod + def check_region_result(region_values, ir_values): + for i, (expected_value, actual_value) in enumerate( + zip(ir_values, region_values) + ): + expected_value_type = get_mlir_types(expected_value) + actual_value_type = get_mlir_types(actual_value) + if expected_value_type != actual_value_type: + return False, i, expected_value_type, actual_value_type + return True, -1, None, None + + def scf_execute_dynamic( + self, + op_type_name: str, + used_args: List[Any], + mix_iter_args: List[Any], + mix_iter_arg_names: List[str], + create_op_func: Callable[ + [List[ir.Value], Dict[int, Tuple[int, int]], List[Any]], ir.Operation + ], + region_builders: List[ + Callable[ + [ + "ir.Operation", + List["ir.Value"], # block_args + List[Any], # used_args + List["ir.Value"], # dyn_yield_ops + Dict[int, Tuple[int, int]], + List[Any], + ], + Any, + ] + ], + # block_term_op_builder[region_builder] = scf_op_builder + # e.g. 
scf.ConditionOp for while loop + block_term_op_builder: Dict[Callable, Callable] = {}, + ) -> Any: + # 1) Unpack + ir_values, dyn_unpacked_values, dyn_indices, dyn_class_types = ( + cutlass_dsl.unpack_to_irvalue(mix_iter_args, op_type_name) + ) + # 2) Create the SCF op + op = create_op_func(ir_values, dyn_indices, dyn_class_types) + log().debug("Generated scf.%s \n[%s]", op_type_name, op) + + # 3) Build the regions + for i, builder in enumerate(region_builders): + region = op.regions[i] + block = region.blocks[0] + with ir.InsertionPoint(block): + block_args = list(block.arguments) + region_result = builder( + op, + block_args, + used_args, + dyn_unpacked_values, + dyn_indices, + dyn_class_types, + ) + + # Use custom terminator if provided for this builder, otherwise use default YieldOp + if builder in block_term_op_builder: + # Use the provided terminator generator + block_term_op_builder[builder](region_result) + else: + # Normalize region_result + region_result_list = ScfGenerator._normalize_region_result_to_list( + region_result + ) + # Default behavior - generate YieldOp + region_values, unpacked_values, _, _ = ( + cutlass_dsl.unpack_to_irvalue(region_result_list, op_type_name) + ) + + is_match, mismatch_idx, expected_type, actual_type = ( + ScfGenerator.check_region_result(region_values, ir_values) + ) + + if not is_match: + # From unpacked index, we need to find the original index + original_idx = -1 + for unpacked_idx, (original_idx, length) in dyn_indices.items(): + if ( + mismatch_idx >= original_idx + and mismatch_idx < original_idx + length + ): + original_idx = unpacked_idx + break + raise DSLRuntimeError( + f"`{op_type_name}` expects {expected_type} type for varible `{mix_iter_arg_names[original_idx]}`, but got {actual_type}.", + suggestion=f"Please make sure `{mix_iter_arg_names[original_idx]}` type is not changed inside of `{op_type_name}`.", + ) + scf.YieldOp(region_values) + + log().debug("Completed scf.%s \n[%s]", op_type_name, op) + 
ScfGenerator.fill_none(op.results, unpacked_values) + + # 4) Pack final results + final_results = cutlass_dsl.pack_from_irvalue( + unpacked_values, dyn_indices, dyn_class_types + ) + + # 5) Return in a nice pattern + if not final_results: + return + if len(final_results) == 1: + return final_results[0] + return final_results + + +def _loop_execute_range_dynamic( + func: Callable, + start: Any, + stop: Any, + step: Any, + used_args: List[Any] = [], + mix_iter_args: List[Any] = [], + mix_iter_arg_names: List[str] = [], + unroll: int = -1, + unroll_full: bool = False, +): + """ + Example: build an scf.for with optional unroll, using our universal helper. + """ + scf_gen = ScfGenerator() + + def create_for_op( + dyn_yield_ops: List[ir.Value], + dyn_indices: Dict[int, Tuple[int, int]], + dyn_class_types: List[Any], + ): + for d in dyn_yield_ops: + if not isinstance(d, ir.Value): + raise DSLRuntimeError( + f"Invalid dyn_yield_ops: {dyn_yield_ops} \n\tExpected ir.Value, got {type(d)}" + ) + + # Convert Python ints or values to IR constants if needed + start_ = t.as_numeric(start) + stop_ = t.as_numeric(stop) + step_ = t.as_numeric(step) + assert start_ is not t.Int32, "Start is required for scf.for" + assert stop_ is not t.Int32, "Stop is required for scf.for" + assert step_ is not t.Int32, "Step is required for scf.for" + start_ = start_.ir_value() + stop_ = stop_.ir_value() + step_ = step_.ir_value() + + # Possibly attach unroll attributes + unroll_attr = None + if unroll_full: + unroll_attr = LoopUnroll(full=True) + elif unroll != -1: + unroll_attr = LoopUnroll(count=unroll) + log().debug("Unroll attribute: %s", unroll_attr) + + log().debug( + "Creating scf.ForOp \n\t\tstart=%s: type : %s\n\t\tstop=%s: type : %s\n\t\tstep=%s: type : %s", + start_, + type(start_), + stop_, + type(stop_), + step_, + type(step_), + ) + # Create scf.ForOp, passing iteration args if any + try: + if not dyn_yield_ops: + for_op = scf.ForOp(start_, stop_, step_) + else: + for_op = 
scf.ForOp(start_, stop_, step_, list(dyn_yield_ops)) + except Exception as e: + yield_ops = "\n".join( + f"\t\t{i} => {d} : type : {type(d)}" + for i, d in enumerate(dyn_yield_ops) + ) + raise DSLRuntimeError( + f"Failed to create scf.ForOp \n\t\tstart={start_}: type : {type(start_)}" + f"\n\t\tstop={stop_}: type : {type(stop_)}\n\t\tstep={step_}: type : {type(step_)}" + f", \n\tdyn_yield_ops:\n{yield_ops}" + ) from e + + if unroll_attr is not None: + for_op.attributes["loop_annotation"] = unroll_attr + + return for_op + + def for_body_builder( + op, block_args, used_args, dyn_yield_ops, dyn_indices, dyn_class_types + ): + # Insert induction variable at the beginning + dyn_yield_ops.insert(0, block_args[0]) + ScfGenerator.fill_none(block_args, dyn_yield_ops) + block_args = dyn_yield_ops + # scf.ForOp block_args are typically [induction_var, iter_args...] + # But MLIR also gives you op.induction_variable + iv = t.as_numeric(op.induction_variable) + log().debug( + "For body builder: %s block_args: %s used_args: %s", + iv, + block_args, + used_args, + ) + if len(block_args) <= 1: + # No iteration arguments, or only the induction var + func(iv, *used_args) + return [] # yield nothing + else: + # block_args[1:] are iteration variables + func_args = [*used_args] + func_args.extend( + cutlass_dsl.pack_from_irvalue( + block_args[1:], dyn_indices, dyn_class_types + ) + ) + updated_func_args = func(iv, *func_args) + return updated_func_args + + # Now call the universal SCF executor with a single region builder + return scf_gen.scf_execute_dynamic( + op_type_name="for", + used_args=used_args, + mix_iter_args=mix_iter_args, + mix_iter_arg_names=mix_iter_arg_names, + create_op_func=create_for_op, + region_builders=[for_body_builder], + ) + + +def _if_execute_dynamic( + pred: "ir.Value", + then_block: Callable, + else_block: Callable = None, + used_args: List[Any] = [], + mix_yield_args: List[Any] = [], + mix_yield_arg_names: List[str] = [], + if_constexpr=None, # ignoring for 
brevity +): + """ + Build an scf.if with optional else, using our universal helper. + """ + scf_gen = ScfGenerator() + + def create_if_op( + dyn_yield_ops: List[ir.Value], + dyn_indices: Dict[int, Tuple[int, int]], + dyn_class_types: List[Any], + ): + # Assume final result types match the dynamic yields + result_types = [arg.type for arg in dyn_yield_ops] + + pred_ = t.as_numeric(pred) + + if not isinstance(pred_, Boolean): + # Convert to Boolean through comparison + pred_ = pred_ == True + + try: + if_op = scf.IfOp( + pred_.ir_value(), + hasElse=(else_block is not None), + results_=result_types, + ) + except Exception as e: + raise DSLRuntimeError( + f"Failed to create scf.IfOp \n\t\tpred={pred_}: type : {type(pred_)}" + ) from e + return if_op + + def then_builder( + if_op, block_args, used_args, dyn_yield_ops, dyn_indices, dyn_class_types + ): + flat_args = [*used_args] + flat_args.extend( + cutlass_dsl.pack_from_irvalue(dyn_yield_ops, dyn_indices, dyn_class_types) + ) + return then_block(*flat_args) + + region_builders = [then_builder] + + if else_block is not None: + + def else_builder( + if_op, block_args, used_args, dyn_yield_ops, dyn_indices, dyn_class_types + ): + flat_args = [*used_args] + flat_args.extend( + cutlass_dsl.pack_from_irvalue( + dyn_yield_ops, dyn_indices, dyn_class_types + ) + ) + return else_block(*flat_args) + + region_builders.append(else_builder) + + return scf_gen.scf_execute_dynamic( + op_type_name="if", + used_args=used_args, + mix_iter_args=mix_yield_args, + mix_iter_arg_names=mix_yield_arg_names, + create_op_func=create_if_op, + region_builders=region_builders, + ) + + +def _while_execute_dynamic( + while_before_block: Callable, + while_after_block: Callable = None, + used_args=[], + yield_args=[], + yield_arg_names=[], +): + """ + Create and return an SCF WhileOp for dynamic loops. + Generate the dynamic loop body using SCF WhileOp. 
+ + Args: + while_before_block: Function that returns (condition, updated_values) + while_after_block: Function that returns updated values + used_args: Additional arguments used in the loop body + yield_args: Values that are updated in the loop + + See create_while_function in ast_preprocessor.py for details on the input structure. + """ + log().debug("_while_execute_dynamic") + while_op_type_name = "while" + scf_gen = ScfGenerator() + + def create_while_op( + dyn_yield_ops: List[ir.Value], + dyn_indices: Dict[int, Tuple[int, int]], + dyn_class_types: List[Any], + ): + # Create the while operation with the types from yield_args + result_types = [arg.type for arg in dyn_yield_ops] + try: + while_op = scf.WhileOp(result_types, dyn_yield_ops) + while_op.before.blocks.append(*result_types) + while_op.after.blocks.append(*result_types) + log().debug("[%s]", while_op) + return while_op + except Exception as e: + yield_ops = "\n".join( + f"\t\t{i} => {d} : type : {type(d)}" + for i, d in enumerate(dyn_yield_ops) + ) + raise DSLRuntimeError( + f"Failed to create scf.WhileOp with yield_ops:\n{yield_ops}" + ) from e + + def before_block_builder( + op, block_args, used_args, dyn_yield_ops, dyn_indices, dyn_class_types + ): + # Build the before (condition) block + ScfGenerator.fill_none(block_args, dyn_yield_ops) + block_args = dyn_yield_ops + flat_args = [*used_args] + flat_args.extend( + cutlass_dsl.pack_from_irvalue(block_args, dyn_indices, dyn_class_types) + ) + + log().debug("before block args: %s", flat_args) + + cond, before_results = while_before_block(*flat_args) + + if not isinstance(before_results, (list, ir.OpResultList)): + before_results = [before_results] + + log().debug("cond [%s]", cond) + log().debug( + "before_results [%s]", + before_results, + ) + + return cond, before_results + + def before_block_terminator(cond_and_results): + # Generate a condition op instead of yield op + cond = cond_and_results[0] + before_result_list = 
ScfGenerator._normalize_region_result_to_list( + cond_and_results[1] + ) + ir_cond_list, _, _, _ = cutlass_dsl.unpack_to_irvalue( + [cond], while_op_type_name + ) + ir_cond = ir_cond_list[0] + ir_results_list, _, _, _ = cutlass_dsl.unpack_to_irvalue( + before_result_list, while_op_type_name + ) + log().debug( + "creating scf.ConditionOp with [%s], [%s]", + ir_cond, + ir_results_list, + ) + scf.ConditionOp(ir_cond, ir_results_list) + + def after_block_builder( + op, block_args, used_args, dyn_yield_ops, dyn_indices, dyn_class_types + ): + # Build the after (body) block + ScfGenerator.fill_none(block_args, dyn_yield_ops) + block_args = dyn_yield_ops + flat_args = [*used_args] + flat_args.extend( + cutlass_dsl.pack_from_irvalue(block_args, dyn_indices, dyn_class_types) + ) + + log().debug("after block args: %s", flat_args) + + after_results = while_after_block(*flat_args) + + if not isinstance(after_results, (list, ir.OpResultList)): + after_results = [after_results] + + log().debug( + "after_results [%s]", + after_results, + ) + + return after_results + + # Call the universal SCF executor with two region builders + return scf_gen.scf_execute_dynamic( + op_type_name=while_op_type_name, + used_args=used_args, + mix_iter_args=yield_args, + mix_iter_arg_names=yield_arg_names, + create_op_func=create_while_op, + region_builders=[before_block_builder, after_block_builder], + block_term_op_builder={ + before_block_builder: before_block_terminator + }, # Only customize the before block + ) diff --git a/python/CuTeDSL/requirements.txt b/python/CuTeDSL/requirements.txt new file mode 100644 index 00000000..78ff7a28 --- /dev/null +++ b/python/CuTeDSL/requirements.txt @@ -0,0 +1,3 @@ +# Use `pip install -r requirements.txt` with the present file to install a +# wheel consistent with the present state of the github repository +nvidia-cutlass-dsl=4.0.0.dev1 diff --git a/python/cutlass/__init__.py b/python/cutlass/__init__.py index 6cbc9eef..bc2a98d9 100644 --- 
a/python/cutlass/__init__.py +++ b/python/cutlass/__init__.py @@ -133,7 +133,7 @@ def get_option_registry(): this._option_registry = OptionRegistry(device_cc()) return this._option_registry -this.__version__ = '3.9.2' +this.__version__ = '4.0.0' from cutlass.backend import create_memory_pool from cutlass.emit.pytorch import pytorch diff --git a/python/cutlass/op/conv.py b/python/cutlass/op/conv.py index 0e8366ab..3639d477 100644 --- a/python/cutlass/op/conv.py +++ b/python/cutlass/op/conv.py @@ -111,6 +111,7 @@ args.sync() """ + from __future__ import annotations from typing import Optional from cutlass.utils.lazy_import import lazy_import diff --git a/python/cutlass/utils/lazy_import.py b/python/cutlass/utils/lazy_import.py index 28ba6546..16f6a185 100644 --- a/python/cutlass/utils/lazy_import.py +++ b/python/cutlass/utils/lazy_import.py @@ -1,3 +1,34 @@ +################################################################################################# +# +# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################################# import importlib from typing import Any @@ -8,4 +39,3 @@ def lazy_import(mod_name: str) -> Any: return getattr(module, name) return Lazy() - diff --git a/python/cutlass/utils/profiler.py b/python/cutlass/utils/profiler.py index 155c1d35..5733f3ba 100644 --- a/python/cutlass/utils/profiler.py +++ b/python/cutlass/utils/profiler.py @@ -193,3 +193,4 @@ class CUDAEventProfiler: flops_ += m * n * batch_count * 2 return flops_ + diff --git a/python/cutlass_library/emit_kernel_listing.py b/python/cutlass_library/emit_kernel_listing.py index a6eca001..70ba077e 100755 --- a/python/cutlass_library/emit_kernel_listing.py +++ b/python/cutlass_library/emit_kernel_listing.py @@ -75,15 +75,10 @@ audit_csv_runtime_fields = [ ] def hash_cutlass_string(input_string): - # Regex pattern to match instruction shape - instruction_shape_pattern = r"[a-zA-Z]\d+x\d+x\d+" # Matches '_s128x128x64', '_h64x128x16', etc. 
mma_cluster_shape_pattern = r"_\d+x\d+x\d+" # Matches MMA and Cluster shapes (e.g., '_128x128x256', '_0x0x1') - # Remove instruction shape (e.g., '_s128x128x64', '_h64x128x16') - output = re.sub(instruction_shape_pattern, "", input_string) - # Remove MMA and Cluster shapes (e.g., '_128x128x256', '_0x0x1') - output = re.sub(mma_cluster_shape_pattern, "", output) + output = re.sub(mma_cluster_shape_pattern, "", input_string) return output @@ -288,7 +283,7 @@ def emit_gemm_kernel_testlist(manifest, curr_build_dir, arch, mode # TODO: randomize beta values for wider coverage beta_values = [0.5] - is_supported_arch = (arch in ["100a", "101a", "120a"]) + is_supported_arch = (arch in ["100a", "100f", "101a", "101f", "120a", "120f"]) is_runtime_datatype_enabled = mode == "functional_L0" and is_supported_arch @@ -300,23 +295,23 @@ def emit_gemm_kernel_testlist(manifest, curr_build_dir, arch, mode # sm100_mma_data_type_general = [ - 'x16gemm_f16_f16_f16_f16_f16', - 'x16gemm_f16_f16_f16_void_f16', - 'x16gemm_f16_f16_f32_f16_f16', - 'x8tf32gemm_f32_f32_f32_f32_f32', - 'x16bf16gemm_f32_f32_f32_f32_f32', + 'gemm_f16_f16_f16_f16_f16', + 'gemm_f16_f16_f16_void_f16', + 'gemm_f16_f16_f32_f16_f16', + 'tf32gemm_f32_f32_f32_f32_f32', + 'bf16gemm_f32_f32_f32_f32_f32', ] sm100_mma_data_type_runtime_dtype = [ - 'x32gemm_f4_f4_f32_f32_f32', - 'x32gemm_f6_f6_f32_f32_f32', - 'x32gemm_f8_f8_f32_f32_f32', + 'gemm_f4_f4_f32_f32_f32', + 'gemm_f6_f6_f32_f32_f32', + 'gemm_f8_f8_f32_f32_f32', ] sm100_mma_data_type_mergeable = [ - 'x32gemm_e4m3_e4m3_f32_f32_f32',# mask out one instance for verification - 'x32gemm_e2m1_e2m1_f32_f32_f32', - 'x32gemm_e3m2_e3m2_f32_f32_f32', + 'gemm_e4m3_e4m3_f32_f32_f32',# mask out one instance for verification + 'gemm_e2m1_e2m1_f32_f32_f32', + 'gemm_e3m2_e3m2_f32_f32_f32', ] sm100_mma_cluster_size = [ @@ -331,22 +326,15 @@ def emit_gemm_kernel_testlist(manifest, curr_build_dir, arch, mode 'ntn' ] - sm100_mma_instruction_shape = [ - # [0] .1CTA, General - ['64x128', 
'128x128', '128x256'], - # [1] .2CTA, General - ['128x128', '256x128', '256x256'], - ] - # regex list must be in kernel procedural name order - mergeable_sm100_mma_filter_regex_1sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_instruction_shape[0], sm100_mma_data_type_mergeable, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*1sm.*" - mergeable_sm100_mma_filter_regex_2sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_instruction_shape[1], sm100_mma_data_type_mergeable, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*2sm.*" + mergeable_sm100_mma_filter_regex_1sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_data_type_mergeable, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*1sm.*" + mergeable_sm100_mma_filter_regex_2sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_data_type_mergeable, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*2sm.*" - sm100_mma_filter_regex_1sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_instruction_shape[0], sm100_mma_data_type_general, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*1sm.*" - sm100_mma_filter_regex_2sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_instruction_shape[1], sm100_mma_data_type_general, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*2sm.*" + sm100_mma_filter_regex_1sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_data_type_general, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*1sm.*" + sm100_mma_filter_regex_2sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_data_type_general, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*2sm.*" - sm100_mma_filter_regex_1sm_runtime = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_instruction_shape[0], sm100_mma_data_type_runtime_dtype, 
sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*1sm.*" - sm100_mma_filter_regex_2sm_runtime = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_instruction_shape[1], sm100_mma_data_type_runtime_dtype, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*2sm.*" + sm100_mma_filter_regex_1sm_runtime = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_data_type_runtime_dtype, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*1sm.*" + sm100_mma_filter_regex_2sm_runtime = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_data_type_runtime_dtype, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*2sm.*" # # Block Scale Gemm @@ -354,19 +342,19 @@ def emit_gemm_kernel_testlist(manifest, curr_build_dir, arch, mode block_scaled_data_type_base = [ # runtime datatypes - 'x32gemm.*ue8m0xf4_ue8m0xf4_f32_f16_e5m2', - 'x64gemm.*ue8m0xf4_ue8m0xf4_f32_f16_e5m2', - 'x32gemm.*ue8m0xf4_ue8m0xf6_f32_f16_e5m2', - 'x64gemm.*ue8m0xf4_ue8m0xf4_f32_f16_ue8m0xe2m1', - 'x32gemm.*ue8m0xf6_ue8m0xf6_f32_f16_ue8m0xe3m2', + 'gemm.*ue8m0xf4_ue8m0xf4_f32_f16_e5m2', + 'gemm.*ue8m0xf4_ue8m0xf4_f32_f16_e5m2', + 'gemm.*ue8m0xf4_ue8m0xf6_f32_f16_e5m2', + 'gemm.*ue8m0xf4_ue8m0xf4_f32_f16_ue8m0xe2m1', + 'gemm.*ue8m0xf6_ue8m0xf6_f32_f16_ue8m0xe3m2', ] block_scaled_data_type_mergeable = [ - 'x32gemm.*ue8m0xe2m1_ue8m0xe2m1_f32_f16_e5m2', - 'x64gemm.*ue8m0xe2m1_ue8m0xe2m1_f32_f16_e5m2', - 'x32gemm.*ue8m0xe2m1_ue8m0xe2m3_f32_f16_e5m2', - 'x64gemm.*ue8m0xe2m1_ue8m0xe2m1_f32_f16_ue8m0xe2m1', - 'x32gemm.*ue8m0xe2m3_ue8m0xe2m3_f32_f16_ue8m0xe3m2', + 'gemm.*ue8m0xe2m1_ue8m0xe2m1_f32_f16_e5m2', + 'gemm.*ue8m0xe2m1_ue8m0xe2m1_f32_f16_e5m2', + 'gemm.*ue8m0xe2m1_ue8m0xe2m3_f32_f16_e5m2', + 'gemm.*ue8m0xe2m1_ue8m0xe2m1_f32_f16_ue8m0xe2m1', + 'gemm.*ue8m0xe2m3_ue8m0xe2m3_f32_f16_ue8m0xe3m2', ] block_scaled_data_type = block_scaled_data_type_base + block_scaled_data_type_mergeable @@ -377,56 +365,43 @@ def 
emit_gemm_kernel_testlist(manifest, curr_build_dir, arch, mode ] block_scaled_layouts = ['tnt'] - block_scaled_instruction_shape = [ - # .1CTA - ['128x128', '128x192', '128x256'], - # .2CTA - ['256x128', '256x192', '256x256'], - ] # regex list must be in kernel procedural name order - mergeable_block_scaled_filter_regex_1sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_instruction_shape[0], block_scaled_data_type_mergeable, block_scaled_cluster_size, block_scaled_layouts]]) + ").*1sm.*" - mergeable_block_scaled_filter_regex_2sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_instruction_shape[1], block_scaled_data_type_mergeable, block_scaled_cluster_size, block_scaled_layouts]]) + ").*2sm.*" + mergeable_block_scaled_filter_regex_1sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_data_type_mergeable, block_scaled_cluster_size, block_scaled_layouts]]) + ").*1sm.*" + mergeable_block_scaled_filter_regex_2sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_data_type_mergeable, block_scaled_cluster_size, block_scaled_layouts]]) + ").*2sm.*" - block_scaled_filter_regex_1sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_instruction_shape[0], block_scaled_data_type, block_scaled_cluster_size, block_scaled_layouts]]) + ").*1sm.*" - block_scaled_filter_regex_2sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_instruction_shape[1], block_scaled_data_type, block_scaled_cluster_size, block_scaled_layouts]]) + ").*2sm.*" + block_scaled_filter_regex_1sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_data_type, block_scaled_cluster_size, block_scaled_layouts]]) + ").*1sm.*" + block_scaled_filter_regex_2sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_data_type, 
block_scaled_cluster_size, block_scaled_layouts]]) + ").*2sm.*" - if arch == "100a": + if arch == "100a" or arch == "100f": kernel_filter = f"({sm100_mma_filter_regex_1sm})|" \ f"({sm100_mma_filter_regex_2sm})|" \ f"({sm100_mma_filter_regex_1sm_runtime})|" \ f"({sm100_mma_filter_regex_2sm_runtime})|" \ f"({block_scaled_filter_regex_1sm})|" \ f"({block_scaled_filter_regex_2sm})" - elif arch == "101a": + elif arch == "101a" or arch == "101f": kernel_filter = f"({sm100_mma_filter_regex_1sm})|" \ f"({sm100_mma_filter_regex_2sm})|" \ f"({sm100_mma_filter_regex_1sm_runtime})|" \ f"({sm100_mma_filter_regex_2sm_runtime})|" \ f"({block_scaled_filter_regex_1sm})|" \ f"({block_scaled_filter_regex_2sm})" - elif arch == "120a": + elif arch == "120a" or arch == "120f": # blockscaled sm120_mma kernels blockscaled_sm120_mma_kernel_cta_tiles = [ [ '128x128' ] ] - # sm120 MMA instruction shapes - blockscaled_sm120_mma_instruction_shapes = [ - [ 's16x8x64gemm', - 's16x8x32gemm' - ] - ] - # Restrict to two layouts to reduce L0 build and test time. 
blockscaled_sm120_mma_layouts = [ 'tn' ] - filter_regex_blockscaled_sm120_mma = "cutlass3x_sm120_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [blockscaled_sm120_mma_instruction_shapes[0], blockscaled_sm120_mma_kernel_cta_tiles[0], blockscaled_sm120_mma_layouts]]) + ").*" + filter_regex_blockscaled_sm120_mma = "cutlass3x_sm120_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [blockscaled_sm120_mma_kernel_cta_tiles[0], blockscaled_sm120_mma_layouts]]) + ").*" problem_waves = [0.5, 1.25, 2.5] kernel_filter = f"({filter_regex_blockscaled_sm120_mma})" else: - error_message = "unsupported arch, only support sm100a, sm101a, sm120a" + error_message = "unsupported arch, only support sm100a, sm100f, sm101a, sm101f, sm120a, sm120f" raise Exception(error_message) # Statically encoded kernels are still added to generated_kernels @@ -445,14 +420,8 @@ def emit_gemm_kernel_testlist(manifest, curr_build_dir, arch, mode ] # Restrict to two layouts to reduce L1 build and test time. sm100_mma_layouts = ['tnt', 'ntn'] - sm100_mma_instruction_shape = [ - # .1CTA - ['64x128', '128x128', '128x256'], - # .2CTA - ['128x128', '256x128', '256x256'] - ] - sm100_mma_filter_regex_1sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_instruction_shape[0], sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*1sm.*" - sm100_mma_filter_regex_2sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_instruction_shape[1], sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*2sm.*" + sm100_mma_filter_regex_1sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*1sm.*" + sm100_mma_filter_regex_2sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*2sm.*" block_scaled_data_type = [ 'ue8m0xe2m1_ue8m0xe2m1_f32_f16_e5m2', 'ue8m0xe2m1_ue8m0xe2m3_f32_f16_e5m2', @@ -463,15 +432,10 @@ def 
emit_gemm_kernel_testlist(manifest, curr_build_dir, arch, mode block_scaled_cluster_size = ['4x4x1', '2x1x1', '0x0x1'] block_scaled_layouts = ['tnt'] - block_scaled_instruction_shape = [ - # .1CTA - ['128x128', '128x192', '128x256'], - # .2CTA - ['256x128', '256x192', '256x256'], - ] + # regex list must be in kernel procedural name order - block_scaled_filter_regex_1sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_instruction_shape[0], block_scaled_data_type, block_scaled_cluster_size, block_scaled_layouts]]) + ").*1sm.*" - block_scaled_filter_regex_2sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_instruction_shape[1], block_scaled_data_type, block_scaled_cluster_size, block_scaled_layouts]]) + ").*2sm.*" + block_scaled_filter_regex_1sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_data_type, block_scaled_cluster_size, block_scaled_layouts]]) + ").*1sm.*" + block_scaled_filter_regex_2sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_data_type, block_scaled_cluster_size, block_scaled_layouts]]) + ").*2sm.*" filter_regex_sm100_mma = f"({sm100_mma_filter_regex_1sm})|" \ f"({sm100_mma_filter_regex_2sm})|" \ f"({block_scaled_filter_regex_1sm})|" \ diff --git a/python/cutlass_library/gemm_operation.py b/python/cutlass_library/gemm_operation.py index 54acee63..f85e160f 100644 --- a/python/cutlass_library/gemm_operation.py +++ b/python/cutlass_library/gemm_operation.py @@ -183,10 +183,7 @@ class GemmOperation: math_op = self.tile_description.math_instruction.math_operation math_op_string = math_operations_map[math_op] if math_op in math_operations_map.keys() else '' - if self.is_3x: - inst_shape = "{0}x{1}x{2}".format(*tuple(self.tile_description.math_instruction.instruction_shape)) - else: - inst_shape = "{0}{1}{2}".format(*tuple(self.tile_description.math_instruction.instruction_shape)) + inst_shape = 
"{0}{1}{2}".format(*tuple(self.tile_description.math_instruction.instruction_shape)) if not self.is_3x else "" inst_shape += math_op_string @@ -194,7 +191,9 @@ class GemmOperation: self.tile_description.math_instruction.element_a != self.tile_description.math_instruction.element_accumulator: intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a] - return "%s%s%s%s" % (self.short_math_name(), inst_shape, intermediate_type, GemmKindNames[self.gemm_kind]) + short_math_name = self.short_math_name() if not self.is_3x else "" + + return "%s%s%s%s" % (short_math_name, inst_shape, intermediate_type, GemmKindNames[self.gemm_kind]) # Generates a string representing the MMA instruction. def extended_name(self): @@ -337,18 +336,36 @@ class GemmOperation: def opcode_class_name(self): return OpcodeClassNames[self.tile_description.math_instruction.opcode_class] + def get_collective_tile_shape(self): + """ + Get the tile shape passed to the collective builder. + On Blackwell, this is different than the operation.tile_description.tile_shape. + """ + is_sm100_kernel = (self.arch == 100) + if not is_sm100_kernel: + return self.tile_description.tile_shape + + opcode_class_main = self.tile_description.math_instruction.opcode_class + instruction_shape = self.tile_description.math_instruction.instruction_shape + tile_shape_m, tile_shape_n, tile_shape_k = self.tile_description.tile_shape + if opcode_class_main in [OpcodeClass.TensorOp, OpcodeClass.BlockScaledTensorOp, OpcodeClass.SparseTensorOp]: + tile_shape_m = instruction_shape[0] + tile_shape_n = instruction_shape[1] + return (tile_shape_m, tile_shape_n, tile_shape_k) + # Generates the full kernel function name def procedural_name(self): ''' The full procedural name indicates architecture, extended name, tile size, and layout. 
''' opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class] if self.arch >= 90: kernel_name_template = "cutlass{p}_sm{ar}_{op}_{ex}{ct}{cs}_{l}_{s}_align{al}{t}{k}{e}" + tile_shape = self.get_collective_tile_shape() return kernel_name_template.format( p = self.prefix, ar = self.arch, op = opcode_class_name, ex = self.extended_name_3x(), - ct = '_' + 'x'.join([str(i) for i in self.tile_description.tile_shape]) if self.tile_description.tile_shape[0] > 0 else "", + ct = '_' + 'x'.join([str(i) for i in tile_shape]) if tile_shape[0] > 0 else "", cs = '_' + 'x'.join([str(i) for i in self.tile_description.cluster_shape]), l = self.tile_description.stages, s = self.layout_name_3x(), @@ -920,28 +937,8 @@ ${compile_guard_end} instruction_shape = operation.tile_description.math_instruction.instruction_shape cluster_m = operation.tile_description.cluster_shape[0] cluster_n = operation.tile_description.cluster_shape[1] - - tile_shape_m, tile_shape_n, tile_shape_k = tile_shape - - # account for static/dynamic cluster shapes - cta_m = tile_shape[0] // cluster_m if cluster_m > 0 else tile_shape[0] cta_n = tile_shape[1] // cluster_n if cluster_n > 0 else tile_shape[1] - - - # Shape passed to epilogue builder - is_sm100_kernel = (operation.arch == 100) - if is_sm100_kernel: - cta_m_per_mma_instruction = 2 if "2sm" in operation.procedural_name() else 1 - if cluster_m <= 0: - cta_m = cta_m // cta_m_per_mma_instruction - - if opcode_class_main in [OpcodeClass.TensorOp - , OpcodeClass.BlockScaledTensorOp - , OpcodeClass.SparseTensorOp - ]: - tile_shape_m = instruction_shape[0] - tile_shape_n = instruction_shape[1] - + tile_shape_m, tile_shape_n, tile_shape_k = operation.get_collective_tile_shape() # stage count set to zero indicates builder automatic stage selection if operation.tile_description.stages > 0: diff --git a/python/cutlass_library/generator.py b/python/cutlass_library/generator.py index 20f8e828..58b605ad 100644 --- 
a/python/cutlass_library/generator.py +++ b/python/cutlass_library/generator.py @@ -1003,14 +1003,11 @@ class ConvOperation3x: math_op = self.tile_description.math_instruction.math_operation math_op_string = math_operations_map[math_op] if math_op in math_operations_map.keys() else '' - inst_shape = "{0}x{1}x{2}".format(*tuple(self.tile_description.math_instruction.instruction_shape)) - inst_shape += math_op_string - if self.tile_description.math_instruction.element_a != self.A.element and \ self.tile_description.math_instruction.element_a != self.tile_description.math_instruction.element_accumulator: intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a] - return "%s%s%s%s" % (self.short_math_name(), inst_shape, intermediate_type, ConvKindNames[self.conv_kind]) + return "%s%s%s" % (math_op_string, intermediate_type, ConvKindNames[self.conv_kind]) def extended_name(self): '''Generates a string representing the MMA atom. Assumes accumulator type is C type.''' @@ -5997,8 +5994,8 @@ def GenerateSM90_TensorOp_mixed_dtype_WGMMA_gemm(manifest, cuda_version): math_instructions = generate_mixed_dtype_math_instructions_sm90(instantiation_level, valid_types_for_a_b_acc) - valid_types_for_d = [DataType.f32] - valid_types_for_c = [DataType.f32] + valid_types_for_d = [DataType.f32, DataType.bf16, DataType.f16, DataType.e4m3, DataType.e5m2] + valid_types_for_c = copy.deepcopy(valid_types_for_d) tile_descriptions = generate_tile_descriptions_sm90( math_instructions=math_instructions, @@ -6009,6 +6006,12 @@ def GenerateSM90_TensorOp_mixed_dtype_WGMMA_gemm(manifest, cuda_version): math_inst = tile_desc.math_instruction data_types = [] + # Limit C/D types to avoid a giant number of instantiations. + # A typical use case for mixed dtype in DL is weight quantization (tensor A), + # therefore we can limit the output type to that of activation (tensor B). 
+ valid_types_for_c = [math_inst.element_b] + valid_types_for_d = [math_inst.element_b] + for c_type, d_type in product(valid_types_for_c, valid_types_for_d): data_types.append( generate_data_types_from_math_instruction( @@ -6791,6 +6794,11 @@ def GenerateSM100_TensorOp_32b_UMMA_gemm(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [[1,2,1], [1,1,1], [1,4,1] + , DynamicClusterShape + ] + tile_schedulers = [ TileSchedulerType.Default ] @@ -6838,6 +6846,11 @@ def GenerateSM100_TensorOp_32b_UMMA_gemm(manifest, cuda_version): cluster_shapes_2sm = [[2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,2,1], [4,4,1] , DynamicClusterShape ] + + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [[2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,2,1] + , DynamicClusterShape + ] for math_inst in math_instructions_2sm: tile_descriptions = [] @@ -6937,6 +6950,11 @@ def GenerateSM100_TensorOp_16b_UMMA_gemm(manifest, cuda_version, gemm_kind=GemmK , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [[1,2,1], [1,1,1], [1,4,1] + , DynamicClusterShape + ] + tile_schedulers = [ TileSchedulerType.Default ] @@ -7090,6 +7108,11 @@ def GenerateSM100_TensorOp_16b_UMMA_gemm(manifest, cuda_version, gemm_kind=GemmK , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [[2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,2,1] + , DynamicClusterShape + ] + for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in cluster_shapes_2sm: @@ -7247,6 +7270,11 @@ def GenerateSM100_TensorOp_fp8_UMMA_gemm(manifest, cuda_version, gemm_kind=GemmK , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [[1,2,1], [2,1,1], [1,1,1], [1,4,1] + , DynamicClusterShape + ] + tile_schedulers = [ TileSchedulerType.Default, ] @@ -7456,6 +7484,11 @@ def GenerateSM100_TensorOp_fp8_UMMA_gemm(manifest, cuda_version, gemm_kind=GemmK , 
DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [[2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,2,1] + , DynamicClusterShape + ] + for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in cluster_shapes_2sm: @@ -7916,6 +7949,13 @@ def GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [ + [2,1,1], + [1,1,1] + , DynamicClusterShape + ] + # 1xSM MMA kernels for math_inst in math_instructions_1sm: tile_descriptions = [] @@ -7985,6 +8025,12 @@ def GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [ + [2,1,1] + , DynamicClusterShape + ] + for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in cluster_shapes_2sm: @@ -8138,6 +8184,13 @@ def GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm_with_block_scaled(manifest, cud , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [ + [1,1,1], + [2,1,1] + , DynamicClusterShape + ] + # 1xSM MMA kernels for math_inst in math_instructions_1sm: tile_descriptions = [] @@ -8211,6 +8264,13 @@ def GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm_with_block_scaled(manifest, cud , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [ + [2,1,1], + [4,1,1] + , DynamicClusterShape + ] + for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in cluster_shapes_2sm: @@ -8417,6 +8477,13 @@ def GenerateSM100_TensorOp_fp4_UMMA_gemm_with_block_scaled(manifest, cuda_versio , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [ + [1,1,1], + [2,1,1] + , DynamicClusterShape + ] + # 1xSM MMA kernels for math_inst in math_instructions_1sm: tile_descriptions = [] @@ -8537,6 +8604,13 @@ def 
GenerateSM100_TensorOp_fp4_UMMA_gemm_with_block_scaled(manifest, cuda_versio , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [ + [2,1,1], + [4,1,1] + , DynamicClusterShape + ] + for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in cluster_shapes_2sm: @@ -8689,6 +8763,11 @@ def GenerateSM100_TensorOp_int8_UMMA_gemm(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [[1,2,1], [2,1,1], [1,1,1], [1,4,1] + , DynamicClusterShape + ] + tile_schedulers = [ TileSchedulerType.Default, ] @@ -8788,6 +8867,11 @@ def GenerateSM100_TensorOp_int8_UMMA_gemm(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [[2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,2,1] + , DynamicClusterShape + ] + for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in cluster_shapes_2sm: @@ -8925,6 +9009,9 @@ def GenerateSM100_SparseTensorOp_32b_UMMA_gemm(manifest, cuda_version): for math_inst in math_instructions_1sm: tile_descriptions = [] for cluster_shape in sm100_cluster_shape_1sm: + if 101 in manifest.compute_capabilities : + if cluster_shape == [4,4,1] : + continue multiplier_1sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else cluster_shape tile_descriptions.append( TileDescription([ @@ -8953,6 +9040,9 @@ def GenerateSM100_SparseTensorOp_32b_UMMA_gemm(manifest, cuda_version): for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in sm100_cluster_shape_2sm: + if 101 in manifest.compute_capabilities : + if cluster_shape == [4,4,1] : + continue multiplier_2sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else (cluster_shape[0] // 2, cluster_shape[1], cluster_shape[2]) tile_descriptions.append( TileDescription([ @@ -9044,6 +9134,9 @@ def GenerateSM100_SparseTensorOp_16b_UMMA_gemm(manifest, cuda_version): for math_inst in 
math_instructions_1sm: tile_descriptions = [] for cluster_shape in sm100_cluster_shape_1sm: + if 101 in manifest.compute_capabilities : + if cluster_shape == [4,4,1] : + continue multiplier_1sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else cluster_shape tile_descriptions.append( TileDescription([ @@ -9072,6 +9165,9 @@ def GenerateSM100_SparseTensorOp_16b_UMMA_gemm(manifest, cuda_version): for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in sm100_cluster_shape_2sm: + if 101 in manifest.compute_capabilities : + if cluster_shape == [4,4,1] : + continue multiplier_2sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else (cluster_shape[0] // 2, cluster_shape[1], cluster_shape[2]) tile_descriptions.append( TileDescription([ @@ -9163,6 +9259,9 @@ def GenerateSM100_SparseTensorOp_int8_UMMA_gemm(manifest, cuda_version): for math_inst in math_instructions_1sm: tile_descriptions = [] for cluster_shape in sm100_cluster_shape_1sm: + if 101 in manifest.compute_capabilities : + if cluster_shape == [4,4,1] : + continue multiplier_1sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else cluster_shape tile_descriptions.append( TileDescription([ @@ -9191,6 +9290,9 @@ def GenerateSM100_SparseTensorOp_int8_UMMA_gemm(manifest, cuda_version): for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in sm100_cluster_shape_2sm: + if 101 in manifest.compute_capabilities : + if cluster_shape == [4,4,1] : + continue multiplier_2sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else (cluster_shape[0] // 2, cluster_shape[1], cluster_shape[2]) tile_descriptions.append( TileDescription([ @@ -9287,6 +9389,9 @@ def GenerateSM100_SparseTensorOp_fp8_UMMA_gemm(manifest, cuda_version): for math_inst in math_instructions_1sm: tile_descriptions = [] for cluster_shape in sm100_cluster_shape_1sm: + if 101 in manifest.compute_capabilities : + if cluster_shape == [4,4,1] : + continue multiplier_1sm = (1, 1, 1) if cluster_shape == 
DynamicClusterShape else cluster_shape tile_descriptions.append( TileDescription([ @@ -9319,6 +9424,9 @@ def GenerateSM100_SparseTensorOp_fp8_UMMA_gemm(manifest, cuda_version): for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in sm100_cluster_shape_2sm: + if 101 in manifest.compute_capabilities : + if cluster_shape == [4,4,1] : + continue multiplier_2sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else (cluster_shape[0] // 2, cluster_shape[1], cluster_shape[2]) tile_descriptions.append( TileDescription([ @@ -9417,6 +9525,9 @@ def GenerateSM100_SparseTensorOp_mixed_8bits_UMMA_gemm(manifest, cuda_version): for math_inst in math_instructions_1sm: tile_descriptions = [] for cluster_shape in sm100_cluster_shape_1sm: + if 101 in manifest.compute_capabilities : + if cluster_shape == [4,4,1] : + continue multiplier_1sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else cluster_shape tile_descriptions.append( TileDescription([ @@ -9476,6 +9587,9 @@ def GenerateSM100_SparseTensorOp_mixed_8bits_UMMA_gemm(manifest, cuda_version): for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in sm100_cluster_shape_2sm: + if 101 in manifest.compute_capabilities : + if cluster_shape == [4,4,1] : + continue multiplier_2sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else (cluster_shape[0] // 2, cluster_shape[1], cluster_shape[2]) tile_descriptions.append( TileDescription([ @@ -9578,6 +9692,12 @@ def GenerateSM100_TensorOp_32b_UMMA_gemm_stream_k(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [ + [1,2,1], [1,1,1], [1,4,1] + , DynamicClusterShape + ] + tile_schedulers = [ TileSchedulerType.StreamK, ] @@ -9612,6 +9732,12 @@ def GenerateSM100_TensorOp_32b_UMMA_gemm_stream_k(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [ + [2,1,1], [2,2,1], [2,4,1], [4,1,1] + , 
DynamicClusterShape + ] + for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in cluster_shapes_2sm: @@ -9658,6 +9784,12 @@ def GenerateSM100_TensorOp_16b_UMMA_gemm_stream_k(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [ + [1,2,1], [1,1,1] + , DynamicClusterShape + ] + tile_schedulers = [ TileSchedulerType.StreamK ] @@ -9726,6 +9858,12 @@ def GenerateSM100_TensorOp_16b_UMMA_gemm_stream_k(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [ + [2,1,1], [2,2,1], [2,4,1], [4,1,1] + , DynamicClusterShape + ] + for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in cluster_shapes_2sm: @@ -9809,6 +9947,12 @@ def GenerateSM100_TensorOp_fp8_UMMA_gemm_stream_k(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [ + [1,2,1], [2,1,1], [1,1,1] + , DynamicClusterShape + ] + tile_schedulers = [ TileSchedulerType.StreamK, ] @@ -9861,6 +10005,12 @@ def GenerateSM100_TensorOp_fp8_UMMA_gemm_stream_k(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [ + [2,1,1], [2,2,1], [2,4,1], [4,1,1] + , DynamicClusterShape + ] + for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in cluster_shapes_2sm: @@ -9960,6 +10110,9 @@ def GenerateSM100_TensorOp_16b_UMMA_conv3x(manifest, cuda_version, cluster_shapes_1sm = [[1,1,1], [1,2,1], [1,4,1],[4,4,1]] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [[1,1,1], [1,2,1], [1,4,1]] + # tile_descriptions is a 2-level list. # Each inner list is for each cluster shape. 
for math_inst, output_type in math_instructions_w_output_1sm: @@ -10023,6 +10176,8 @@ def GenerateSM100_TensorOp_16b_UMMA_conv3x(manifest, cuda_version, data_types_and_instruction_shapes_2sm) cluster_shapes_2sm = [[2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,2,1], [4,4,1]] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [[2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,2,1]] for math_inst, output_type in math_instructions_w_output_2sm: tile_descriptions = [] @@ -10103,6 +10258,8 @@ def GenerateSM100_TensorOp_fp8_UMMA_conv3x(manifest, cuda_version, data_types_and_instruction_shapes_1sm) cluster_shapes_1sm = [[1,1,1], [1,2,1], [1,4,1],[4,4,1]] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [[1,1,1], [1,2,1], [1,4,1]] for math_inst, output_type in math_instructions_w_output_1sm: tile_descriptions = [] @@ -10166,6 +10323,8 @@ def GenerateSM100_TensorOp_fp8_UMMA_conv3x(manifest, cuda_version, data_types_and_instruction_shapes_2sm) cluster_shapes_2sm = [[2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,2,1], [4,4,1]] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [[2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,2,1]] for math_inst, output_type in math_instructions_w_output_2sm: tile_descriptions = [] @@ -10629,6 +10788,8 @@ def GenerateSM100(manifest, cuda_version): # # Dense Gemm # + architectures = manifest.args.architectures.split(';') if len(args.architectures) else ['50',] + GenerateSM100_TensorOp_16b_UMMA_gemm(manifest, cuda_version) GenerateSM100_TensorOp_32b_UMMA_gemm(manifest, cuda_version) @@ -10636,7 +10797,8 @@ def GenerateSM100(manifest, cuda_version): GenerateSM100_TensorOp_16b_UMMA_gemm_stream_k(manifest, cuda_version) - GenerateSM100_TensorOp_int8_UMMA_gemm(manifest, cuda_version) + if '100f' not in architectures and '101f' not in architectures: + GenerateSM100_TensorOp_int8_UMMA_gemm(manifest, cuda_version) GenerateSM100_TensorOp_fp8_UMMA_gemm(manifest, cuda_version) # grouped GEMM @@ -10657,7 +10819,8 @@ def 
GenerateSM100(manifest, cuda_version): # GenerateSM100_SparseTensorOp_32b_UMMA_gemm(manifest, cuda_version) GenerateSM100_SparseTensorOp_16b_UMMA_gemm(manifest, cuda_version) - GenerateSM100_SparseTensorOp_int8_UMMA_gemm(manifest, cuda_version) + if '100f' not in architectures and '101f' not in architectures: + GenerateSM100_SparseTensorOp_int8_UMMA_gemm(manifest, cuda_version) GenerateSM100_SparseTensorOp_fp8_UMMA_gemm(manifest, cuda_version) GenerateSM100_SparseTensorOp_mixed_8bits_UMMA_gemm(manifest, cuda_version) @@ -11166,7 +11329,7 @@ if __name__ == "__main__": GenerateSM89(manifest, args.cuda_version) GenerateSM90(manifest, args.cuda_version) - blackwell_enabled_arch = any(arch in ["100a", "101a", "120a"] for arch in archs) + blackwell_enabled_arch = any(arch in ["100a", "100f", "101a", "101f", "120a", "120f"] for arch in archs) if blackwell_enabled_arch: GenerateSM100(manifest, args.cuda_version) GenerateSM120(manifest, args.cuda_version) diff --git a/python/cutlass_library/manifest.py b/python/cutlass_library/manifest.py index 38d0f764..d10ec125 100644 --- a/python/cutlass_library/manifest.py +++ b/python/cutlass_library/manifest.py @@ -523,10 +523,14 @@ class Manifest: arch_conditional_cc = [ '90a', '100a', + '100f', '101a', - '120a' + '101f', + '120a', + '120f' ] architectures = [x if x not in arch_conditional_cc else x.split('a')[0] for x in architectures] + architectures = [x if x not in arch_conditional_cc else x.split('f')[0] for x in architectures] self.compute_capabilities = [int(x) for x in architectures] diff --git a/python/cutlass_library/sm90_utils.py b/python/cutlass_library/sm90_utils.py index 63ff6f1f..8ea870ec 100644 --- a/python/cutlass_library/sm90_utils.py +++ b/python/cutlass_library/sm90_utils.py @@ -375,6 +375,13 @@ def generate_tile_descriptions_sm90(math_instructions, is_aligned: bool, level: mma_multipliers, cluster_sizes = get_mma_multipliers(level), get_cluster_sizes(level, is_aligned) for math_inst, mma_mul, cluster_size in 
product(math_instructions, mma_multipliers, cluster_sizes): + # generator can stamp out duplicate kernels, because it doesn't explicitly set instruction + # shape for SM90 kernels, and the 3.X collective API doesn't directly expose them when using + # the auto kernel schedule. + + math_inst_stub = copy.deepcopy(math_inst) + math_inst_stub.instruction_shape = [0, 0, 0] + tile_desc = TileDescription( threadblock_shape=[ math_inst.instruction_shape[0] * mma_mul[0], @@ -383,7 +390,7 @@ def generate_tile_descriptions_sm90(math_instructions, is_aligned: bool, level: ], stages=0, warp_count=[4, 1, 1], - math_instruction=math_inst, + math_instruction=math_inst_stub, min_compute=90, max_compute=90, cluster_shape=cluster_size) @@ -551,6 +558,7 @@ def get_valid_schedules(tile_description, cuda_version, is_aligned, data_types, b_type_size = DataTypeSize[data_types["b_type"]] if a_type_size != b_type_size and CudaToolkitVersionSatisfies(cuda_version, 12, 1): schedules = [] + stream_k_schedules = [] epilogue_schedule = EpilogueScheduleType.TmaWarpSpecialized if a_type_size > b_type_size: epilogue_schedule = EpilogueScheduleType.EpilogueTransposed @@ -579,7 +587,11 @@ def get_valid_schedules(tile_description, cuda_version, is_aligned, data_types, KernelScheduleType.TmaWarpSpecializedCooperative, epilogue_schedule ]) - return schedules, [] + stream_k_schedules.append([ + KernelScheduleType.TmaWarpSpecializedCooperative, + epilogue_schedule + ]) + return schedules, stream_k_schedules if not is_aligned and not is_blockwise(gemm_kind): schedules = [[KernelScheduleType.CpAsyncWarpSpecialized, diff --git a/python/setup_library.py b/python/setup_library.py index 8262e5a7..3738e24d 100644 --- a/python/setup_library.py +++ b/python/setup_library.py @@ -36,7 +36,7 @@ from setuptools import setup def perform_setup(): setup( name='cutlass_library', - version='3.9.2', + version='4.0.0', description='CUTLASS library generation scripts', packages=['cutlass_library'] ) diff --git 
a/python/setup_pycute.py b/python/setup_pycute.py index cb945049..b84a228a 100644 --- a/python/setup_pycute.py +++ b/python/setup_pycute.py @@ -36,7 +36,7 @@ from setuptools import setup def perform_setup(): setup( name='pycute', - version='3.9.2', + version='4.0.0', description='Python implementation of CuTe', packages=['pycute'], ) diff --git a/test/unit/gemm/device/CMakeLists.txt b/test/unit/gemm/device/CMakeLists.txt index 8d30e790..00a1fef7 100644 --- a/test/unit/gemm/device/CMakeLists.txt +++ b/test/unit/gemm/device/CMakeLists.txt @@ -658,7 +658,6 @@ cutlass_test_unit_gemm_device_add_executable( # Syrk SM80 complex f64 tests syrk_cf64n_cf64t_tensor_op_f64_sm80.cu syrk_cf64n_cf64n_tensor_op_f64_sm80.cu - syrk_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu # Syrk SM80 complex f32 tests syrk_cf32n_cf32t_tensor_op_f32_sm80.cu @@ -703,7 +702,6 @@ cutlass_test_unit_gemm_device_add_executable( # Trmm SM80 complex f64 tests trmm_cf64n_cf64n_cf64t_tensor_op_f64_sm80.cu - trmm_cf64n_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu # Trmm SM80 complex f32 tests trmm_cf32n_cf32n_cf32t_tensor_op_f32_sm80.cu @@ -776,7 +774,6 @@ cutlass_test_unit_gemm_device_add_executable( # Symm SM80 complex f64 tests symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_sm80.cu symm_cf64n_cf64n_cf64n_tensor_op_rs_f64_sm80.cu - symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu # Symm SM80 complex f32 tests symm_cf32n_cf32n_tensor_op_f32_ls_sm80.cu @@ -793,7 +790,6 @@ cutlass_test_unit_gemm_device_add_executable( # Hemm SM80 complex f64 tests hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_sm80.cu hemm_cf64h_cf64n_cf64n_tensor_op_rs_f64_sm80.cu - hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu # Hemm SM80 complex f32 tests hemm_cf32h_cf32n_tensor_op_f32_ls_sm80.cu @@ -805,6 +801,20 @@ cutlass_test_unit_gemm_device_add_executable( hemm_cf64_cf64_cf64_tensor_op_f64_sm90.cu ) +if (NOT CUTLASS_NVCC_ARCHS MATCHES 101|101a|101f|103|103a|103f) +cutlass_test_unit_gemm_device_add_executable( + 
cutlass_test_unit_gemm_device_blas3_gaussian + + BATCH_SOURCES ON + BATCH_SIZE 4 + + syrk_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu + trmm_cf64n_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu + symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu + hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu +) +endif() + cutlass_test_unit_gemm_device_add_executable( cutlass_test_unit_gemm_device_grouped_blas3 @@ -930,6 +940,13 @@ cutlass_test_unit_gemm_device_add_executable( # 8 unit tests sm100_gemm_f6_f6_f32_tensor_op_f32_ptr_array.cu ) + +cutlass_test_unit_gemm_device_add_executable( + cutlass_test_unit_blockwise_gemm_sm100 + + sm100_gemm_f8_f8_f8_tensor_op_f32_blockwise.cu +) + endif() diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_o_tnn.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_o_tnn.cu index ae3dd8da..829ab7ef 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_o_tnn.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_o_tnn.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -588,7 +588,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -664,8 +664,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -675,8 +675,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -686,8 +686,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -697,8 +697,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -708,8 +708,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -753,7 +753,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -829,7 +829,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -905,7 +905,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -981,7 +981,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1057,7 +1057,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1133,7 +1133,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1209,7 +1209,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1285,7 +1285,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1361,8 +1361,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1372,8 +1372,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1383,8 +1383,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1394,8 +1394,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1405,8 +1405,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1416,8 +1416,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1427,8 +1427,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1438,8 +1438,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_o_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_o_tnt.cu index 4468cc08..57cf69b5 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_o_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_o_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -588,7 +588,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -664,8 +664,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -675,8 +675,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -686,8 +686,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -697,8 +697,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -708,8 +708,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -753,7 +753,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -829,7 +829,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -905,7 +905,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -981,7 +981,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1057,7 +1057,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1133,7 +1133,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1209,7 +1209,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1285,7 +1285,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1361,8 +1361,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1372,8 +1372,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1383,8 +1383,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1394,8 +1394,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1405,8 +1405,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1416,8 +1416,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1427,8 +1427,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1438,8 +1438,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_q_tnt.cu index 7a2b8b4a..32e576e8 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_mxf8_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_mxf8_q_tnt.cu index 59ea632d..9a0dda14 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_mxf8_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_mxf8_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_o_tnn.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_o_tnn.cu index e5162951..c8ef6c76 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_o_tnn.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_o_tnn.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -588,7 +588,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -664,8 +664,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -675,8 +675,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -686,8 +686,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -697,8 +697,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -708,8 +708,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -753,7 +753,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -829,7 +829,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -905,7 +905,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -981,7 +981,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1057,7 +1057,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1133,7 +1133,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1209,7 +1209,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1285,7 +1285,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1361,8 +1361,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1372,8 +1372,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1383,8 +1383,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1394,8 +1394,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1405,8 +1405,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1416,8 +1416,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1427,8 +1427,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1438,8 +1438,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_o_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_o_tnt.cu index 49669982..ce88316b 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_o_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_o_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -588,7 +588,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -664,8 +664,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -675,8 +675,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -686,8 +686,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -697,8 +697,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -708,8 +708,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -753,7 +753,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -829,7 +829,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -905,7 +905,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -981,7 +981,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1057,7 +1057,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1133,7 +1133,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1209,7 +1209,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1286,7 +1286,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1363,8 +1363,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1374,8 +1374,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1385,8 +1385,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1396,8 +1396,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1407,8 +1407,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1418,8 +1418,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1429,8 +1429,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1440,8 +1440,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_q_tnt.cu index 636139cd..5e2b22c5 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf6_f32_f16_f16_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf6_f32_f16_f16_q_tnt.cu index c9660746..aa7314a3 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf6_f32_f16_f16_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf6_f32_f16_f16_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf8_f32_f16_f16_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf8_f32_f16_f16_q_tnt.cu index 2d1f7fe2..742b437d 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf8_f32_f16_f16_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf8_f32_f16_f16_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf4_f32_f16_f16_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf4_f32_f16_f16_q_tnt.cu index 3a2b8fb0..3d703dd9 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf4_f32_f16_f16_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf4_f32_f16_f16_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf6_f32_f16_f16_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf6_f32_f16_f16_q_tnt.cu index 3645ef36..99575646 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf6_f32_f16_f16_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf6_f32_f16_f16_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf8_f32_f16_f16_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf8_f32_f16_f16_q_tnt.cu index d3850ca0..c0eda537 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf8_f32_f16_f16_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf8_f32_f16_f16_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f16_f16_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f16_f16_q_tnt.cu index 78173b29..fe98bbde 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f16_f16_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f16_f16_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f16_mxf8_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f16_mxf8_q_tnt.cu index 875385b1..53ae2ca6 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f16_mxf8_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f16_mxf8_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f32_f32_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f32_f32_q_tnt.cu index 07b8f150..6130a8b3 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f32_f32_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f32_f32_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf6_f32_f16_f16_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf6_f32_f16_f16_q_tnt.cu index e3040f86..01d7066f 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf6_f32_f16_f16_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf6_f32_f16_f16_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_f16_q_tnn.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_f16_q_tnn.cu index d6909000..614e69d8 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_f16_q_tnn.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_f16_q_tnn.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_f16_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_f16_q_tnt.cu index c9661953..4690024d 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_f16_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_f16_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_nnn_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_nnn_sfd.cu index 024246b1..9f209c05 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_nnn_sfd.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_nnn_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -546,8 +546,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -557,8 +557,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -568,8 +568,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -579,8 +579,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -590,8 +590,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -601,8 +601,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_nnt_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_nnt_sfd.cu index 2c751282..2d0bc431 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_nnt_sfd.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_nnt_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -546,8 +546,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -557,8 +557,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -568,8 +568,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -579,8 +579,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -590,8 +590,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -601,8 +601,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnn_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnn_sfd.cu index 43eb3bea..fa8791ad 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnn_sfd.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnn_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -546,8 +546,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -557,8 +557,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -568,8 +568,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -579,8 +579,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -590,8 +590,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -601,8 +601,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnt_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnt_sfd.cu index 14e48873..dde24b65 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnt_sfd.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnt_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -546,8 +546,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -557,8 +557,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -568,8 +568,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -579,8 +579,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -590,8 +590,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -601,8 +601,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnt_streamk.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnt_streamk.cu index 6a8608e4..9df684fa 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnt_streamk.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnt_streamk.cu @@ -56,7 +56,7 @@ using namespace cute; // 6. 256x256_tnt_vs64in // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -125,7 +125,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -263,7 +263,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -332,7 +332,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -401,7 +401,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -470,8 +470,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -481,8 +481,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -492,8 +492,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -503,8 +503,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -514,8 +514,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -525,8 +525,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_ttn_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_ttn_sfd.cu index 67130534..4e4a7579 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_ttn_sfd.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_ttn_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -546,8 +546,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -557,8 +557,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -568,8 +568,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -579,8 +579,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -590,8 +590,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -601,8 +601,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_ttt_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_ttt_sfd.cu index 9572cd45..95f428a5 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_ttt_sfd.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_ttt_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -546,8 +546,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -557,8 +557,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -568,8 +568,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -579,8 +579,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -590,8 +590,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -601,8 +601,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f32_f32_q_tnn.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f32_f32_q_tnn.cu index 4793d9c6..ffa652c5 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f32_f32_q_tnn.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f32_f32_q_tnn.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f32_f32_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f32_f32_q_tnt.cu index 61294715..3e51cd57 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f32_f32_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f32_f32_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f16_q_tnn.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f16_q_tnn.cu index b6e73cb7..1a0a3359 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f16_q_tnn.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f16_q_tnn.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f16_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f16_q_tnt.cu index 52e3a45e..7213e721 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f16_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f16_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f32_q_tnn.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f32_q_tnn.cu index a4c596cf..08aebed5 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f32_q_tnn.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f32_q_tnn.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f32_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f32_q_tnt.cu index 1a00d07f..41512aec 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f32_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f32_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_mxf8_q_tnn_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_mxf8_q_tnn_sfd.cu index 1c48ee8f..552ffbf7 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_mxf8_q_tnn_sfd.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_mxf8_q_tnn_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -546,8 +546,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -557,8 +557,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -568,8 +568,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -579,8 +579,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -590,8 +590,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -601,8 +601,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_mxf8_q_tnt_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_mxf8_q_tnt_sfd.cu index 6527b6a5..a92a16d3 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_mxf8_q_tnt_sfd.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_mxf8_q_tnt_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -546,8 +546,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -557,8 +557,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -568,8 +568,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -579,8 +579,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -590,8 +590,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -601,8 +601,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_f16_o_tnn.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_f16_o_tnn.cu index 4112130f..e363feee 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_f16_o_tnn.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_f16_o_tnn.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -588,7 +588,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -664,8 +664,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -675,8 +675,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -686,8 +686,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -697,8 +697,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -708,8 +708,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_f16_o_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_f16_o_tnt.cu index b667e416..81c13779 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_f16_o_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_f16_o_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -588,7 +588,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -664,8 +664,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -675,8 +675,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -686,8 +686,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -697,8 +697,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -708,8 +708,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnn_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnn_sfd.cu index fe87a252..c4881802 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnn_sfd.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnn_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -546,7 +546,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -627,7 +627,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -708,8 +708,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -752,8 +752,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -763,8 +763,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -774,8 +774,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -785,8 +785,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnt_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnt_sfd.cu index f3916d89..437dd2b5 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnt_sfd.cu +++ 
b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnt_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -546,7 +546,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -627,7 +627,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -708,8 +708,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t; 
+TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -752,8 +752,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -763,8 +763,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -774,8 +774,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -785,8 +785,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnt_streamk.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnt_streamk.cu index c6e16ca2..04dc3fa5 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnt_streamk.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnt_streamk.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x128x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x128x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x192x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x192x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x256x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x256x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x128x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x128x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x192x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x192x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x256x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x256x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x128x256_0_vs32_tnt_align64_o_1sm, streamk) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x128x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x128x256_0_vs32_tnt_align64_o_1sm, streamk) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x128x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x192x256_0_vs32_tnt_align64_o_1sm, streamk) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x192x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x192x256_0_vs32_tnt_align64_o_1sm, streamk) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x192x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x256x256_0_vs32_tnt_align64_o_1sm, streamk) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x256x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x256x256_0_vs32_tnt_align64_o_1sm, streamk) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x256x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x128x256_0_vs32_tnt_align64_o_2sm, streamk) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x128x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x128x256_0_vs32_tnt_align64_o_2sm, streamk) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x128x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x192x256_0_vs32_tnt_align64_o_2sm, streamk) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x192x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x192x256_0_vs32_tnt_align64_o_2sm, streamk) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x192x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x256x256_0_vs32_tnt_align64_o_2sm, streamk) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x256x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x256x256_0_vs32_tnt_align64_o_2sm, streamk) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x256x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f32_f32_o_tnn.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f32_f32_o_tnn.cu index f89539f2..cd81e2bf 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f32_f32_o_tnn.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f32_f32_o_tnn.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -588,7 +588,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -664,8 +664,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -675,8 +675,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -686,8 +686,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -697,8 +697,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -708,8 +708,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f32_f32_o_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f32_f32_o_tnt.cu index 21d3b3d8..5c195462 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f32_f32_o_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f32_f32_o_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -666,8 +666,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -677,8 +677,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -688,8 +688,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -699,8 +699,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -710,8 +710,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -721,8 +721,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -732,8 +732,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -743,8 +743,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f16_o_tnn.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f16_o_tnn.cu index 4787b0d4..91dc0d64 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f16_o_tnn.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f16_o_tnn.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -588,7 +588,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -664,8 +664,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -675,8 +675,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -686,8 +686,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -697,8 +697,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -708,8 +708,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f16_o_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f16_o_tnt.cu index 1a40c960..a5594a86 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f16_o_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f16_o_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -588,7 +588,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -664,8 +664,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -675,8 +675,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -686,8 +686,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -697,8 +697,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -708,8 +708,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f32_o_tnn.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f32_o_tnn.cu index f8300b56..3f82337e 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f32_o_tnn.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f32_o_tnn.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -588,7 +588,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -664,8 +664,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -675,8 +675,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -686,8 +686,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -697,8 +697,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -708,8 +708,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f32_o_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f32_o_tnt.cu index 4c07b52c..0a197a48 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f32_o_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f32_o_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -666,8 +666,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -677,8 +677,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -688,8 +688,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -699,8 +699,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -710,8 +710,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -721,8 +721,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -732,8 +732,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -743,8 +743,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_nvf4_o_tnn_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_nvf4_o_tnn_sfd.cu index 7a2a9414..a5145287 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_nvf4_o_tnn_sfd.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_nvf4_o_tnn_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -546,7 +546,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -627,7 +627,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -708,8 +708,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -752,8 +752,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -763,8 +763,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -774,8 +774,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -785,8 +785,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_nvf4_o_tnt_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_nvf4_o_tnt_sfd.cu index 2f1316e0..deb1837a 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_nvf4_o_tnt_sfd.cu +++ 
b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_nvf4_o_tnt_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -546,7 +546,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -627,7 +627,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -708,8 +708,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t; 
+TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -752,8 +752,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -763,8 +763,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -774,8 +774,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -785,8 +785,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t; 
+TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_gemm_f8_f8_f8_tensor_op_f32_blockwise.cu b/test/unit/gemm/device/sm100_gemm_f8_f8_f8_tensor_op_f32_blockwise.cu new file mode 100644 index 00000000..9ce8817e --- /dev/null +++ b/test/unit/gemm/device/sm100_gemm_f8_f8_f8_tensor_op_f32_blockwise.cu @@ -0,0 +1,320 @@ +/*************************************************************************************************** + * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include +#include +#include + +#include "cutlass/cutlass.h" +#include "cute/tensor.hpp" +#include "cute/atom/mma_atom.hpp" + +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" + +#include "cutlass/epilogue/thread/activation.h" +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/host/gett.hpp" +#include "cutlass/util/device_memory.h" + +using namespace cute; + +#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) + +template +bool groupwise_test( + Int, Int, Int, C, + LayoutA, LayoutB, LayoutCD, + MmaTileShape, ClusterShape) { + + using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig; + using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); // Layout type for SFA matrix operand + using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); // Layout type for SFB matrix operand + + using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm100, 
cutlass::arch::OpClassTensorOp, + MmaTileShape, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, + float, float, + cutlass::float_e4m3_t, LayoutCD, 16, + cutlass::float_e4m3_t, LayoutCD, 16, + conditional_t + >::CollectiveOp; + + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, + cutlass::float_e4m3_t, cute::tuple, 16, + cutlass::float_e4m3_t, cute::tuple, 16, + float, + MmaTileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout, + conditional_t + >::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + cute::Shape, + CollectiveMainloop, + CollectiveEpilogue>; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + // Strides just iterate over scalars and have no zeros + LayoutSFA layout_SFA; + LayoutSFB layout_SFB; + + int alignment_M = max(max((is_same_v ? 16 : 1) , + (SFAMajor == UMMA::Major::MN ? CollectiveMainloop::AlignmentSFA : 1)), + (is_same_v ? 16 : 1)); + + int alignment_N = max(max((is_same_v ? 16 : 1) , + (SFBMajor == UMMA::Major::MN ? CollectiveMainloop::AlignmentSFB : 1)), + (is_same_v ? 16 : 1)); + + int alignment_K = max(max((is_same_v ? 16 : 1) , + (SFAMajor == UMMA::Major::K ? CollectiveMainloop::AlignmentSFA : 1)), + max((is_same_v ? 16 : 1) , + (SFBMajor == UMMA::Major::K ? 
CollectiveMainloop::AlignmentSFB : 1))); + + int M = 1024 + alignment_M; + int N = 1024 + alignment_N; + int K = 512 + alignment_K; + EXPECT_TRUE(M % alignment_M == 0); + EXPECT_TRUE(N % alignment_N == 0); + EXPECT_TRUE(K % alignment_K == 0); + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, 1)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, 1)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, 1)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, 1)); + + layout_SFA = ScaleConfig::tile_atom_to_shape_SFA(make_shape(M, N, K, 1)); + layout_SFB = ScaleConfig::tile_atom_to_shape_SFB(make_shape(M, N, K, 1)); + + thrust::universal_vector tensor_A(M * K); + thrust::universal_vector tensor_SFA(cute::size(cute::filter_zeros(layout_SFA))); + thrust::universal_vector tensor_B(N * K); + thrust::universal_vector tensor_SFB(cute::size(cute::filter_zeros(layout_SFB))); + thrust::universal_vector tensor_C(M * N); + thrust::universal_vector tensor_D(M * N); + thrust::universal_vector tensor_ref_D(M * N); + + thrust::random::default_random_engine engine(2025); + thrust::random::uniform_int_distribution dist(-2, 2); + + std::generate(tensor_A.begin(), tensor_A.end(), [&] () { + return static_cast(dist(engine)); + }); + std::generate(tensor_SFA.begin(), tensor_SFA.end(), [&] () { + return static_cast(dist(engine)); + }); + std::generate(tensor_B.begin(), tensor_B.end(), [&] () { + return static_cast(dist(engine)); + }); + std::generate(tensor_SFB.begin(), tensor_SFB.end(), [&] () { + return static_cast(dist(engine)); + }); + std::generate(tensor_C.begin(), tensor_C.end(), [&] () { + return static_cast(dist(engine)); + }); + + typename Gemm::Arguments arguments { + cutlass::gemm::GemmUniversalMode::kGemm, + {M, N, K, 1}, + {thrust::raw_pointer_cast(tensor_A.data()), stride_A, + thrust::raw_pointer_cast(tensor_B.data()), stride_B, + 
thrust::raw_pointer_cast(tensor_SFA.data()), layout_SFA, + thrust::raw_pointer_cast(tensor_SFB.data()), layout_SFB}, + { + {}, // epilogue.thread + thrust::raw_pointer_cast(tensor_C.data()), stride_C, + thrust::raw_pointer_cast(tensor_D.data()), stride_D + } + }; + + auto &fusion_args = arguments.epilogue.thread; + fusion_args.alpha = 1.0f; + fusion_args.beta = 1.0f; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + Gemm gemm; + + EXPECT_TRUE(gemm.can_implement(arguments) == cutlass::Status::kSuccess); + EXPECT_TRUE(gemm.initialize(arguments, workspace.get()) == cutlass::Status::kSuccess); + EXPECT_TRUE(gemm.run() == cutlass::Status::kSuccess); + EXPECT_TRUE(cudaDeviceSynchronize() == cudaSuccess); + + auto A = cute::make_tensor(thrust::raw_pointer_cast(tensor_A.data()), + cute::make_layout(cute::make_shape(M, K, 1), stride_A)); + auto B = cute::make_tensor(thrust::raw_pointer_cast(tensor_B.data()), + cute::make_layout(cute::make_shape(N, K, 1), stride_B)); + auto C = cute::make_tensor(thrust::raw_pointer_cast(tensor_C.data()), + cute::make_layout(cute::make_shape(M, N, 1), stride_C)); + auto D = cute::make_tensor(thrust::raw_pointer_cast(tensor_ref_D.data()), + cute::make_layout(cute::make_shape(M, N, 1), stride_D)); + auto SFA = cute::make_tensor(thrust::raw_pointer_cast(tensor_SFA.data()), layout_SFA); + auto SFB = cute::make_tensor(thrust::raw_pointer_cast(tensor_SFB.data()), layout_SFB); + + cutlass::reference::host::GettBlockScalingMainloopParams< + float, + decltype(A), + decltype(SFA), + decltype(B), + decltype(SFB) + > mainloop_params{A, SFA, B, SFB}; + + cutlass::reference::host::GettEpilogueParams< + float, + float, + float, + float, + decltype(C), + decltype(D) + > epilogue_params; + + epilogue_params.C = C; + epilogue_params.D = D; + epilogue_params.alpha = 1.0f; + epilogue_params.beta = 1.0f; + + // get reference result + 
cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params); + + // compare_reference + bool equal = true; + for (size_t i = 0; i < tensor_ref_D.size(); ++i) { + equal &= (tensor_ref_D[i] == tensor_D[i]); + } + return equal; +} + +TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3t_tensorop_1sm_f32_align16_blockwise, 128x128x128_1x1x1_2x2x32_scale) { + + bool passed = groupwise_test( + Int<2>{}, Int<2>{}, Int<32>{}, false_type{}, + cutlass::layout::RowMajor{}, cutlass::layout::ColumnMajor{}, + cutlass::layout::RowMajor{}, + Shape<_128,_128,_128>{}, + Shape<_1,_1,_1>{}); + + EXPECT_TRUE(passed); +} + +TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3t_tensorop_2sm_f32_align16_blockwise, 256x128x128_2x1x1_64x4x32_scale) { + + bool passed = groupwise_test( + Int<64>{}, Int<4>{}, Int<32>{}, true_type{}, + cutlass::layout::RowMajor{}, cutlass::layout::ColumnMajor{}, + cutlass::layout::RowMajor{}, + Shape<_256,_128,_128>{}, + Shape<_2,_1,_1>{}); + + EXPECT_TRUE(passed); + +} + +TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3t_tensorop_1sm_f32_align16_blockwise, 128x128x128_1x1x1_1x128x128_scale) { + + bool passed = groupwise_test( + Int<1>{}, Int<128>{}, Int<128>{}, false_type{}, + cutlass::layout::RowMajor{}, cutlass::layout::ColumnMajor{}, + cutlass::layout::RowMajor{}, + Shape<_128,_128,_128>{}, + Shape<_1,_1,_1>{}); + + EXPECT_TRUE(passed); + +} + +TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3t_tensorop_2sm_f32_align16_blockwise, 256x128x128_2x1x1_1x128x128_scale) { + + bool passed = groupwise_test( + Int<1>{}, Int<128>{}, Int<128>{}, true_type{}, + cutlass::layout::RowMajor{}, cutlass::layout::ColumnMajor{}, + cutlass::layout::RowMajor{}, + Shape<_256,_128,_128>{}, + Shape<_2,_1,_1>{}); + + EXPECT_TRUE(passed); + +} + +TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3t_tensorop_2sm_f32_align16_blockwise, 256x128x128_2x1x1_64x64x64_scale) { + + bool passed = groupwise_test( + Int<64>{}, Int<64>{}, Int<64>{}, true_type{}, + cutlass::layout::RowMajor{}, cutlass::layout::ColumnMajor{}, + 
cutlass::layout::RowMajor{}, + Shape<_256,_128,_128>{}, + Shape<_2,_1,_1>{}); + + EXPECT_TRUE(passed); + +} + + +#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f16_f16_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f16_f16_tn.cu index e46934c8..584097b1 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f16_f16_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f16_f16_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f16_f16_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f16_f16_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f16_f16_256 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f16_f16_256 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f16_f16_128x128x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f16_f16_128x256x } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f16_f16_256x128x } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f16_f16_256x256x // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_f16_12 } // 2. 
-namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_f16_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_f16_25 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_f16_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_f16_128x128 } // 2. 
-TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_f16_128x256 } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_f16_256x128 } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f16_f8_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f16_f8_tn.cu index dbce7f5d..994b9a30 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f16_f8_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f16_f8_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f16_e4m3_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f16_e4m3_12 } // 3. 
-namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f16_e4m3_25 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f16_e4m3_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f16_e4m3_128x128 } // 2. 
-TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f16_e4m3_128x256 } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f16_e4m3_256x128 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f16_e4m3_256x256 // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_e4m3_1 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_e4m3_1 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_e4m3_2 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_e4m3_2 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_e4m3_128x12 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_e4m3_128x25 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_e4m3_256x12 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f32_f32_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f32_f32_tn.cu index 94f76ca0..62069c68 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f32_f32_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f32_f32_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f32_f32_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f32_f32_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f32_f32_256 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f32_f32_256 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f32_f32_128x128x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f32_f32_128x256x } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f32_f32_256x128x } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f32_f32_256x256x // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_f32_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_f32_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_f32_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_f32_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_f32_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_f32_25 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_f32_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_f32_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_f32_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_f32_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_f32_128x128 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_f32_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_f32_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_f32_128x256 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_f32_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_f32_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_f32_256x128 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_f32_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_f32_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f6_f32_f16_f16_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f6_f32_f16_f16_tn.cu index 1896e27d..95f7cff8 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f6_f32_f16_f16_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f6_f32_f16_f16_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e3m2_f32_f16_f16_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e3m2_f32_f16_f16_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e3m2_f32_f16_f16_256 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e3m2_f32_f16_f16_256 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e3m2_f32_f16_f16_128x128x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e3m2_f32_f16_f16_128x256x } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e3m2_f32_f16_f16_256x128x } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e3m2_f32_f16_f16_256x256x // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e3m2_f32_void_f16_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e3m2_f32_void_f16_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e3m2_f32_void_f16_25 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e3m2_f32_void_f16_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e3m2_f32_void_f16_128x128 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e3m2_f32_void_f16_128x256 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e3m2_f32_void_f16_256x128 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f8_f32_f16_f16_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f8_f32_f16_f16_tn.cu index 4960e6a7..0f3f40ff 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f8_f32_f16_f16_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f8_f32_f16_f16_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e4m3_f32_f16_f16_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e4m3_f32_f16_f16_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e4m3_f32_f16_f16_256 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e4m3_f32_f16_f16_256 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e4m3_f32_f16_f16_128x128x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e4m3_f32_f16_f16_128x256x } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e4m3_f32_f16_f16_256x128x } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e4m3_f32_f16_f16_256x256x // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e4m3_f32_void_f16_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e4m3_f32_void_f16_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e4m3_f32_void_f16_25 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e4m3_f32_void_f16_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e4m3_f32_void_f16_128x128 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e4m3_f32_void_f16_128x256 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e4m3_f32_void_f16_256x128 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f4_f32_f16_f16_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f4_f32_f16_f16_tn.cu index 9643370a..3aa37e55 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f4_f32_f16_f16_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f4_f32_f16_f16_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e2m1_f32_f16_f16_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e2m1_f32_f16_f16_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e2m1_f32_f16_f16_256 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e2m1_f32_f16_f16_256 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e2m1_f32_f16_f16_128x128x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e2m1_f32_f16_f16_128x256x } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e2m1_f32_f16_f16_256x128x } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e2m1_f32_f16_f16_256x256x // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e2m1_f32_void_f16_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e2m1_f32_void_f16_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e2m1_f32_void_f16_25 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e2m1_f32_void_f16_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e2m1_f32_void_f16_128x128 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e2m1_f32_void_f16_128x256 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e2m1_f32_void_f16_256x128 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f16_f16_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f16_f16_tn.cu index 94b52d60..22b84502 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f16_f16_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f16_f16_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f16_f16_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f16_f16_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f16_f16_256 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f16_f16_256 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f16_f16_128x128x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f16_f16_128x256x } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f16_f16_256x128x } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f16_f16_256x256x // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_f16_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_f16_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_f16_25 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_f16_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_f16_128x128 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_f16_128x256 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_f16_256x128 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f16_f8_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f16_f8_tn.cu index c8ab01da..190cb81a 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f16_f8_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f16_f8_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f16_e4m3_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f16_e4m3_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f16_e4m3_25 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f16_e4m3_25 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f16_e4m3_128x128 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f16_e4m3_128x256 } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f16_e4m3_256x128 } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f16_e4m3_256x256 // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_e4m3_1 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_e4m3_1 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_e4m3_2 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_e4m3_2 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_e4m3_128x12 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_e4m3_128x25 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_e4m3_256x12 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f32_f32_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f32_f32_tn.cu index f97911be..470850f2 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f32_f32_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f32_f32_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f32_f32_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f32_f32_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f32_f32_256 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f32_f32_256 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f32_f32_128x128x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f32_f32_128x256x } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f32_f32_256x128x } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f32_f32_256x256x // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_f32_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_f32_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_f32_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_f32_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_f32_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_f32_25 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_f32_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_f32_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_f32_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_f32_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_f32_128x128 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_f32_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_f32_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_f32_128x256 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_f32_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_f32_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_f32_256x128 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_f32_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_f32_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f8_f32_f16_f16_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f8_f32_f16_f16_tn.cu index ad81c5f8..904ae41e 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f8_f32_f16_f16_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f8_f32_f16_f16_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e4m3_f32_f16_f16_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e4m3_f32_f16_f16_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e4m3_f32_f16_f16_256 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e4m3_f32_f16_f16_256 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e4m3_f32_f16_f16_128x128x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e4m3_f32_f16_f16_128x256x } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e4m3_f32_f16_f16_256x128x } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e4m3_f32_f16_f16_256x256x // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e4m3_f32_void_f16_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e4m3_f32_void_f16_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e4m3_f32_void_f16_25 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e4m3_f32_void_f16_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e4m3_f32_void_f16_128x128 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e4m3_f32_void_f16_128x256 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e4m3_f32_void_f16_256x128 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f8_f4_f32_f16_f16_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f8_f4_f32_f16_f16_tn.cu index 4b0ab7f8..1f72248d 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f8_f4_f32_f16_f16_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f8_f4_f32_f16_f16_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e2m1_f32_f16_f16_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e2m1_f32_f16_f16_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e2m1_f32_f16_f16_256 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e2m1_f32_f16_f16_256 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e2m1_f32_f16_f16_128x128x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e2m1_f32_f16_f16_128x256x } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e2m1_f32_f16_f16_256x128x } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e2m1_f32_f16_f16_256x256x // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e2m1_f32_void_f16_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e2m1_f32_void_f16_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e2m1_f32_void_f16_25 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e2m1_f32_void_f16_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e2m1_f32_void_f16_128x128 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e2m1_f32_void_f16_128x256 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e2m1_f32_void_f16_256x128 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f8_f6_f32_f16_f16_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f8_f6_f32_f16_f16_tn.cu index adf4188e..acf64f12 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f8_f6_f32_f16_f16_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f8_f6_f32_f16_f16_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e3m2_f32_f16_f16_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e3m2_f32_f16_f16_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e3m2_f32_f16_f16_256 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e3m2_f32_f16_f16_256 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e3m2_f32_f16_f16_128x128x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e3m2_f32_f16_f16_128x256x } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e3m2_f32_f16_f16_256x128x } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e3m2_f32_f16_f16_256x256x // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e3m2_f32_void_f16_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e3m2_f32_void_f16_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e3m2_f32_void_f16_25 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e3m2_f32_void_f16_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e3m2_f32_void_f16_128x128 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e3m2_f32_void_f16_128x256 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e3m2_f32_void_f16_256x128 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f16_f16_f32_f16_f16_hmma.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f16_f16_f32_f16_f16_hmma.cu index e5e4f3ff..b92d17ae 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f16_f16_f32_f16_f16_hmma.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f16_f16_f32_f16_f16_hmma.cu @@ -47,7 +47,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x64x32spgemm_f16_f16_f32_f16_f16_128x64x64_1x1x1_0_tnn_align16_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x64x64_1x1x1_0_tnn_align16_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< cutlass::arch::Sm100, cutlass::arch::OpClassSparseTensorOp, @@ -88,7 +88,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x32spgemm_f16_f16_f32_f16_f16_128x64 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x32spgemm_f16_f16_f32_f16_f16_128x128x64_1x1x1_0_tnn_align16_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x128x64_1x1x1_0_tnn_align16_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -130,7 +130,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x32spgemm_f16_f16_f32_f16_f16_128x1 } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x32spgemm_f16_f16_f32_f16_f16_128x192x64_1x1x1_0_tnn_align16_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x192x64_1x1x1_0_tnn_align16_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -172,7 +172,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x32spgemm_f16_f16_f32_f16_f16_128x1 } // 4. -namespace cutlass3x_sm100_sptensorop_s128x256x32spgemm_f16_f16_f32_f16_f16_128x256x64_1x1x1_0_tnn_align16_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x256x64_1x1x1_0_tnn_align16_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -214,7 +214,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x32spgemm_f16_f16_f32_f16_f16_128x2 } // 5. 
-namespace cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_f16_f16_256x64x64_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x64x64_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -256,7 +256,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_f16_f16_256x64 } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_f16_f16_256x64x128_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x64x128_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -298,7 +298,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_f16_f16_256x64 } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_f16_f16_256x128x64_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x128x64_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -340,7 +340,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_f16_f16_256x1 } // 8. -namespace cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_f16_f16_256x128x128_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x128x128_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -382,7 +382,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_f16_f16_256x1 } // 9. 
-namespace cutlass3x_sm100_sptensorop_s256x192x32spgemm_f16_f16_f32_f16_f16_256x192x64_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x192x64_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -424,7 +424,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x32spgemm_f16_f16_f32_f16_f16_256x1 } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_f16_f16_256x256x64_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x256x64_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -466,7 +466,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_f16_f16_256x2 } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_f16_f16_256x256x128_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x256x128_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -508,9 +508,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_f16_f16_256x2 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x64x32spgemm_f16_f16_f32_f16_f16_128x64x64_1x1x1_0_tnn_align16_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x64x64_1x1x1_0_tnn_align16_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x32spgemm_f16_f16_f32_f16_f16_128x64x64_1x1x1_0_tnn_align16_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x64x64_1x1x1_0_tnn_align16_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -521,9 +521,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x32spgemm_f16_f16_f32_f16_f16_128x64x64_1 } // 2. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x32spgemm_f16_f16_f32_f16_f16_128x128x64_1x1x1_0_tnn_align16_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x128x64_1x1x1_0_tnn_align16_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x32spgemm_f16_f16_f32_f16_f16_128x128x64_1x1x1_0_tnn_align16_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x128x64_1x1x1_0_tnn_align16_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -534,9 +534,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x32spgemm_f16_f16_f32_f16_f16_128x128x64 } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x32spgemm_f16_f16_f32_f16_f16_128x192x64_1x1x1_0_tnn_align16_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x192x64_1x1x1_0_tnn_align16_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x32spgemm_f16_f16_f32_f16_f16_128x192x64_1x1x1_0_tnn_align16_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x192x64_1x1x1_0_tnn_align16_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -547,9 +547,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x32spgemm_f16_f16_f32_f16_f16_128x192x64 } // 4. -TEST(cutlass3x_sm100_sptensorop_s128x256x32spgemm_f16_f16_f32_f16_f16_128x256x64_1x1x1_0_tnn_align16_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x256x64_1x1x1_0_tnn_align16_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x32spgemm_f16_f16_f32_f16_f16_128x256x64_1x1x1_0_tnn_align16_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x256x64_1x1x1_0_tnn_align16_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -560,9 +560,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x32spgemm_f16_f16_f32_f16_f16_128x256x64 } // 5. 
-TEST(cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_f16_f16_256x64x64_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x64x64_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_f16_f16_256x64x64_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x64x64_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -573,9 +573,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_f16_f16_256x64x64_2 } //6. -TEST(cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_f16_f16_256x64x128_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x64x128_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_f16_f16_256x64x128_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x64x128_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -586,9 +586,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_f16_f16_256x64x128_ } // 7. -TEST(cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_f16_f16_256x128x64_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x128x64_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_f16_f16_256x128x64_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x128x64_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -599,9 +599,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_f16_f16_256x128x64 } // 8. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_f16_f16_256x128x128_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x128x128_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_f16_f16_256x128x128_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x128x128_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -612,9 +612,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_f16_f16_256x128x12 } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x32spgemm_f16_f16_f32_f16_f16_256x192x64_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x192x64_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x32spgemm_f16_f16_f32_f16_f16_256x192x64_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x192x64_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -625,9 +625,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x32spgemm_f16_f16_f32_f16_f16_256x192x64 } // 10. -TEST(cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_f16_f16_256x256x64_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x256x64_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_f16_f16_256x256x64_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x256x64_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -638,9 +638,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_f16_f16_256x256x64 } // 11. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_f16_f16_256x256x128_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x256x128_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_f16_f16_256x256x128_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x256x128_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -651,7 +651,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_f16_f16_256x256x12 } // 1. -namespace cutlass3x_sm100_sptensorop_s128x64x32spgemm_f16_f16_f32_void_f16_128x64x64_1x1x1_0_tnn_align16_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x64x64_1x1x1_0_tnn_align16_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -693,7 +693,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x32spgemm_f16_f16_f32_void_f16_128x6 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x32spgemm_f16_f16_f32_void_f16_128x128x64_1x1x1_0_tnn_align16_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x128x64_1x1x1_0_tnn_align16_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -735,7 +735,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x32spgemm_f16_f16_f32_void_f16_128x } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x32spgemm_f16_f16_f32_void_f16_128x192x64_1x1x1_0_tnn_align16_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x192x64_1x1x1_0_tnn_align16_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -777,7 +777,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x32spgemm_f16_f16_f32_void_f16_128x } // 4. 
-namespace cutlass3x_sm100_sptensorop_s128x256x32spgemm_f16_f16_f32_void_f16_128x256x64_1x1x1_0_tnn_align16_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x256x64_1x1x1_0_tnn_align16_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -819,7 +819,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x32spgemm_f16_f16_f32_void_f16_128x } // 5. -namespace cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_void_f16_256x64x64_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x64x64_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -861,7 +861,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_void_f16_256x6 } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_void_f16_256x64x128_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x64x128_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -903,7 +903,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_void_f16_256x6 } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_void_f16_256x128x64_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x128x64_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -945,7 +945,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_void_f16_256x } // 8. 
-namespace cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_void_f16_256x128x128_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x128x128_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -987,7 +987,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_void_f16_256x } // 9. -namespace cutlass3x_sm100_sptensorop_s256x192x32spgemm_f16_f16_f32_void_f16_256x192x64_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x192x64_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1029,7 +1029,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x32spgemm_f16_f16_f32_void_f16_256x } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_void_f16_256x256x128_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x256x128_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1071,7 +1071,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_void_f16_256x } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_void_f16_256x256x64_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x256x64_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1113,9 +1113,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_void_f16_256x } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x64x32spgemm_f16_f16_f32_void_f16_128x64x64_1x1x1_0_tnn_align16_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x64x64_1x1x1_0_tnn_align16_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x32spgemm_f16_f16_f32_void_f16_128x64x64_1x1x1_0_tnn_align16_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x64x64_1x1x1_0_tnn_align16_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1126,9 +1126,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x32spgemm_f16_f16_f32_void_f16_128x64x64_ } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x128x32spgemm_f16_f16_f32_void_f16_128x128x64_1x1x1_0_tnn_align16_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x128x64_1x1x1_0_tnn_align16_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x32spgemm_f16_f16_f32_void_f16_128x128x64_1x1x1_0_tnn_align16_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x128x64_1x1x1_0_tnn_align16_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1139,9 +1139,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x32spgemm_f16_f16_f32_void_f16_128x128x6 } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x32spgemm_f16_f16_f32_void_f16_128x192x64_1x1x1_0_tnn_align16_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x192x64_1x1x1_0_tnn_align16_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x32spgemm_f16_f16_f32_void_f16_128x192x64_1x1x1_0_tnn_align16_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x192x64_1x1x1_0_tnn_align16_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1152,9 +1152,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x32spgemm_f16_f16_f32_void_f16_128x192x6 } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s128x256x32spgemm_f16_f16_f32_void_f16_128x256x64_1x1x1_0_tnn_align16_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x256x64_1x1x1_0_tnn_align16_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x32spgemm_f16_f16_f32_void_f16_128x256x64_1x1x1_0_tnn_align16_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x256x64_1x1x1_0_tnn_align16_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1165,9 +1165,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x32spgemm_f16_f16_f32_void_f16_128x256x6 } // 5. -TEST(cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_void_f16_256x64x64_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x64x64_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_void_f16_256x64x64_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x64x64_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1178,9 +1178,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_void_f16_256x64x64_ } // 6. -TEST(cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_void_f16_256x64x128_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x64x128_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_void_f16_256x64x128_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x64x128_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1191,9 +1191,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_void_f16_256x64x128 } // 7. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_void_f16_256x128x64_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x128x64_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_void_f16_256x128x64_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x128x64_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1204,9 +1204,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_void_f16_256x128x6 } // 8. -TEST(cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_void_f16_256x128x128_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x128x128_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_void_f16_256x128x128_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x128x128_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1217,9 +1217,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_void_f16_256x128x1 } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x32spgemm_f16_f16_f32_void_f16_256x192x64_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x192x64_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x32spgemm_f16_f16_f32_void_f16_256x192x64_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x192x64_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1230,9 +1230,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x32spgemm_f16_f16_f32_void_f16_256x192x6 } // 10. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_void_f16_256x256x64_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x256x64_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_void_f16_256x256x64_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x256x64_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1243,9 +1243,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_void_f16_256x256x6 } // 11. -TEST(cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_void_f16_256x256x128_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x256x128_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_void_f16_256x256x128_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x256x128_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f32_f32_f32_f32_f32_tfmma.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f32_f32_f32_f32_f32_tfmma.cu index 92671e67..a6837005 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f32_f32_f32_f32_f32_tfmma.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f32_f32_f32_f32_f32_tfmma.cu @@ -47,7 +47,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x64x16spgemm_f32_f32_f32_f32_f32_128x64x32_1x1x1_0_tnn_align8_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x64x32_1x1x1_0_tnn_align8_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -89,7 +89,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x16spgemm_f32_f32_f32_f32_f32_128x64 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x16spgemm_f32_f32_f32_f32_f32_128x128x32_1x1x1_0_tnn_align8_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x128x32_1x1x1_0_tnn_align8_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -131,7 +131,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x16spgemm_f32_f32_f32_f32_f32_128x1 } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x16spgemm_f32_f32_f32_f32_f32_128x192x32_1x1x1_0_tnn_align8_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x192x32_1x1x1_0_tnn_align8_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -173,7 +173,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x16spgemm_f32_f32_f32_f32_f32_128x1 } // 4. -namespace cutlass3x_sm100_sptensorop_s128x256x16spgemm_f32_f32_f32_f32_f32_128x256x32_1x1x1_0_tnn_align8_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x256x32_1x1x1_0_tnn_align8_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -215,7 +215,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x16spgemm_f32_f32_f32_f32_f32_128x2 } // 5. 
-namespace cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_f32_f32_256x64x32_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x64x32_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -257,7 +257,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_f32_f32_256x64 } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_f32_f32_256x64x64_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x64x64_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -299,7 +299,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_f32_f32_256x64 } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_f32_f32_256x128x32_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x128x32_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -341,7 +341,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_f32_f32_256x1 } // 8. -namespace cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_f32_f32_256x128x64_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x128x64_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -383,7 +383,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_f32_f32_256x1 } // 9. 
-namespace cutlass3x_sm100_sptensorop_s256x192x16spgemm_f32_f32_f32_f32_f32_256x192x32_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x192x32_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -425,7 +425,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x16spgemm_f32_f32_f32_f32_f32_256x1 } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_f32_f32_256x256x32_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x256x32_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -467,7 +467,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_f32_f32_256x2 } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_f32_f32_256x256x64_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x256x64_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -509,9 +509,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_f32_f32_256x2 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x64x16spgemm_f32_f32_f32_f32_f32_128x64x32_1x1x1_0_tnn_align8_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x64x32_1x1x1_0_tnn_align8_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x16spgemm_f32_f32_f32_f32_f32_128x64x32_1x1x1_0_tnn_align8_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x64x32_1x1x1_0_tnn_align8_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -522,9 +522,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x16spgemm_f32_f32_f32_f32_f32_128x64x32_1 } // 2. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x16spgemm_f32_f32_f32_f32_f32_128x128x32_1x1x1_0_tnn_align8_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x128x32_1x1x1_0_tnn_align8_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x16spgemm_f32_f32_f32_f32_f32_128x128x32_1x1x1_0_tnn_align8_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x128x32_1x1x1_0_tnn_align8_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -535,9 +535,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x16spgemm_f32_f32_f32_f32_f32_128x128x32 } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x16spgemm_f32_f32_f32_f32_f32_128x192x32_1x1x1_0_tnn_align8_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x192x32_1x1x1_0_tnn_align8_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x16spgemm_f32_f32_f32_f32_f32_128x192x32_1x1x1_0_tnn_align8_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x192x32_1x1x1_0_tnn_align8_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -548,9 +548,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x16spgemm_f32_f32_f32_f32_f32_128x192x32 } // 4. -TEST(cutlass3x_sm100_sptensorop_s128x256x16spgemm_f32_f32_f32_f32_f32_128x256x32_1x1x1_0_tnn_align8_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x256x32_1x1x1_0_tnn_align8_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x16spgemm_f32_f32_f32_f32_f32_128x256x32_1x1x1_0_tnn_align8_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x256x32_1x1x1_0_tnn_align8_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -561,9 +561,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x16spgemm_f32_f32_f32_f32_f32_128x256x32 } // 5. 
-TEST(cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_f32_f32_256x64x32_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x64x32_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_f32_f32_256x64x32_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x64x32_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -574,9 +574,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_f32_f32_256x64x32_2 } //6. -TEST(cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_f32_f32_256x64x64_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x64x64_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_f32_f32_256x64x64_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x64x64_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -587,9 +587,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_f32_f32_256x64x64_2 } // 7. -TEST(cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_f32_f32_256x128x32_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x128x32_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_f32_f32_256x128x32_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x128x32_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -600,9 +600,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_f32_f32_256x128x32 } // 8. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_f32_f32_256x128x64_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x128x64_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_f32_f32_256x128x64_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x128x64_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -613,9 +613,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_f32_f32_256x128x64 } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x16spgemm_f32_f32_f32_f32_f32_256x192x32_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x192x32_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x16spgemm_f32_f32_f32_f32_f32_256x192x32_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x192x32_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -626,9 +626,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x16spgemm_f32_f32_f32_f32_f32_256x192x32 } // 10. -TEST(cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_f32_f32_256x256x32_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x256x32_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_f32_f32_256x256x32_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x256x32_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -639,9 +639,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_f32_f32_256x256x32 } // 11. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_f32_f32_256x256x64_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x256x64_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_f32_f32_256x256x64_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x256x64_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -652,7 +652,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_f32_f32_256x256x64 } // 1. -namespace cutlass3x_sm100_sptensorop_s128x64x16spgemm_f32_f32_f32_void_f32_128x64x32_1x1x1_0_tnn_align8_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x64x32_1x1x1_0_tnn_align8_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -694,7 +694,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x16spgemm_f32_f32_f32_void_f32_128x6 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x16spgemm_f32_f32_f32_void_f32_128x128x32_1x1x1_0_tnn_align8_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x128x32_1x1x1_0_tnn_align8_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -736,7 +736,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x16spgemm_f32_f32_f32_void_f32_128x } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x16spgemm_f32_f32_f32_void_f32_128x192x32_1x1x1_0_tnn_align8_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x192x32_1x1x1_0_tnn_align8_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -778,7 +778,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x16spgemm_f32_f32_f32_void_f32_128x } // 4. 
-namespace cutlass3x_sm100_sptensorop_s128x256x16spgemm_f32_f32_f32_void_f32_128x256x32_1x1x1_0_tnn_align8_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x256x32_1x1x1_0_tnn_align8_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -820,7 +820,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x16spgemm_f32_f32_f32_void_f32_128x } // 5. -namespace cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_void_f32_256x64x32_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x64x32_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -862,7 +862,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_void_f32_256x6 } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_void_f32_256x64x64_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x64x64_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -904,7 +904,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_void_f32_256x6 } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_void_f32_256x128x32_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x128x32_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -946,7 +946,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_void_f32_256x } // 8. 
-namespace cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_void_f32_256x128x64_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x128x64_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -988,7 +988,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_void_f32_256x } // 9. -namespace cutlass3x_sm100_sptensorop_s256x192x16spgemm_f32_f32_f32_void_f32_256x192x32_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x192x32_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1030,7 +1030,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x16spgemm_f32_f32_f32_void_f32_256x } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_void_f32_256x256x64_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x256x64_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1072,7 +1072,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_void_f32_256x } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_void_f32_256x256x32_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x256x32_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1114,9 +1114,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_void_f32_256x } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x64x16spgemm_f32_f32_f32_void_f32_128x64x32_1x1x1_0_tnn_align8_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x64x32_1x1x1_0_tnn_align8_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x16spgemm_f32_f32_f32_void_f32_128x64x32_1x1x1_0_tnn_align8_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x64x32_1x1x1_0_tnn_align8_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1127,9 +1127,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x16spgemm_f32_f32_f32_void_f32_128x64x32_ } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x128x16spgemm_f32_f32_f32_void_f32_128x128x32_1x1x1_0_tnn_align8_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x128x32_1x1x1_0_tnn_align8_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x16spgemm_f32_f32_f32_void_f32_128x128x32_1x1x1_0_tnn_align8_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x128x32_1x1x1_0_tnn_align8_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1140,9 +1140,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x16spgemm_f32_f32_f32_void_f32_128x128x3 } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x16spgemm_f32_f32_f32_void_f32_128x192x32_1x1x1_0_tnn_align8_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x192x32_1x1x1_0_tnn_align8_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x16spgemm_f32_f32_f32_void_f32_128x192x32_1x1x1_0_tnn_align8_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x192x32_1x1x1_0_tnn_align8_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1153,9 +1153,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x16spgemm_f32_f32_f32_void_f32_128x192x3 } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s128x256x16spgemm_f32_f32_f32_void_f32_128x256x32_1x1x1_0_tnn_align8_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x256x32_1x1x1_0_tnn_align8_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x16spgemm_f32_f32_f32_void_f32_128x256x32_1x1x1_0_tnn_align8_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x256x32_1x1x1_0_tnn_align8_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1166,9 +1166,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x16spgemm_f32_f32_f32_void_f32_128x256x3 } // 5. -TEST(cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_void_f32_256x64x32_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x64x32_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_void_f32_256x64x32_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x64x32_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1179,9 +1179,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_void_f32_256x64x32_ } // 6. -TEST(cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_void_f32_256x64x64_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x64x64_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_void_f32_256x64x64_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x64x64_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1192,9 +1192,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_void_f32_256x64x64_ } // 7. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_void_f32_256x128x32_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x128x32_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_void_f32_256x128x32_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x128x32_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1205,9 +1205,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_void_f32_256x128x3 } // 8. -TEST(cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_void_f32_256x128x64_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x128x64_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_void_f32_256x128x64_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x128x64_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1218,9 +1218,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_void_f32_256x128x6 } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x16spgemm_f32_f32_f32_void_f32_256x192x32_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x192x32_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x16spgemm_f32_f32_f32_void_f32_256x192x32_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x192x32_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1231,9 +1231,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x16spgemm_f32_f32_f32_void_f32_256x192x3 } // 10. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_void_f32_256x256x32_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x256x32_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_void_f32_256x256x32_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x256x32_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1244,9 +1244,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_void_f32_256x256x3 } // 11. -TEST(cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_void_f32_256x256x64_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x256x64_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_void_f32_256x256x64_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x256x64_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f16_f16_qmma.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f16_f16_qmma.cu index 12b22760..99644205 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f16_f16_qmma.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f16_f16_qmma.cu @@ -48,7 +48,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f16_f16_128x64x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x64x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -90,7 +90,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f16_f16_128x } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f16_f16_128x128x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x128x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f16_f16_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f16_f16_128x192x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x192x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -174,7 +174,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f16_f16_128 } // 4. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f16_f16_128x256x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x256x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -216,7 +216,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f16_f16_128 } // 5. 
-namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_f16_256x64x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x64x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -258,7 +258,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_f16_256x } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_f16_256x64x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x64x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -300,7 +300,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_f16_256x } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_f16_256x128x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x128x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -342,7 +342,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_f16_256 } // 8. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_f16_256x128x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x128x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_f16_256 } // 9. 
-namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f16_f16_256x192x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x192x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -426,7 +426,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f16_f16_256 } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_f16_256x256x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x256x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -468,7 +468,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_f16_256 } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_f16_256x256x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x256x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -510,9 +510,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_f16_256 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f16_f16_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f16_f16_128x64x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x64x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -523,9 +523,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f16_f16_128x64x12 } // 2. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f16_f16_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f16_f16_128x128x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x128x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -536,9 +536,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f16_f16_128x128x } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f16_f16_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f16_f16_128x192x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x192x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -549,9 +549,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f16_f16_128x192x } // 4. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f16_f16_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f16_f16_128x256x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x256x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -562,9 +562,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f16_f16_128x256x } // 5. 
-TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_f16_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_f16_256x64x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x64x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -575,9 +575,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_f16_256x64x12 } //6. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_f16_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_f16_256x64x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x64x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -588,9 +588,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_f16_256x64x25 } // 7. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_f16_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_f16_256x128x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x128x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -601,9 +601,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_f16_256x128x } // 8. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_f16_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_f16_256x128x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x128x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -614,9 +614,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_f16_256x128x } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f16_f16_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f16_f16_256x192x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x192x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -627,9 +627,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f16_f16_256x192x } // 10. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_f16_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_f16_256x256x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x256x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -640,9 +640,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_f16_256x256x } // 11. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_f16_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_f16_256x256x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x256x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -653,7 +653,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_f16_256x256x } // 1. -namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_f16_128x64x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x64x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -695,7 +695,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_f16_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_f16_128x128x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x128x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -737,7 +737,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_f16_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_f16_128x192x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x192x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -779,7 +779,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_f16_12 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_f16_128x256x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x256x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -821,7 +821,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_f16_12 } // 5. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f16_256x64x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x64x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -863,7 +863,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f16_256 } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f16_256x64x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x64x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -905,7 +905,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f16_256 } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f16_256x128x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x128x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -947,7 +947,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f16_25 } // 8. 
-namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f16_256x128x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x128x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -989,7 +989,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f16_25 } // 9. -namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_f16_256x192x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x192x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1031,7 +1031,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_f16_25 } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f16_256x256x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x256x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1073,7 +1073,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f16_25 } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f16_256x256x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x256x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1115,9 +1115,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f16_25 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_f16_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_f16_128x64x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x64x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1128,9 +1128,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_f16_128x64x1 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_f16_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_f16_128x128x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x128x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1141,9 +1141,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_f16_128x128 } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_f16_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_f16_128x192x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x192x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1154,9 +1154,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_f16_128x192 } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_f16_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_f16_128x256x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x256x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1167,9 +1167,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_f16_128x256 } // 5. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f16_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f16_256x64x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x64x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1180,9 +1180,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f16_256x64x1 } // 6. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f16_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f16_256x64x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x64x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1193,9 +1193,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f16_256x64x2 } // 7. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f16_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f16_256x128x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x128x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1206,9 +1206,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f16_256x128 } // 8. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f16_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f16_256x128x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x128x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1219,9 +1219,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f16_256x128 } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_f16_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_f16_256x192x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x192x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1232,9 +1232,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_f16_256x192 } // 10. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f16_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f16_256x256x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x256x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1245,9 +1245,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f16_256x256 } // 11. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f16_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f16_256x256x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x256x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f16_f8_qmma.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f16_f8_qmma.cu index 43a5222a..d103086c 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f16_f8_qmma.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f16_f8_qmma.cu @@ -48,7 +48,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -90,7 +90,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -174,7 +174,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f16_e4m3_12 } // 4. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -216,7 +216,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_12 } // 5. 
-namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -258,7 +258,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_256 } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -300,7 +300,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_256 } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -342,7 +342,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_25 } // 8. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_25 } // 9. 
-namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -426,7 +426,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f16_e4m3_25 } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -468,7 +468,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_25 } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -510,9 +510,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -523,9 +523,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x64x1 } // 2. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -536,9 +536,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x128 } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -549,9 +549,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x192 } // 4. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -562,9 +562,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x256 } // 5. 
-TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -575,9 +575,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x1 } //6. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -588,9 +588,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x2 } // 7. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -601,9 +601,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x128 } // 8. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -614,9 +614,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x128 } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -627,9 +627,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x192 } // 10. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -640,9 +640,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x256 } // 11. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -653,7 +653,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x256 } // 1. -namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -695,7 +695,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_e4m3_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -737,7 +737,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_e4m3_1 } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -779,7 +779,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_e4m3_1 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -821,7 +821,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_e4m3_1 } // 5. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -863,7 +863,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_e4m3_25 } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -905,7 +905,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_e4m3_25 } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -947,7 +947,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_e4m3_2 } // 8. 
-namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -989,7 +989,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_e4m3_2 } // 9. -namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1031,7 +1031,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_e4m3_2 } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1073,7 +1073,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_e4m3_2 } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1115,9 +1115,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_e4m3_2 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1128,9 +1128,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_e4m3_128x64x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1141,9 +1141,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_e4m3_128x12 } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1154,9 +1154,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_e4m3_128x19 } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1167,9 +1167,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_e4m3_128x25 } // 5. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1180,9 +1180,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_e4m3_256x64x } // 6. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1193,9 +1193,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_e4m3_256x64x } // 7. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1206,9 +1206,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_e4m3_256x12 } // 8. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1219,9 +1219,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_e4m3_256x12 } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1232,9 +1232,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_e4m3_256x19 } // 10. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1245,9 +1245,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_e4m3_256x25 } // 11. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f32_f32_qmma.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f32_f32_qmma.cu index 71a54d0b..fb1160ab 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f32_f32_qmma.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f32_f32_qmma.cu @@ -48,7 +48,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f32_f32_128x64x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x64x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -90,7 +90,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f32_f32_128x } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f32_f32_128x128x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x128x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f32_f32_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f32_f32_128x192x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x192x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -174,7 +174,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f32_f32_128 } // 4. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f32_f32_128x256x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x256x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -216,7 +216,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f32_f32_128 } // 5. 
-namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f32_f32_256x64x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x64x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -258,7 +258,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f32_f32_256x } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f32_f32_256x64x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x64x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -300,7 +300,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f32_f32_256x } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f32_f32_256x128x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x128x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -342,7 +342,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f32_f32_256 } // 8. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f32_f32_256x128x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x128x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f32_f32_256 } // 9. 
-namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f32_f32_256x192x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x192x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -426,7 +426,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f32_f32_256 } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f32_f32_256x256x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x256x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -468,7 +468,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f32_f32_256 } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f32_f32_256x256x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x256x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -510,9 +510,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f32_f32_256 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f32_f32_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f32_f32_128x64x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x64x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -523,9 +523,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f32_f32_128x64x12 } // 2. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f32_f32_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f32_f32_128x128x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x128x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -536,9 +536,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f32_f32_128x128x } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f32_f32_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f32_f32_128x192x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x192x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -549,9 +549,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f32_f32_128x192x } // 4. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f32_f32_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f32_f32_128x256x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x256x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -561,9 +561,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f32_f32_128x256x {512})); } // 5. 
-TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f32_f32_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f32_f32_256x64x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x64x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -574,9 +574,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f32_f32_256x64x12 } //6. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f32_f32_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f32_f32_256x64x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x64x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -587,9 +587,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f32_f32_256x64x25 } // 7. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f32_f32_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f32_f32_256x128x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x128x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -600,9 +600,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f32_f32_256x128x } // 8. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f32_f32_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f32_f32_256x128x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x128x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -612,9 +612,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f32_f32_256x128x {512})); } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f32_f32_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f32_f32_256x192x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x192x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -625,9 +625,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f32_f32_256x192x } // 10. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f32_f32_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f32_f32_256x256x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x256x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -638,9 +638,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f32_f32_256x256x } // 11. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f32_f32_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f32_f32_256x256x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x256x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -651,7 +651,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f32_f32_256x256x } // 1. -namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_f32_128x64x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x64x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -693,7 +693,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_f32_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_f32_128x128x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x128x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -735,7 +735,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_f32_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_f32_128x192x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x192x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -777,7 +777,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_f32_12 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_f32_128x256x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x256x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -819,7 +819,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_f32_12 } // 5. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f32_256x64x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x64x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -861,7 +861,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f32_256 } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f32_256x64x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x64x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -903,7 +903,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f32_256 } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f32_256x128x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x128x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -945,7 +945,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f32_25 } // 8. 
-namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f32_256x128x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x128x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -987,7 +987,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f32_25 } // 9. -namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_f32_256x192x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x192x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1029,7 +1029,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_f32_25 } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f32_256x256x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x256x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1071,7 +1071,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f32_25 } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f32_256x256x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x256x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1113,9 +1113,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f32_25 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_f32_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_f32_128x64x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x64x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1126,9 +1126,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_f32_128x64x1 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_f32_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_f32_128x128x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x128x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1138,9 +1138,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_f32_128x128 {512})); } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_f32_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_f32_128x192x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x192x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1151,9 +1151,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_f32_128x192 } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_f32_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_f32_128x256x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x256x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1164,9 +1164,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_f32_128x256 } // 5. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f32_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f32_256x64x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x64x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1176,9 +1176,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f32_256x64x1 {512})); } // 6. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f32_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f32_256x64x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x64x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1188,9 +1188,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f32_256x64x2 {512})); } // 7. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f32_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f32_256x128x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x128x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1201,9 +1201,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f32_256x128 } // 8. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f32_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f32_256x128x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x128x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1214,9 +1214,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f32_256x128 } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_f32_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_f32_256x192x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x192x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1227,9 +1227,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_f32_256x192 } // 10. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f32_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f32_256x256x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x256x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1239,9 +1239,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f32_256x256 {512})); } // 11. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f32_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f32_256x256x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x256x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_s8_s8_s32_s8_s8_imma.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_s8_s8_s32_s8_s8_imma.cu index 0ec92dd9..54af8b29 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_s8_s8_s32_s8_s8_imma.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_s8_s8_s32_s8_s8_imma.cu @@ -47,7 +47,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_s8_s8_s32_s8_s8_128x64x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x64x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -89,7 +89,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_s8_s8_s32_s8_s8_128x64x128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_s8_s8_s32_s8_s8_128x128x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x128x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -131,7 +131,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_s8_s8_s32_s8_s8_128x128x1 } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_s8_s8_s32_s8_s8_128x192x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x192x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -173,7 +173,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_s8_s8_s32_s8_s8_128x192x1 } // 4. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_s8_s8_s32_s8_s8_128x256x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x256x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -215,7 +215,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_s8_s8_s32_s8_s8_128x256x1 } // 5. 
-namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_s8_s8_256x64x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x64x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -257,7 +257,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_s8_s8_256x64x128 } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_s8_s8_256x64x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x64x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -299,7 +299,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_s8_s8_256x64x256 } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_s8_s8_256x128x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x128x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -341,7 +341,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_s8_s8_256x128x1 } // 8. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_s8_s8_256x128x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x128x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -383,7 +383,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_s8_s8_256x128x2 } // 9. 
-namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_s8_s8_s32_s8_s8_256x192x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x192x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -425,7 +425,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_s8_s8_s32_s8_s8_256x192x1 } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_s8_s8_256x256x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x256x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -467,7 +467,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_s8_s8_256x256x1 } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_s8_s8_256x256x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x256x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -509,9 +509,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_s8_s8_256x256x2 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_s8_s8_s32_s8_s8_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x64spgemm_s8_s8_s32_s8_s8_128x64x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x64x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -522,9 +522,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_s8_s8_s32_s8_s8_128x64x128_1x1x } // 2. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_s8_s8_s32_s8_s8_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_s8_s8_s32_s8_s8_128x128x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x128x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -535,9 +535,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_s8_s8_s32_s8_s8_128x128x128_1x } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_s8_s8_s32_s8_s8_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x64spgemm_s8_s8_s32_s8_s8_128x192x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x192x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -548,9 +548,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_s8_s8_s32_s8_s8_128x192x128_1x } // 4. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_s8_s8_s32_s8_s8_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_s8_s8_s32_s8_s8_128x256x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x256x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -561,9 +561,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_s8_s8_s32_s8_s8_128x256x128_1x } // 5. 
-TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_s8_s8_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_s8_s8_256x64x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x64x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -574,9 +574,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_s8_s8_256x64x128_2x1x } //6. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_s8_s8_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_s8_s8_256x64x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x64x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -587,9 +587,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_s8_s8_256x64x256_2x1x } // 7. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_s8_s8_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_s8_s8_256x128x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x128x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -600,9 +600,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_s8_s8_256x128x128_2x } // 8. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_s8_s8_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_s8_s8_256x128x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x128x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -613,9 +613,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_s8_s8_256x128x256_2x } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_s8_s8_s32_s8_s8_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x64spgemm_s8_s8_s32_s8_s8_256x192x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x192x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -626,9 +626,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_s8_s8_s32_s8_s8_256x192x128_2x } // 10. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_s8_s8_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_s8_s8_256x256x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x256x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -639,9 +639,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_s8_s8_256x256x128_2x } // 11. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_s8_s8_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_s8_s8_256x256x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x256x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -652,7 +652,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_s8_s8_256x256x256_2x } // 1. -namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_s8_s8_s32_void_s8_128x64x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x64x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -694,7 +694,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_s8_s8_s32_void_s8_128x64x1 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_s8_s8_s32_void_s8_128x128x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x128x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -736,7 +736,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_s8_s8_s32_void_s8_128x128 } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_s8_s8_s32_void_s8_128x192x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x192x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -778,7 +778,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_s8_s8_s32_void_s8_128x192 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_s8_s8_s32_void_s8_128x256x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x256x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -820,7 +820,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_s8_s8_s32_void_s8_128x256 } // 5. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_void_s8_256x64x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x64x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -862,7 +862,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_void_s8_256x64x1 } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_void_s8_256x64x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x64x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -904,7 +904,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_void_s8_256x64x2 } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_void_s8_256x128x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x128x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -946,7 +946,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_void_s8_256x128 } // 8. 
-namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_void_s8_256x128x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x128x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -988,7 +988,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_void_s8_256x128 } // 9. -namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_s8_s8_s32_void_s8_256x192x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x192x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1030,7 +1030,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_s8_s8_s32_void_s8_256x192 } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_void_s8_256x256x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x256x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1072,7 +1072,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_void_s8_256x256 } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_void_s8_256x256x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x256x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1114,9 +1114,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_void_s8_256x256 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_s8_s8_s32_void_s8_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x64spgemm_s8_s8_s32_void_s8_128x64x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x64x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1127,9 +1127,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_s8_s8_s32_void_s8_128x64x128_1x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_s8_s8_s32_void_s8_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_s8_s8_s32_void_s8_128x128x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x128x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1140,9 +1140,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_s8_s8_s32_void_s8_128x128x128_ } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_s8_s8_s32_void_s8_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x64spgemm_s8_s8_s32_void_s8_128x192x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x192x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1153,9 +1153,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_s8_s8_s32_void_s8_128x192x128_ } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_s8_s8_s32_void_s8_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_s8_s8_s32_void_s8_128x256x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x256x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1166,9 +1166,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_s8_s8_s32_void_s8_128x256x128_ } // 5. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_void_s8_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_void_s8_256x64x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x64x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1179,9 +1179,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_void_s8_256x64x128_2x } // 6. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_void_s8_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_void_s8_256x64x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x64x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1192,9 +1192,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_void_s8_256x64x256_2x } // 7. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_void_s8_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_void_s8_256x128x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x128x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1205,9 +1205,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_void_s8_256x128x128_ } // 8. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_void_s8_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_void_s8_256x128x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x128x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1218,9 +1218,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_void_s8_256x128x256_ } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_s8_s8_s32_void_s8_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x64spgemm_s8_s8_s32_void_s8_256x192x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x192x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1231,9 +1231,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_s8_s8_s32_void_s8_256x192x128_ } // 10. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_void_s8_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_void_s8_256x256x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x256x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1244,9 +1244,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_void_s8_256x256x128_ } // 11. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_void_s8_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_void_s8_256x256x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x256x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, diff --git a/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_fp32.cu b/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_fp32.cu index 63956c90..f686f713 100644 --- a/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_fp32.cu +++ b/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_fp32.cu @@ -550,5 +550,77 @@ TEST(SM90_Device_Gemm_e4m3t_e4m3n_f32t_tensor_op_gmma_f32, 64x128x128_tma_epilog EXPECT_TRUE(test::gemm::device::TestAll()); } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) +TEST(SM90_Device_Gemm_e4m3t_e4m3n_f32t_tensor_op_gmma_f32, 128x56x128_tma_epilogue_fp8_fast_accum) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::ColumnMajor; + using TileMNK = Shape<_128,_56,_128>; + using EpilogueOp = typename cutlass::epilogue::collective::CollectiveBuilder< + 
cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + TileMNK, Shape<_1,_1,_1>, + cutlass::epilogue::collective::EpilogueTileAuto, + float, float, + void, LayoutC, 4, + cutlass::half_t, LayoutC, 8, + cutlass::epilogue::TmaWarpSpecialized + >::CollectiveOp; + + using CollectiveOp = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + cutlass::float_e4m3_t, LayoutA, 16, + cutlass::float_e4m3_t, LayoutB, 16, + float, + TileMNK, Shape<_1,_1,_1>, + cutlass::gemm::collective::StageCountAutoCarveout, + cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum + >::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveOp, + EpilogueOp + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + EXPECT_TRUE(test::gemm::device::TestAll()); +} + + TEST(SM90_Device_Gemm_e4m3t_e4m3n_f32t_tensor_op_gmma_f32, 128x112x128_tma_epilogue_fp8_fast_accum) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::ColumnMajor; + using TileMNK = Shape<_128,_112,_128>; + + using EpilogueOp = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + TileMNK, Shape<_1,_1,_1>, + cutlass::epilogue::collective::EpilogueTileAuto, + float, float, + void, LayoutC, 4, + cutlass::half_t, LayoutC, 8, + cutlass::epilogue::TmaWarpSpecialized + >::CollectiveOp; + + using CollectiveOp = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + cutlass::float_e4m3_t, LayoutA, 16, + cutlass::float_e4m3_t, LayoutB, 16, + float, + TileMNK, Shape<_1,_1,_1>, + cutlass::gemm::collective::StageCountAutoCarveout, + cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum + >::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveOp, + EpilogueOp + >; + + using Gemm = 
cutlass::gemm::device::GemmUniversalAdapter; + EXPECT_TRUE(test::gemm::device::TestAll()); +} +#endif // defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) #endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
  • 1528ZwgMp`F-ibB#ntdk^9;CEq~-bCHz$h=OXvXGH^uALejtcFJ! z^bQ?bBX4qX%f$qqE1I&~1io%EIEL*d<73ikV@(G7CQNVd<)*9|hnw^emvh&H{xbOh zYiOhnsp1d>0Vd zeldi{^0r(@33`yyA9n7WU_rK$bo5GCKLMfv<6R89@)yi&`c6c#^2mgNCb3Hs9<6%RtE7wkrI1^Ly!YN!l#N7iWwi`*)t{KWlDXn zx#GpMa)(2!d{1!*ili)du>59k*U9M)kr_F11&~tTDVSF&l2rnLln=3IJQT)CPJjk3 z#umS4(BE_>0O|!!O=)O$<{I+Ia8Y78{K+amI+p$BwcD`2jaidWNTcRNb$Sj=fNsgm zm=yO582mDuXQU(`pF<8%`8>{H1Jt&OzgUHd3y~wyd)a!gYByUM#a!=$bQ-0=moz{C=O^E(<;(L-oGf*F!!7lIwK~EzyIGmizmR2rm?+m?XF0jq;fU`=B$Mb^e z#ifZ4=+k@DX227ZuSA9oE>zaX%d3k01#n0%aw?0wVKn18+CrW{GAVS<2|r{Hr392R zJB1a`NbIPQ%?i=?XwP~szyk?b+dfbdO>%XoyG(C{jQN!QCKRZ?2PBY?c-es^r~bJd?hq&c>HTp+*pY>aM0VtvB6H4;UGz!yEoiSFoy_I?~X?$COn9sqg>ei4IwWbI)B`gsMKU&dv+ zsCd(_yHn~AY!}5c$ZncxASzwNQ0jm2+HWTT&_?o+_H`)%0{&Q?sBdSEoYA`VKV`=2U>o_ipZnxX7HOiqHPAB&r0%j^1Gai8^gP^cLFUR zhh*)&Oce>6BPOnH#(g0Eu5x%sBYss&=$*!?5RLbxrCQ1U2as;$7g?Lg(R{2e+5Ye{ z?k7{=psC;l#Im~=3&M=^jRY9oocvhIL6^l}!Weub$;Q==IQHFj-b3d~28G57_=Ccg z5>$lpHS58&P?x!BwtL^7exQrof`Sy33&ai+*GPWRSP{I}P!*clef@zFUzKx=!;M4CO>=s%s#Zg0jv zJ>v8qm>9pk#>a9gfRZ`%L&aknjtpJog2*(OiK#yxvArcUkX_>ZP%Y1H4+r>No{!)` z5#g z^cklc_TQrMS#68Fb%v`Y6xvxPkmTK)bf`C}t0y$v;TKeckxMv-a0iS_jfs{g0ByB) zJWT<7sxTPtyQ70xD;a&_&}#13l8e{BLm(bzhFT;&$t5T?9`UbkCw_)`^!y{0$iiTN z^>sY}Hx%k9D$XYT{saz_!~`laTmJDk%HzsqTPyxQV^z>y?X~GQW66}xztv^RrFUA4 z%>R{(vT?KhqfJPCSq$h{J}K*LW4|Rap#(gam^tjaHp(!O-z*IM)Kco;nVhiI?b|p9 zqNEhfFwz2b(meGK_luvV{hn1IbBGKpA7v6$#xCeLk6u`lUys7f zH@dR%;4cv!@`4G#EJIBFht6!L>-%v5vU#n(thxXw(KL8HKENcbJP z^X_eK9tDFh<(*#N+c?k-wn5_~I+gq(#_Vl@uBL?zUe(6trq5*CLE5N>n)LHCV!OvL zJ9zI)@?>e)WS9|NZ}l}Gia0p?mdMorczD^3w7hOLcf+ZN6v#~SZ#X69<>FkFzTJhy zdS#4R=PSo%@<5UaArXQcAWviv@B^ZU=Icj8t@HHOTE4~;6G?pR4r^5<#-rjNh<}SP zt@NA0Q8>v}l5gRs3HunqI>Jy7z3*9N?u>l3(`0kGG=17|m%?fI`fe?l7z_6kHDbTQ zw;Xame41*l_-Zp)4&iROB6xPppjR9h?`fjAQqB7_LIjifqt;nHWt8b%+GEz$qlM~I z=2h1s3G+@Xb8}qsw)wI&gakvwVO1^IGV z2hHuh%U_sU!FSb924EY4L{N} zZ5Cg67#ySxTu5-k=#APneL;vZn`m7@Gt&)+{)=|;aw%^iGCXKs5%WZO 
zutfqJjNfhaM2wakQ$#7Cp6IT2=^7pVqU~nsY(0cqi=|n#hm24sG9OMpaxat_3iU9-dS6Hvfrx;Y;v{S zQ#jx0@y^eBQ}{+FSbYDk-=G3l(lU@0JY8uwYz}!S8&0$-?eTMnKe#*_GQ2bSV(U$Q zFEvvUz-dj3o>YsLR0~#Fc1GO))B=$!gik9Uor|d`Dc4uontbxXNgdS{sQ-+@#3*?J-{obFp$v|)M$SnRV|+&>LZiSJNneZu2v!DQ>*)p13$tnQM$*ZpM7kF ziAF=;j}eVko^yzrwo%r_Uvi-tN;(yOh_HyBL~K@~F!l>Pe zT=0EmM*lTX$EsssQd?HvI3KN?@deIYB|LS;h;8NR;3)xU&CE+iyG9kNr%k`RcMqr$ zM9!dK)Msk$UcQ#ij1bH8JKyJY8fmBe7~^+>MXYpIhl7KHo^CDOy;@>}qK2OfJAp^3 zFPWej82(>)3LgM&*=fs}>1 zma-F;gfG+>fi>)9Q3RCnX(t6Tmy5qsxtl^6O}orS#Fx2 zr(nGP6rDzJvgK8Ir_kbP(dra)uKjqLe3d=^#roU>E=7!(Q3!PCxxC;pfjtrP$I&ogrV-L={vYS z`4EBCVN`6{qW?})(^kLgrA$-db%Ys2(T_xS*T~_xj{VaQoI)^=8~$QTh=4NJ%dWc| zX$|&)oS=wKhgK7wzw8x@MVL>!vfA6@^N#W)z{!nYZ`t=tBG2APj^h+)k(X??K*N>w zs%_uMdqCxou?srz?9&n)FT0y#J@f^skEC3xcq1%|%o;8o+P=CwVL?C1zPoD8P8WlN zm0`7;-(=6NyPK|sJFM4Bk_7q5Jhs z+y1jj(s9YH^cST9am#lX_AxhP)515f|WONb=EZX*% zgZa*uDdfU6-ba`FbPloJAZ9ia>%Y6OI3 z#g)G(KAo0BSr0X__6S+HS)=tsC)j|y3n#@}`S0ZX;vVuTB+?VHys=#<^m=e`aBI5q z*v~SgA*s?H09+w_D&rNyA_uw{Fd`D#qYsUbFl9hny_zf?(Pq@=)6-%H*+ruI)k3mJ zv-=Dhgk{4NA1;`&c)qbr>@vHMdqDLCGDFLTMTz0OxvY95AxX|@>Mb$DhI!le$%|8N zAGVS#gA((;`zhoZfGFx6Z+>C-KAJyefo~7Kh(8OmgD8T6`Vcv1WrvCP=6`;UdLp}A zxYDLVmLsh>TgLMH$-%rQ2+A%poBa$$_OatRxW46vSpnVNhyp3;M?w-=M?*(W9R~?6 zg@U>qE0FJ%T!>eRDSS9#KDL`>2^RfEsuZ!@B>;ZU>?|a65s6L(uMU zZ&bhe8BuWU#<02McPh&d5E{z*)~AJ1viL3&9wru4dH8coR*)KbJ|?e|zZMuuaI{i_ ziTbA*LYQ3Rf4czcAk0guyBz!v%b;dL1KK7M4jkBnd_fH)Kg5q9-D1LqwIGU$Sd7yC z`#=eW5-M3)+avji1+5v&z{_^dy_e(p{XLF!#5<6E+{6Ei{w2ISIC?|kCyxn{Sp?OJ zX6g>t2RiQ18q~DpLE|p(nXV$R`KZxO6^F3glDs6s&Zhx{y-&R#{a1i3F94^hk=Xg$ zy*J-&zXOC($;ffg?D$R%@-9&X*~L2qrsoMYM>k>3_s=NwUg4{Jt;k~*7X)RANfpJ? 
zMfKn1NSV-5FXg{`=XchDNA!SE6O04yZ`%Uj8L9y4I$#G(b^nO>|1g}H4g}29s;d9s zkw4@RWlr%Ag(l55O#< z1u1k>->bjB|L2=pOh76x864F2ua_a=0A;WUC%Qkw@c&g5WD_0);~akJ)St)FHw6UC zvZ&zxN$?&8P$58%nN<^*#`z!S|3}_kPXrejt^Su3_TEK^zn1G?4*-d%3P@g$ziI#P zq5pfB?@t#L1f(D-6X*W+$35}}L5yl|GJF01r3HZ2k1+XPFN3B7i$F+-KmDJObl(x7 zCV+!cD#iF>{$D8q)_G}(7ystocgc8yK*RZ; zLZVLO;YfthH2vjfxZi1KdJRmo{E(XCej2|23vv#qRZ+IXsec^#2LUTTsu9BfdKs!4 z@D9z!R{ifo<3A8}0dTM>QTOpbbdQ<|@D9@=ujc+Qg1^f~SrI5$L}d2AUj@Fl>)OA$ z_uZj`3s;)O|L)uIAEO5FD4AsZ;=j)PU4j%hLD-~!FzK(wbG)KJ9RX_wHH-1hFA(iK zFBfw!yEKs`)y`{<)f(FBcE_m0vgUV2^&UE;o?(o*$))$f_N^VsOBxMch9LpFN6|r1 zz-PH8j&E<5FxszF$WZ<+Vf+VVgAZs#nEmKX{c)f`Qn113RVwX+#sw41%VEfQB{wL6UV^KlKj{3ULU z{o^qP`-ZEOopko+i1z|wE#&9BZJN*wf2c`h#f`PRN3{Q=0?;JzDIbCcB8t)_bFw$f zn>zFJ_!t(g$ETOLT0MN@M06*=f@DurtWBn`WbyLtHWmB+##t50{H7m~xdfvIW6h-V z=eX0>syt>F573L5F(MmnjOQBFzq!xlj7pS_qGN5ODu%ojC(TS|z^55S6KsMz-;gqgz*M2 zKW72HTa2JSask!2#2F5T5yiixO)(?of#owBbdZ3*BTj|nk8HRvZ|>S0_1>D(+o`@F zwA-z><7EPQxI0g^lU|>`+cd8CnW_oBmH~5x{l0#PCE8@TXL0-!&qob+xarTB3#xw% zNJ4^XJfkIBgV)OM_KZ%e7l8`6MdWuDe^1;S1VHZB&J_=r83g-~u$uH4z znw*S3dyDHuU~v;)b0&ZPcESDsSw41#C=xP7_B>IsX%+u9aA>Luz(H~FONKqlhDn_fi+Zz6H!+T%25qHKzD?{`@+xCD|KqqDTGvdAHTpEvh(|U3QGWl<>t(@ISHrJBfGuTPejt0PG&5 zCD{!U!7WwJ_* zwA*Cy{J=HRX>bV>dxBbq7U{)Ppf3LModXGlDr$!Rp|e;E)&TwBb=*4`pi@DXW7OrU zC0Y7badG4H{bFt8`DS)|ZB41Pu~G}E8fp6PXW>I~K>`%L8C5a%53m>=rw^S|==`L$ zzOKOfHs|}N5_`KGeenH<5MQA1=a=LSQ%o%#~~PEIE3(zSR1$N?A$Sjg?dS zOyYwJ%ylVkRFDx`Q(MmT$?9}Ps+2C)_nbK16!YQjhlF+8m={XvY7w3u?iW;&?tzlA-hN)?~jJ7*7~^1hVC+W&(D^+1^fDZ&Y(V>(cSUM zoEyK*peBDkim;AUhq5PiPAUKN&HTpP3R|?@MFNN{jj1gfU}^g_)Opo1m8LoiLPxXceQtK1*>n3?Cr8B=%KEgKImVY3KdHCT_;>` zi}ptVSB7K)AIbFou~BI8xaG9pW{8(F&GzSGw^t9TzKWDIwr(oOeBQ94!UL|;WH7bk zEU712fDT}f7s$=C-LiNOz)4>YH8XTxeKBd9GG|W9c$fc7QumM|ek>C87K)i~k91>i z+Ctm&mKm*zmpC4|=TioB@*NwiXVTuwuu#Q&hxME9{P31JL8E=KRBs`AvWGzw@` z5SOC%{Q6mlq6A4Bk(nx0yS6og7 zd5Z5syrzR-_+33shs~-rcKD%{()^_htwYLQ(MP%uzrLpa~8Mc>Toj|q|sW#ye;^mAFQcbV7TDa^NwZPk++ zbDUCtmlcy2J5(q_=`mD&RZ589r{@7cBzPW~)O;$~|8^1bs*q|BH5^ 
z+K^1D`5N&YmMv5o@&D0vm0?vbT~v-BttbjAEnU*x-QA6(bayKuN(s_%kZzEcMo~JX zrKP(|K;WC><=*dJ#UDP;5qQr#v1jkK*IqN@F#b?xqXlK3@ICoEY%cXMDM=7Ab7vGS zpF!51xpnI7P6%21@EiKr+>HM8V-q**)!CWbC|9Fc6<<%$FYpdjLY9tOj3=E^9MCof z-G=Qnf=*S&lO&$kR~}S+xLFtmb}cA_CDitMgX(RS_Ijy>Olv09S)5F!8i0)_ZKe9WnveVl-u<#Q+0JeT&AlBj2~cO)($!s%EGd;C(f^L zBm3)5F?Zf1i(04d#M=ve*!Aj+hja00iLsFlwIOsFM^f!yGz%;Y`OU+x@*aH^e#-C` zo$~#AcLHm-Vygph&G=K}PHe-+p`2)@P=AXtGrDRUGXzXRdO^_i7Xs6l`k_y$(Km1R z!GNGiXXGL-$bKYQ zZBbzIU}TbD)pi*ib8P`%i7|%xl?f9OEOfTG)WO%Zs5CA;r`u;ih%->b_j2zyJlxfM z!u5_Y!lyH4GEd4JKl@uo$%mxd{yZmd@K-{JUz$^Aw3xmOO*uj#{V3_Q+ z$H6X2n$1y;;^g2$m7go+OD6wNA{+w?E1o}1jfs{(yUfVsVY8|T{$E1paD-L=u8#Yv z=ovcoqLaR3Lx(jDetIoku0~_OC1ioOkvd+gQ`W!nKPTB;kH!ps}nJl@+g`wz`-D zsyKfQA*>T)XgaJqb|XYRS!-LR^+nfnbPfwUEm-coT`gTCD;jeAmLHs;I*vhTp{eWv zd&*XYkcQCvD4DD9qx9u2epR|OI|wbPpjN!dU-^LI?>mKfu~}>rWUISugacO(c8UMs zp?<;kH8;cX^1XlCB&@NK7*9FM$=}6(qLC+ogBXGX&uSU_tv1rg-?GKnQ2wV5^jF^q z>AC&36<=+azQ7Xlz`_V(vl1NuC%5*YRfMq;VhqomkK3FV$HLWTHM>2bwSLgCrx$RVD<^Y4R!QPkc>-4aXj&d$TU(oRBKJCH9D^;D$(2cM@h#1vPfWkN8^9H;#37w z^~Ouf_JC8^ZZxH%f(D2y0$0&l%8G^9{PC^G2a-neyZ>@Cm)!^P)}Ma>HAwC;WdZh+ ztq5;`)ebDUl?Ukj$VR@^;g;zc=0E(~Y5#kq6EL4XiRH1d?nV!f0W&Et2~$^s{ljxK zoz}66c{*a0tdlQ>ZgEskbts+ysVPoA^RVAf7wudeh50^q5r`RX12ho zdw`kW%%6s0=H z8r*k>^EWrU4#6TBV{;DhMg|gUTvH9_T1ge6w?{+8lLhv9S9Th9bYLl`g*COW^Dt8) z7?v-QO;4Y>I|;gDp=%|0C0vyQ={rJLux`=b53{b!=zAhi&)gyreSomKo`$Xa;`o5C zR;v-|_q}yLN1Pv)WRz{yJCL6)*9S$j)KQc%CS$@-IyL>vx4J`Y9-7SilULRv{1$<- zG=Xhs_7M2lO4F^5N#sEdi__H+YvYvTAKosXYyM(NezjfENAiZ<8;W2TD+#6u8#2bS zvi`O}eI9C%bKjaH{?CovzZnDZ-JDWa_RG^7McXnvd0@Jyl_rXO20vKI=GEufx0cU` z$2wke{2tU`M!W!oLsf;kq(3!?!3R~lhdrQOO`LuLhzEIr`wizCwNvXWrvd-0{(pbK zLwcjby>HTc^TuaVnCv-v*4|d|=QdgJUgyz>DN}D7zSEPGNtv8%$*bnamwyamJTg9f zFx<&F0KJ|^&i|GV@*I6N;BzzV9BShnTHKgEdmviAVznkhF-P9?uzaLK%mC2 zUaX^6$OaA)AF0d5sv?(n4y9E-iJu{%1KAYp&TxMEq0*{ z*!RoJ=-Ud^Guj-N+p1gjvtQ-f;2>5gp8My`>l-$8SWdy0r;lSfy?c&S4s`t^kUX-g zNAjc@N4*Ws)G!0L&eX-ea-OgGbwj}Eg&Hxiy(ESWKi{vbFAv^N#Df~NK>mv9H`9jw 
z&6mukai1TWFjwV3w^9hz3438t2+9&nbYNdW25ese;#H=B#7#|DU@8jy&sz__5AX;6 z`!y-tMZcd~14F5m2nJXf8+jv#6i@l&0AKn;T!&wr?%=laZ|6$<^!co)p9(6hIL99* z{oIAO3lTzDr8TIrA+Ntr`tJ?q|H1}*5C+s2;3yyZquNvA;>e#_E@|t&3OJUtm^l5X zzV%mEfW%DxkG)8uYd-%A2aD3szd*z`a2det$Jvn{`KL_&Z-b4*qCi=b`D+VzoFn=62iYLz(Z7TLiSAc>Pc~dds5#J z{)K%Fa}a0gefHGa*Mo{7Z+9Y-)$}6otKQJ^Se~UI)_Antla~*_Hsu^xDWNDhm`b;6 zoR=22MH=A`BXL6O0hHd>H>)$<{O{OUG$o0Eje!7VeeZ7tyN zn7T1=Iwwh>(-)DJf9s{}!7EON*i2O?r& zu~u|Ue&IYUN;!7b#x}5eXi+@@Z&tR|;E?TjZ+IHOUcYOA8h{;OsUco&%U&8^PR%ETq#Xt9J8tYnza8UlS=}J#g zopy0{wjUWTIusSBE8J1}{pdf~K<6O|7gg|#lTvYje!x2uz@0v{b-OH-yZ|krB>;fmEzl{R#uZ0=kwD&2{Zy)R}B{MgGzpJ6_xp! z<0{43&vR)2p-mKFAa65=!xymc&X9fgDwuxFPZr^PsPSueJ@oTO*&st@Y_MF1Y{^GQ zsfg-}eNL?p<9oNrNA*Om99>_2H@~Wj?-lFmNBA+EeXkg$>^S1UWeCXNL|3!$k z+uhnaSNi%UOusfZFDT5>KZQ?#)w~szv}C}kWhrhWJBnRdVgf&N$KR%Axv@!Xi?3_y zyu~6DHMw_5VnIRDtctgfEaq zL|!^#C~)?_(S?Xdk);vVA3d%|dX!3BPeFl4(*K(5h3_+lU~PGLgcTN>?uG8}ABhsVE_0c85|KDEsO$;K7wPq)uMNog;5&;f={r2`NK_1zo@+D*a zyI0;Yhrv&nisFBS6xEh~>KIFK_wL=>cz8Ozshh8zPNCNV+#8Oh)%$!9LT)yN^acwt zxOrT4=N|Z(UlSI6l#Pe_iwQ0LM!I!y*WfI+I7WcGd%N(C@ z^HNb!K^WlIG2Y7e-V|xE4dNFU7cYpmYp>GidvxVB>fj!^Z8^BIsOwu7Y>9cr^>(mJA8i$y zK2oL#85kJcsAb2Syp4f!;EL?2&}5#9U$t^Pt7Iheho<@-!FSG7D&GWm zhTrb%>svUR^gn~mgbBQ7y?prieiyfh!M~aaYt8@vdw!2Tik;+18>(RtFk5E|#y}m2 z>Z|?1x``~)Sl+pREt3Dt=sBtWp>5ZD9S89&A08iPaNTPU>UT@ntZ-b&uLNIp3nKU# ze5vo~CnDr#es8JuQLX^@Lw3fOJP=jXgZDDM_%5OX|7qF(+xSDJscRBgjt7^?`hbCffmbVKHt}Nt3LN|og(h;J+Ygh!OU!;D&=WcI|3fFglItJ( z%UKVV^>Sl~rODG_q}14Kj#5+|e$T6x>19hH&KLYci#=tUEPG@M6YCO-d3c7wm`-7A z55c;**D~Ie^ryzAAjR>_L}?xTnD_HA!@0Zah72`J!9YSr?&~W2-1<2+u1@$5kHSo( zk~WD%3tN^MlWKlan&CF+l=tIr4gv0>W~DNobo_;RdP(1m4^uU^y zf`Z}^?4;f&gm6gQWc;c4#r||i%XrToOrMBag&hVMm)*G9FoD~Lf27a4lvOE|sta+; zp#ko1f-K@K%7Hj;%?6LY{vRG|qC@C{ZcFrkp(AJ zH1{CIbywT*XED|6gEi)JXS+hMUj0tc`M!Gz_MhHW3o()Vo*~~`+e?|+Bb%x#=w*Li zwiyu-@!=6;+j`hom=+la>KkSpS#XQ*pS%RPX&063!P?(i01mc$IGC7oN={VVg5b4H zQ4roe)GrjT#b>`n6(czO$8zui&?5*w=3Du2wU6TvRg?oNmbFLs5}ghYBjXAF;E4DM{ZPOJDCETQ6VP{x 
zUW3zu2k)1ogLOQ{w43niR{QevqeL*HwY6@(mQl)Cx@|(%2V2z-XdXT9f`pFYTCu)i z|HI?~{{qGM+fX~;5fDU##l&LS&`4m8XIls^g<_CNk_4>pX0~uzB46JD$gR!kD82BF z*o!2N$8vn9b@Nm|YBI#5hgWu%uec-kd#&$i22r-h$!Vhg#W}@b+4P&+-KhS7f!5)n zp|qx1h(QJk3Xg95X*in1-^wk>r(*#806a5jks+KO<>0F!Ml!qQRvQ8>p=)rRkVf25 zS}8H7Hx{vUJpEPbL-e_Oed5TnZ@%uvIsV5X{51v;0pE-#gxtoxTdxIqdJ~!EF5m@0 zoZqtu=Xz~3*Zrr^g7Hp>uud#a7=O}~{Vt->!#lTu6?s#AI}qeaAKw=Tc996Pi>A`1 zz84jxjz(hL-UuBHwCEl(F~Y%=!(|~-h`-Ijwg+69L^E)U#TSzkTLggwRuO_ceC~0` zmeiVC&|y~K9Ax#mjH>@I?bjd;6c}!nDgG|En6w`TxCaWu6Z-n5ptzo)B8mpMliWPp z?>5u|4pidX=7tvIJ5lRq-g{FkBsP`HCilgzSOXYOA>&xGk%g$_53Lh3Qi*QE-Gceb zwQsa==|9Gr*E%%bWt8L9c4;Bf6zlWOd4!u?m`JkEhf=9f z3-5hr-6qLY{A1RotYNbUX$FvPZNTlz%zG{t@`pP+CvtQ(eb>KOF1ulO4cj%*- zmGSScBJ^v`jKMH#clcBfY8>rg{$Rl?Wy|5wZ!#20%Wp>$f+UeD!1;&DD3 zNvfucjIz5rLjFTJ!Rj~X@7st1!x!&azvxgTG4d~iNlU0qvc)BYk*Wd5*w4Rod9JcL9t>60=I zFx6k91wSD&Xf^63q_{24x<14K;7pQ3daMORb4$lFJ<`UZJ7Eey6(0!I*5T)QUVEdp zRWa6BDxDiGx#RtuC>1Tx_3ezZy|0NP5<@-BNL2sT}#p6`|ub@cl{DairSv}S&^)M zcS+E)=Ch0TU!=yOAX&d63$xYKAO9#(>NF!fc|Jdpw3tZ@K_bzIE55h-fb_NmF4?!L zmzju3O)+kB^NF98WHz{zmS07(sOh#t8~u%ZGX0HidbKMz^50AtF3CzdTgjn>IE|S$ zE(FPZzHJfP*ICitOD9#)z8nfOr`P|a5B7y4yOZ4KLqNCFdjav;bCR1OzNA6 zPH=ktvYQhpV21;E01SG_9%3DJ_MwmA`|06y>~>Ub-&}$O>!#iC21qZE${X&%5v^AB6{i0o~BOKG>^V6dY zy$bEht)mqhqR2)^r*@sXIxzcG!FR|@yu$qHzlAL@88sIuWW|now$ZfjjU{HyAp>ju zwTlp$Ch7hN!FqlV>ZED90Qa8pz5zGChzN{s-0|l0O1vs*dn}5TAHw^vXMadHt3Y1Ai) zsA?BRf(KrcAx}yz4hqOsqi94VSH7>fw@0wlh8Hl;(9Q{VJ&6CBm8OMV1XY5LP_=iG zQiFd~;-%S&)lR&654c%bRmsYW^iL`Uop8+Z6;e)>73K=p)D~iiYe|i3(C$QLzk?^2 zT&J!pxc+<3m7Anfb>-#dg&iFokFSv$jAOwA1mr7{V+AomJpjSI(n|IQEXDgFpl@4P zmv+;I#k_dPoQQ?*zWLpjr7KLd3H36i*~g_AMEza`e+?cY*F+uQ4qW|n5u*q!wqT*7 z`WHSIcB+f}W`{O2a(<;YQVT8~-JOb7V3F1r8CNI zQI7ld-6sYim{nVTJ>50vV+`+dJ?kJn-WWW(qP+Lnf_80;OAqqSPW7=G#=M}a%lOT=gUt3WJ?CVUAdd}ivOf!3fIIS< z+i|mb#Z-0cQPxdo3ohq)KFfF>LW~H=Dt&9C&xOpOUp%E^cJ!Azk?Z{*dSs@n-E?jY zby%2-M>`ab)l~f|s$j)ew7*)9T-7!X9tHsIWNlD2qBkj)8olOqwx|>QP zZF|+I987gqUb5?>Bvm$35h1tQ&Yzc4GlX#o6639@v@oiMSNsr 
z-H4Td3H%b76|XK$a1@z?`)y4TtvrkZ*(C-1#*MYE86C#jBd(g$2PZ?6;rBRr8X{nk zK33_ZeTJ6$;*3J222HU+f*gb)*SfXy-W39)-UPQ@k9mpg6eUl*AzG)<6K?~K4=x+K zq!0Suy(qM>GkGVSN+;R=usc&eKoj4mH%X&dfj~u`t5U=)o!*Ssv2vbLqn*BI$hQ-_ zryWYS)YcO{Q*@K`Obyp=VpSZZEWi4+I#PhNvWM@1j5oS5#iP^TEg7VZy&P=r9CUzE z+;iR+ZW=)tgiDqZ^ZqXGH`Q#>bL4(6cgW5^!~5ic-t^1f!maaq55^h^WCb@Frb72| zZ^aq`E~eB^zhRea5IEY)lwp7gnEcym4AyXdeK5SQZouH8=0rNH7~*2oc2m(v_kFTj z)!zQ(T%%AE#r!$jY*mM~6O{)S)*9XLVq+m18ACY%X()NzLouP7B-1`+q7Ix+rxRjK ztnTI<;av^~#i(i4(yut&K2h(rvYBc$(pu#>#<&RkHAnv7f4sjqO1mUYe zt4f^JKYWn8dMMsbh2TK@qr044CD+;S5FIWSL7D^|U?~j_`WwJD=%HnF8~uQ)J!aK& zH!nbdGxQ^HE{OWv%yl2pTsTLFrfj~8+KiEx+KkC4X7B-Hr;=H}M$a)v{WXHE_x9Q@ z9OZ5SD=LTD@3s0(drUZ?SD+ul2!s>m;d=D|fDgPmzH&j}sJ`t8^3=l{{N!rXH`4O1 zd5Kn9p7s>tFy@%xD;ehEZ1J90M!_SJGu|V>Hd4Nejh$^>vc|q)0n7ONzxTjjcoGUO z9$KW1R#-3fDb~3gng%-O_L}P8h!5^;X+Lb%NghIfzWN~E>}YE;ns%?Xv#OJK2uC^HI=|lm=3`ydn&B^FQwc;0?r2D%87i~7=;cdly>#>bJt#R>F zj?$mA2qZq6Bx6c5+Rv@`)nu8^Jt$jvQ&d(}Z7jEX^cg$KeQiqA>>(eNU*+An=?a{z zixa}@hTlT_w^Cn$e)(t5pLdFhh={!BzJ2|B1z_2|&s@Fi9ZXPQZgxI>FHt0!YcDyg-wUl=MM2P)6y|s zZ^8T9mn6V|l!}{TG+pKu!4@eF=6I4Y6#&d+hw+?hxf+s-#_sy2 zC|?Wirg(uNcia4|CSBY*${$UWG?rRpauhZ&;_;)hLTV`u7W;av+c(Z?y2i-j1$F6@ z#O|$l@n$3^ja_tOmK^IAKMJ58idu5Oc1)TutAMTWM~&0OVwGM*J^WxQ>tdBK1mw#1#dF`@OIdq=%jq8C}uXlVuRZF zVV!C2tq;Eqv9E73WQOM>$?h_5H<6)%0r&Zl1J_ZjI`lT^GE-02E*~6IfcyM7_g;&3 zt*6B1bUhyZXE~~+%JsSM*=~Y5g{siHA#Y#PZ)c9+&{3}HjccFT8C85^E<(*CtCWq~ z4R*iC`h4TPBokH&%W)a>?6K)gIj1K5N=iz02VBPS266<~Gul(^kKTlY|L(oi@6fkZ z+oeNEWu5_^nE;;qT;w`f3gdR2B&L3m=igQxJlM<4sta(ZoA+xJGdsV%-|6TmbboHK z5IxRToR~0Icy!WeY_QiD5S2 z?RGa`9PcV8?AJJe(^K_gLRU#z8_qbV*tsxE^K3UhZOYg`?UW2Y-+3$Lp;mRug2$~? 
zTCd!cX$LMt?Xq%TT`F7p$P-DTVXqaN_Hy6Ev50uJ6JgVOb*cEY4lk0*OVmZ~^Y`DU zfxixNcXh|$>)+Kf5nn+bYR%18XC^z!ql`95hR;Gq?K^Osw4`>-L%E?2_R}{B@Q$aF z#~G`=RH|KB8K)>!rqZT&o1vOvg?pcpxF6|BZxVy7-jTaA_GNT5RiQhtBHF>Xa~?6j z+2Yn)Z3GH@!Moyz%93tW-2)&^eodUBIp6Ibd9orcTWn?ds(*Ln_$ znT}3wi^tlkwd6V9ALy*ccQE8Vw@m3iF6F)`-SJC+!hfx_jkMHZ?_848e=7e;4Wh^9?M2HgufxN?6C%+ao)9%lj+R*8${3b25^d5C7RbrZcZMV)8J010?==1_AIZ^Lu&Nhk<0;y%i??KjqR9R^?Mnz$7^>Q zh8*jd{HWR)0D|it|G@2(ujgqo&Aen7tiC!0A&nN6j^#+qkdD$-%#fvSSITlqTFp(U z38vM_D!7wS{GK~;&KMb`^e~@4IU!FLYsg|~Z2vVQ`Z(9#GTP}xt%sQG#yFYBNDoYc zfHpUWN-_OiD0eU|KU0f1IiXE3yJFx*2e1z(j~wCRnhnJ>z1jq-p3`zAyiO5!s@rk2 zE^w_}Z@7)kkL`|3f;rEVGmTmBn3Z@ zI&WG6j(G-H%9(gGiYNLcc*aJjk*XEk-_zC3UiO$8F_oA5e1s{mV{>SVLw|q9>2Bc! z|4TI+npC$}oMm1qYh{<9Y_mrB^+wP@L*(1H`c*|2fHeuL5T-d6X7aIehRdT@z1P-% z1(4E%sVuOe5DP$q>fPD?j*VUp%ALtgJDEYVVHF`(3S)K;Q=((a^d-!=RQc>~vrS#5 z(FJI9LX9Ww3iOLb&h}ea8ua3)!zBPLcEl*jBx_u3;EWiFj+_L3wpMLb4JJ|XjpOjJ zmWV$z%b%2KO{5%DvRGmFN?u_|M&~}=V;HSq)D9CZsNnu&QK2c`eJfCNb1CfR>XOn~ z(nA(%gmO{C+}^FJ22!S4w!t&S4|8*ODNZZjBKw$v$Dg60Kvv>Sx;ghgGrQ^a!I@I76^i0LmeMBdV?ucG@**G_Ov2XtEv4Cka@l2Zy2R zoHDH44-D@g+466FJ{6v9TPR{J)_3n zR3i46qkTsbS!|XS&`_{oE0N3_l`~$}BLfPP^duT^BmX74d=ZP$VcDE3721rz9 zUUMb%K~ZD!DML?WomvO2aU4!(b4xj(TIKlAcMjjN+Hp@J$A>Z4w#&mq+LYcAqLLYkX}Abt#G)+9%=kGcQ!)sxxL+Uh8{>_G%st z(TzfXE8)1*d5m94XRzCaHmEy#zn9mBN|%&Z1u`hD#J10}$<{5MW5wVx@;IU*1?>xN z@#Ob~$hl}WtHj0hd=N1x0l%zwqdwd@NVwpv$#7+Q4Shg^GOezzYwy?3- za%Wf3Ed8wo5O8-Ra%!9Y#zLNiH@4#4=-DV{a$vdc=;Pw7+^fSwP`1IB?GpjA(ic=O z2hB(KVr_B1$MZU{O2luPNyM{0=Z-!!lSpC@o~W^tvDU4pVLLz8^1e8D^C^j)Ci~VF zGgo~sd-Q-6hbL5R&nye;}9S#bAH#X$x zc`A`l%3QpIFLO(r@V|BqGxKj5py#q5`_KWmtJ;+Q6A!G8^$YBng^l5ERJBb_eWep~ z?kv!iNz!atbJFS@-#9>Z+nlVXy*^c`i#u#(?Qi7yI7n%Yakbp(1N`XWu~GNCzfjhaq-4lwe2F-N%)d-f>(~G-%A8gRQG($Bh=&)!F@0+^#a@sjBb=VGM=U};& zH83wTOl=y+;meUmst9Tn=RVU=VYg5_B|b5YA3T&%YR(-H*7B(nhe{^*D0^y0P{mCg z4GA{$QU&uZch2GS94MW6%QupjYmS#lk(z!OK^xf&_3f=;jmVjsHj|Y}g|b0W9DdPM zkSN1Q`Z*m=;P?)L-MuG)+GvXVizzeJ;GDknQ3KK6B@FXBPM8{1)#J32f>>^j#r{W6 
zjm&>=lTu3nPcHMh=3$15ulAH;bmr(cTea+W6F#56THCvJN_L)c_oV{*?auJHwm`P- zhbJJsNmj<9O2#C)(Gq;gwl~I3lk41>#98!ww?*$z=q!>V)JR3B}a$DAY@ z?0p)JP)h;;UXk>H@HAJIbz6tsd8`IhHNX_0N!$*0j=DvWzt?T__*x zra!;r>#CiPBTXzX?ginI%aCOpZj{P7q4p{>sI;zZ%6$I>y7JR6TeW3-7Y+4RRgJZb zri)ZR0BY64yy1fZKcx7P;()$83(s>cVDwd>oyqks?|A-tHLDhy7=V5g@J^=R!)VA> zneKbFPS9!%M$=v^x0|M1mI-?$!ZgW6qm!|6L@YIZ_VZa3pvJoFQ_ppJe>v!v9{Y|U zT-GJ!3Z4t{6!Rs(mt8|SxYL!sn;7u$-l`pamIuKBD4#m+$%XgoR_#5d)SdiMXmc5Y zS?F3Q^jyku@#WLoHtQ!9p5+iW|xK$B*PVx^r(6n>NGRlykhab z;h!(@x--Qioxk<7EH{b?)-eXseYkM%D7`h&ddcn@alrXpzcD$0qr+wxx30<4xnic0 z3Z?owV7@BcB$TM8j^DVpZZjXhIyf^wMO&&bkJJ5b{aC8a1GAubB7>qbph8Y@lVB1+Pn9%=%W%yua=U#$$h?QcgP{fG%+1oJ9 z-jU!$&0#oNuEoYyi|R!~2H_C&y-V-lVF}iDW4?$L*ACs;_ksUQoPM zp!X>I)*8WZ9$OJ-j?!e&5nw7*vyQ|I;wRp7ekz;Z6^gO0W7hmJIbJ0^$$Jc3V5p$g zsrJHdP0y6cYI2#C;!p7POPIpP{Bj!RVQXu9oRgD-^acDDC=<4y0#(Q^;Ej(O>>b?M zoGAO!UN$P#`N4|=;IJFd6~xaZks0TCRs zqiI8<<$=pVF5XmW-IvR_E_1-tVaLNOUiN{B)FziItHqi1>BUG&Cn%SgITMu3V8ck( z(V65;m$_Br2dc$y2hO^kl>@-ps?EULNWL6gAu-#%`{)kIhY!IdBCF-QmSGc->HHF{ zOR8hJXXChR#Vaf>ccXZ?Q>Z?2mzxF5nFto*U>*!--*mTPa10xNB~tiax4~AH*ZL&k zNvyfmiFkbA2KrprjfqAVIjo!Jv{XUnnYpI83B_?gi2*ON{3O7L;!c(JcVO(gPY&-o zpZdj|e7)xhT>kOdNDB1+V~ZkBK=qieC$!McC53ZSUTB}py)GGwHMPIzJ!xw5$`KX* z`7hCnQ2)8FZmO=dbcooRisjdvLa;i@V^r6DY(1cwxAdg8=CY2Tr;+bp=%xqSM8NJb zW;pX^M^f9R*DEg4n)Um**VUPzR?e=ihY9jH328K%pH%H1XucQgH}-xEt~Ah-UL^*4 zl!EF_z}9yFwtl;CrLVRCN`&K?qfxLCt+Tv(KHX9Xk(1nfAUnwZmb!TB^V=eIALbOT?}=% zS^(4_PRr=McLtVhlftGW9ODb|mK+-iJ;QdQ%rt3kQW+tC zX$E-0xDV%bE{7)J0NcRy*lqyt@!V`--ddKPzsz50T&&ov7xkAo@;B5h)2u~l4+Gjj!*oW9L7^{y?M7KW@y7AWwlgcc4@c=YkYL`T$0JpD)c zE6?!|!cDZqp&!%i4fDRRD$I(Z>LefQ44AgNM3M=alnsnyWjiegj7@uBs&SxbdX=m! 
z)O50(@aL5?@;)8HKKh*STkm5)cc&svf#7Xh1*V6@so)q>^Sbk+W>DhAoMw!Z83C0B zgY7|yD<-%>J2w)chndTI&D~SiX?OC?p$bcEdL(2^?z}&K!M(osnP1_;d_!Tv+A=Gf zAR}*Vv*8-9Hg)k6xaBsPMoD@3?M4g5fH#Rtp;2BI*xda`-7%Aymd%HiP}v;WB5nG2 z(LX%5wS=O|AC!rMToG+<9qnt4ZCcJn>tN2PTjEqbR~264t&qrqq4C40e)O>iwB3VC zd4vk})S`uLi3gdWjx745(#zVBs)P=E|1cLfZ{=I4%V=fLNB;k$W+3{z3h-0g-Tzza zhAL&y0S|=Bx)2Y2?|0g;IDkPMBO4{-(4wyEOO>I7;3O)uz;dEirMkYqIxdsxoPAqV zi0kawCOfQcU$%nfbmWI^T%BrJ^}yVCpTfPhWR6m;p0nNboM%kYsGggd5EvU9No93I zBAKX+L@r(v)Y@qqlR2r@OmB`P4=W+g<|d_doIR+p59Jh(R;>^DXnCx?u=8dK03w!K zXdyO_q7#?tokG=6KI7~5rZk>C!qmheFduu{3QxIf!lTtSNC;@ui0HYRMEqjS@dm{j zZ!Tn%{`0(+;?~3VuFWS|tp$U1`O20juUA}aA+41Zg>L%hfGf?=Kh|`XF#$rUL8YxD66rKVnd~P2pR6o*2`ax5J}7l}Tl-6kHgT*hla# zl*~yib?V&d)qH2W7fW{0@dtUgLRHlC%;WhbAcuTY#)mhgJr3{^x7}ZrZsXjPbZdvo)3H*GTfZVFUh1Ef<<>pYrzi)Whp&$(K7#mD zEjD|EnkL`t5$-zV54E?QW9&q2I}!9=VW0f|r4yCG)W{(5WlW2Ui+A(WUY(Csl38D{ zX$waaKjdK)>am;|v_XugV_MF#s=ea*2wp@~pM^NY{LM`t9&mGiNY-LA0=dp;jsCM0 zH&nGeYIF}-fI`Cf(}f-mC8Zv}Z1=!a<+lHlLv5i@trgwOGN8O!4$_HsmG4eWGca+B z=WsZ8jP3fVA+tDh$;w5qy>I0By_nOeQZOp)agQL-QvFxl2odvr z&TxH6qf7}NgPt7BJ4ZG=P6ZGEb380jbvPTf$9~h5O2;94GGMWqA?Gx^)J$uz;y4P} z-E1q~gc1d4cYN#0`HmBFjZ@K4T7BeUd_iT;O74fBe%U>f?WM#1aJn=;pWS}#!K;N= z5?*N4FICW^U=kElXqB^hfn?FS^Qy-vw@raCeuXoa?oxarsl(w!phX!q|2>+AXI;6~ z%Cowp+Z*O4D2wZLALy(nxng>4@5-)Lq($S11Da@kEN_M#b6hnNwJ)1xZjd&OQZbVS zw{-w+q*xo&++IGF&G*jEn=WMjweOc5CQmCz07dYP?Q&Sy{8_Jpn-iMlt~c?C@9moY zT5&jLGfE^8&71!CbI&~{W3TrW<2w@?xK3+he2f{J)Cm@ninq=P%ARVtgh(?|+E9OsV!;6!C(X@`N6Sjv*?I zyzB@c?*rkKbZ(lKeAQPrJqu+8)ZvLV^x91Cp0q82DwkwT$oDVG1TOPa2V@N{M`38L zQnm7I9_M$$(!}*FUyB_{x`vA;NbL=y2Q)nWJt)uR;cn;)}{7oq^ zyp^4O{8bmGLlau7dsJnkJ2GLsa$>b`9E;isbW)C__;Qfk@xGP{8e*U1a68R+bNdG| zQMcWr9+0AWaXirXf9m+r6ePTOP4<>W6@08bLUli;2U{3uvanb|y(_kF2IIa4BtEt-?Kt|y)hyVU2UVdFL{Ika zgE|Les*2mB+a>A5qMi7&h6=0J)=Wx*i)$)4Z&q=Z5A29;*`$0vC6#G0O~#+r!BeqI zJ<;elmor6vjlbtoW8vQN3!l&5v<2&Zt=Q@gx?~55Dcxk6n~0(6w*W#`CB(0J^|`o-@kyQ=vc&U4Rf%Jj zeCd8LdF^ux3vB!b^^EfJhiBD&^t#7CvajD}lv@(*1f@lZYSEMFl@pu1=wy(hv z06w!*+vrLra&Ch-gPV!9A+mQ1 
zfD=#OrY5SS_JKzAZPjH}J35tSbIuGzbyE-}Q|^lZM)hEt^uc7)QhP;fcjg`8?m&O3 zNFz#a*0Ta==tAt)(fuA%7oxRc1Wcpxr6{K*rzMGOQ9XLs{IhFr>DD+x*j$j!i5jBk z=Bv|-xhPmFGik*Ym<@$5Qv&@&q*={_4w-?t?0fsim-g13*7@L{KcSpF|=1)~%N| zx+n(|N6bZf$5Mf86zlu&pYxdCxKFm2A2vEZ9s~PkhE#I|X_41>0I*Zqi5gdno+g$v zZ>hwq1v*b@bvE{?S_iEgb!U;A=N^wIXn|0Z$#`BeqR-06&Tz@2{v|UW^FVa2_GYce zf#tD#i+rQAU28qJ|E zy%ed0Kkm(ur-PMD=k}d0ip)S+I!o4Le~?VMFRSJQtzBdWG(pju7lX*bN!Bz(>&2vm znp5p!j_Buwu?49F50&iZJSBt4p_|s4kMmoOUbwib434f0phr^J-Eon*XnTLbarWGE z_rrLN!>CtYo(>LE6q*|O{rsp9vtN-Pp*{@=VkNh4pTO$$fv%eT`SNo2q;o$~CV0T7 z`EXM()eJhgZ!51biccE{0b$blV8Li&HNKuNr>!g=x2X#gCz|pk2CM!MgTx#5l1Zgn zr#he`zmKoWX+KbW=kDG0VW&^%yoH7`&061-d8P0~SG!H#WQS-avJL0Gq#Fu#P_C$75h!P?!Z`$qH`w!QC6xVjtOexsYtT3 z>~WUl!a3aOYF#XJ4?PFzXrad%&0i;mMHue?YUar5pmTJ8@4$sjJ(L3aPxz!eU?Edh z`uD92gqurswG$`2Qjz%J-Fb_Pm#@w^y5Rn}ii_cDc;9Mx|7C!DAM)jA#VF@sKwvKM z>oSQ_EozGPRNZGdP=JfhB<3|OpX#d@wjQHVpRnw#AmW~<-kU6wTz5JL?M}P% z(r)XSWWS2#um)q$YKp!^YmM?p`ql4n9Phk?pFAl|kUI;Le#4$iA^{b=1FG5-tID7K zS6?AgNfe*+j4I{gA~ZcH^_P5lKD+y)Gl04A>C8I5fuZ37Of!%}&=B<^b3poDU0wyb z%xlGGn55nY8^L!-Uw;!0CUNUvj%3ui(SCRLS)KemI!sVqCY3?(+wiKwY{!ZM>`pQC z;gfskOQQ{70x5tAs;Rn}COWu27Mayz`WH1KyC5xkj9zq9<->%awIj1x1a?(@`~U=T z;>kVf+PmALOMkJLdIL!(J3F#~fgh6wB*{OIA0Y-qj`#pAe861@KS}ZvhW=F**Wg3~j zDYfg>)X>tOHlAxYB=yTV&~Po48h^k-gz?m)+%5$s#ZXiisgtfzny^*P_X?>~bJ<6E zNMHY$IRD~Jh$*PfLYRCt|_QxN6 z=0^-q7eKvSw7L36HC0jJ_=2I>3_grV{WpB};|D}KojkPs^iU!ddTRNA!yCpj?wRB~ zyQ%VZM$)79jNh%^dW}5r``L&R<1x=%BV{X_3ry8M=RKgUp<3tlk87O(n;|%E`bs_h zSrC%rb=7hhg+8VkI~sy$Fgf`?PN&*eM9{lcH7$$x8b}^^>~T`4`cc+6`i7qq4kek~ zd>*2ub(!(ud(aLF6d5KqK8X`5Ic2S1AaIEx9&h+EnVnF!@-4n{VX6!F#mP2NJ=7IR zl+6aKcuIY6Cb!|Mvk!wp2hAU@oW!u1ugRT(T~AK1rcZ98_V9KQs;3*s<{8 zk`)hGTBC;dNu&!ka*|3!X1+6i{6DtdJD%$P{~x~^5<&|ZQA)1N5VBLUH^&~yieqGN zN+D!t&uospw@UUn;@Eq04vuxK+!fh?&I;ay;AmD zXG~BF8{G(GCk^acC{oe(hv^_}K@oj*$An8~(hNOS zyYWP2r>k|>ABz@tDBtk=5FFqV4qTsuNkU(eJz+M?>V9D}9H~ZHSFJ8@X20FsZ3;Q? 
zuhShlwJX5Rwq@A^mo$Ix=1B=l#Ej8SZ0LEZ{~(KZcL0q5;+F@TDzMkG68?Ch+}qO` zw!Nt@lEEwip1$xI0`bM+UQ1Bj3I7z{(d8n=;ri-AzOo#JmB8SnN^Xvg1+A)3vstLp zlHR2P!3TlY>sR*q%;|dekB**x=-C)~#?v4ROHnQQNoW?}Ah7xMn5oQN)pp9x(uwW? z6ZC{L)qS%kjJsU-Lhnk3%`B`xGvmHO#6T7XxDdzeT%kWycm2|)RyM+~Xg}9a7?CxP zCeakI%h;c*C|;_iGUOuW!5pQU-!pv%+o>x>nrDurH-@%QZ&-qK{AaMeg1S%0wU?+q z`!O9e@EQ`KuVqso=oC__?q0y5S#K%MXaAM)T=B;7-LE#UjJ5v#I-I-s7(YlY;a?GY zJe*XT`50);+Ud7CSXYm#J}JqK!px8V=F~ZYb0_xd`c5VB);pk*ybjgO{^k)n|4?%R zSl&7_ouhdJWM03J-nKTe$L5FXw#-Aj>L-#minw)<#Wv4rwH8dLL~DtoBSH8=`1-E) z$R#ig3&}au(iGT)dIIz)gL8bK8DL*`2FyXG8x`;TaU_sjCZoojTW#8;wT}T>oo1*Xma*VcMepJ1Gwdo|V$ajG*CaUrj+nOo5Gj-fr)iYNl?kQl0EE3`2 z`1dREbe32-m#PLnU6icL-cU|sJ0eKk?t`BvPRiAf_@Z3~q4~F3<>_a)-!t_t>h3Lj zW%cL|WS%+ds`QU8?c!`M1X_^fW@z zUxf5=HpW@6qy0l~VVXYqR5#3h3%B4gi?FoDKCu_mAYWD^aGxpEy?;$Eo0@*|cmB4rCXbfOP?Fi? z)Uww%k<11qYRowC_;(f!l|K%!;j#*CII(=;BHtVLC*V5F!nec!NfOPIj}!5+2uJ+K zZy8xwSCV=$^(bfL*)B!Y{tZo$>-Ol)+tdcl;aQn-o(?~f*K16tpBw@2WUr}7x4yDy zT;1>z^UJ+}J!gyly=<j0ypBB8G>sS1B6$;AJz((@%s#KrJ7E_A&{V+bImK5haQij9MD6t4KfyLDg2;F{|;1)I!$QnEktT*Fl`=C|!h#cQH^lBFkTX@2P*mmdug3>{+ljWp^yt3HYX*)PSK-LVk`Y$PIU- zGZbMSbd$}n`99}q!4ubK!Gq;V&F+@GW1aDQE&+N3yQij`@XmL%| zruJQD1m2?xCxgash;N^fZ)FQL$|QL}e50m)*w~a2&&rS8ouo1Uycd$#@OtTzYBTo9#E3n<^a3;Hl%15#F~rx` z*E)4V|3VSX%yTfiY!=N~(^dm-mFoJLQf&yWqeKg%B6F>^Zd~BIy1p?ExJkDKW8~F8 zi{^FwluC3)?e0MM>pXcl*!FBB;=Af;?JbInradugb-iO8wB}(kwy|FM^4V7-eqSfW(Q5X#M?E;ahYJ#fre7! 
zw@<{$>V{5QIV>IO8~w`^WgzYlRtR60NNMt70|5M}0P5AUG^XES`QSjm7%NOtalEq5Y^h(yh5%2$H(YCa>Uw1M^Jec=B`PuNSjBiQE9#2eBc?BdgKo z_3rC_;M9}ZFACkJORjLr9KrHr=Y?3gP)!4`OwmIM1XY3bZgq&5i#oSw#+nisb+b){^`ubp5ea>v~+Z>)~n0gKk zrJ|)}?2P0Ivxf3?ZRfh=0(BZVJo(`o*?eyP@sDnKcY*eK1%M;rZ8cZNNk>6?Do=GqJ z2mT%Nb62L;$bwelV8$YqYH=G1u15~F-2W{;FYX;S^>5?JF5o}x&1w)kk|XLg)=RQj zG2r--6cK36f+UNFDL(`)GtlN!IZ-Q~nlBMm^E}P|0cJ&$!6YvKy)4USR$jp?d?0=_ z8az%tG?|UO z`Sj=3gtu_KA$S*N zeVLt`_!xunhD;<=EHEbHX_k9Rg`#SNhS}Xo3Mn4j6mDNSXhpp`3gTB@{O2TT0Ek)6 zA}I{EZ27)z|Fuf!%tj;f8{WhAW8CCdAro_EMSb%6`c?*5=rpfy{%| zNceAz=Au|m)C){woxq>{VoKCJi=s$fQ+-G1iOMV1DPc-1h+(J@8S)gO2QJ{{K(lA; z?I#Y~F4RMdKXs?*=_Qh#^Sy0usHzz{$L)AK<&XpwZbB${%sQ^muR3GU!8`EQ-GpZI zA#}3u5>ytD?zaQ$+7s(S?CtBjVrE=m(9mE_luY8bb}dNU>ecSm>(#zdbg*#(=1kk> zdv0Y;KMPR!9q?_*g@^T;Q(FHzeQ>MKeutm_I^u(i25~;XM4*Q`hm+N(!gFl8 z)D&({=a6j=LV*J1NarrVT@W`n6ErlFgW0iirf)O&qtobgVhj^a6(8 zRx>hmxb;G4qzMYCDZpCQY=&>Nu}9Gn$O^tHSWH>hTUjuMSWitv5NEtxi{9wCnZ(RGO8)LK{~MaZiLMo-xyF*FTBa;CYvM*Ja%&0% zrIsew7#6EjLr*=Vi&&n@f%be~;q>zlQP(@jI||Vw7zwq2Di96t+j4#ueRd4ofhx7A z8AW=#6OVz#hjb4@8S70RF7AiG>`zD3=sQ{S(Q=<{wo>qtDRW%hf8y_{cn>^&=^vP7 z81SjGcKVO3-2$cNQ6Lb}_vUiOj}Nly3YG`iHuIcKGXF>qq6zY<{NM6u=PGo7#+#{^ zLFNw)<$O7|sh1isdajOX{aYd=ro*{`MyQa zXB*x3LYJD3M+oA7W~Jnr6!7fA&Q}X2O9|L4de$5s&Xuaq@0?E6r(bUYxxt9ES&8iF z7vQds0_X+v7}NZ)V{Y;IOK2JYf-ea&_R$Rx6kb&_CTFpT@1H)WcI)Dw=j!D%P{3|T ze|Sx-g`arHMb3xe%UrF?GaSy*75A*J)e)JUI~{z0tIq`tde`DU#ED89Q0p?WYXX5< zg8KX%K&?_O@g&7_8#a?5PE?ha>zUGJ{krlW-%J~S{8-s7{wwK0pB_}@t*T)se$z>t zHWR->sYf&KKK(;myVQwyxJpOI!j9&Y@m!u=2`!`g&jOYdUd#Lrn4K8TS384?{>*bm zlf==7lHpAMrIV-a4@2N2*QJv0@2&PQ*0*zLI(Ameo3fB-kozcHr&-mLI$Qc(97a|| zr(xy9G5=?HcB=|+tMESG19EUpe*-kbDN4aSlBu|@mUFbmTVCA*((cc}4g)g_rZDCp z^_8J0_wIKXxyS7)ogF}?X6w!H4Y46ob({IjQyqNTKe|@Z5#C7W>c7gWy z+7q@};mO)L1{d2%c)%({_>trk-;M@Wie&9URpEW$iLGy_aJkeIzW7kB8-%G=LwVm| zxO8`}$VqzM0kWu$Ly-Z=fplFDytaYUmh_GNjE453lFH@1E+L;aOM3m3u3&4dXW3>F zT0!e>{%fA1yz^OG^dZ7^iqY`=LDPzxY?^}^yQ0ElQxp8A^5r9Val#+!dKe?u@-pfi zGZu%MuUrtS6_yHUVBUn~Ge%6n(EH>p3bD&OWK3&BAN? 
zWS=EiAkC^p<;bPqyh)e(j>%tyq_LDCxn_>@!XyJ>RKcHPG&02a%NR)@YNeRklqD?ZgSG8tMSCR-Tdg z`7_AQK}CZ`EGxqk7_L+v=U#U_0dZVY;7;fjxmlzoMYEr<80Dd!!v&i;(kV0;;Yzm| z%J{P~ynRdbPK$aVM|Sm2h%_-q{^}+7k3-*uVB7CDyH8VrCIxq}Z9O@M?#jc>tSXK! zFSJI&wlWCrfvkQc;2UZylp(ixf%pQtT+EfCWF?mZ7pb1u*dXXoQlf6#P%}qIhh!`7 zd)$|+j_}ysMcA6oSv2Q>-LZS;v2RnQfWoc{?RTjz04LX+U2VV1wtw>`=jVBmxvcr+ zb{^Rc&kRM@hss6%VA0L@(7Va>#w$VbWgtv7F+@*~&_7BBfk~OYFhbsunVK19)if*8 zoP(9M(3QTEztx1)5B4fzR@Bh5cw%S(zG^X2Jr&t;*iqw&5vQ7)FN`IBY!pKS+z##@|j>;jLuDTt~k7$bhK zff)dEf5T?^Tp2Wa|D5;p;iT})hpO(5@shQ~m2rS_k^$@4lnPMQYEmZ7#V+zocD+`z zw~G?)%H^8$ZI=lr;+Q%kSyS6ds49ouux-5Ua0D=SlU5J>4lx)dMdQO0Yr0JJHnG1q zp=eVN!%BhwA)=IB*29U^?1APH0_baE*WG^;ck!DBAu3|k$d%atnYE4+`|m|4Q%u(sUUN3_}CHv3nqJUOo@*)-L@AIonC zB|dzV_+CR{^&udj2I(+auS}PwB1o)KhEnD}*+{Fu7|eG~`16DQb-wL9`Kv6?v-~ka zu$Rs1g~SP1fD72Nh0w1Hcy41kS@{Jt-&LVFuNlUV6ylu@cf;v)C=oIy8LYQd*Hhmf z)&|=T@+P$FJNjNLm2^$F#4b1i138h8_BQ6(QG=MD@g=$vJ<7_t=IdkSo)7| zCEHW47**ckD4GE*s|i^2lu!T+EU}Ti39Z;8U<@~T88TTHmic=j%3zp6a(Iqn*Pvz5 zhwju=WpuD zy?=78Aa>EqwZ`0GIRcE6GH>26GMm{I1q)KBFU(BkP#D;l1|YZ{=fZEZmGJD zNp>EdgZa5=EO=R#v$Zh;g4p5AxCV%NpI&Prw)Qz7nK?n2d1&7v-+Wu=&Es@OqM2iK ziI9fp15nW|ZBAMlv$BR(;mQ6g1b+D?$}kF$_k0Y3Uu)Z5SqmUywgkB51^v)!GBi&Y zB)7GR-^kAdUhMTeL$KdzF0Bf$4s;`k<|u3M+;Ka)!R&B1mv06Lnlg!8!KQ>fw?EnQ z{BK(!&u93LIJdKgS}n*^7+IlL!(XaXZ=L{`v>N(D^>DE_XEhy{5)hU5hp)#`{UHjT z3nN;k_-7|t+&6mNE5BGFQAY5VhY4RckF`4rJL_og{bZY=CKCt{o3yfWiJF?V?vu5^ z#9l;*MZUBW#fvpC1Bix$kYgcrBpSmq;Mgw@(o`&ZYVSYiPD>yi`9vW<5g!c5>aIFS z0(hKT8+48Ko`}vmMRhocZ{U+aq)zVtcstB*ncv*PuTp9_WhW)+rKgvoILE(smWSUG z@^h=oliq7Syl1uGto$m!3QUD$sWsiyDt=`B4u07nfIR$HG!6fH4Q$b1qsX^Me0K>w z!4`~@8jq-DO1;Kw1Ic`s&N3v4{}EPd9Ed*Qm)8Aa{!SRcK6{22l?Og2)#oIFZnC=b z{usTku3Y*f<*!$blx#m=uB&{8`Cht6LB#0m`}SqMWTr%a+N%R%-yi9FX9{ zHAxN^9`eo;W#+G6)8`0p@b!6dg8ddKI(X?%dQOo@oxR}mz`SS1@@1p|FLec=v5KzW z$z91`E19NFCZL1BbmEj5%-ZeQb6p|R@q&u`qx@=Qg2(Ct`MTs50~PTW(Im@FjBwKC z){;pt+ta^8XcEzXA51Y*aQ}R%sWj9PC{h&d;67zMUTpGr$>D;F$xa%q#Tnw&?=!Lo zzs`N7d)r 
zYx@!Ec>h*JU!7%uKF6N}%hgAeTY>5~O^$Y+sDv>yQJ#Z+6M`)nuhAa3tAT4WJXauI z(_;I_gC=gh2mYkDop<5~9Qtqgnf!_*?;;h9MTwruwOP`RET^Z!GYv|6x?POVsXBpMk zC5q*nyG(cASi-aiXwgf3n+ofk%Sp06YI2=Ayfi+lr^U9?qimVz}38}Kishtv^y1*XW4z=x>hw2=t zpIf$F?hVqN8Pb_Bpbn5Ok11@a`qH%G>vaZ&<(3rqOz>HZIlF8h(RmOs+ZgA(`tv(; zB)E)RlT{%*cTrX0YpD{^J$6x;T5*tMOin^Q`@)dOkUX#Rio>pHto}hsmOdrC^zNR(drOULZNW1b+J?o@QhSpT5-XGxE~x2rmfdaHpz+Q+yHW#_htw#J z%jWipVQY6jWoM0n`D@+`=h?>fJ}C(ibAtx@#v{GK{&aojXX9(trn{041&ZGrMYE*k z8!XQLcd6TrcW+8D^n+0HI4-B+_N$mi1vadc`t(O7eCwJzn03dat)?-h%& zhmJaluwOkZqgoBd+V{i+?#u84wsY>$I^6rysBX!Z;CKBSI7K1A@-14D9 zEB1r-8YHVp16s{}^j-|%#(fP4OSSB!A&x-HYVBkKc;6IfOmL7?{4h2Aysg5myXY9P zqNxu$pL=_ox$=`;D`{n(ehVn-jEa`azoOf8`em)`Y_6Rt%_SmHui4#i(r4{wwpSgx zmqFfzmbdW3pGEYttNz=xlo9jMh;41sX>E3N$DP;r!PT(HR+qZ$DHpXzp06A$t;!Rx zcw{mR;7B+b=wGro=g+<6j z#O^WNQ%K+|cQh0x_Sr+;bi+i5XG365qmR9<#v8IM# zZqI|3Bp-vjD^tRcIrPPqF%ZZLyb#-E|6vMq>{#=8H>qE*8;{oTR+L zzUywdN{rG&TF+!p{xkgzn_M-Fdffik3`NRRcxf|V1(r%dynL%9fA^o7-0PwknvHvn z)~>A3k<@p}U)!iwDhQuCD{ZzAApS$%yc0ff{(-oNshX}XDun#Gs(bo^kn67wvM}Lc zsU&&M*M0k?#sb6tya0*3$x$eEls>TyS&DGBN3YwmQM`z0g~qlxr{~>X-AZfa;dP#k zC{C8h@}g#4Scvjnr|ZMeFw#)hFdcY}F;ZCRV%Vo=0v=Uj(#J7u$^RT*h>f|)E*Fb5OUCFon;RxOlj!ya!&5y!iZ zMEm#d(d~W6Eium5Gh>FZ1SB$esO<}EDUU?tKCL1)qt$OQAF3&uas1CVrD~$92#1MJ z{S08v34UwLN&O~0MMSO1ZpsoI6kgT(Rfj@i7l5IHm>UzfUOY~IG&mdE6q&5M{_05) z|EX81=hmw{BkIaGju@PpN$LgiL2i5K-EuXQqf*9X-4gj&nZ^6u+nUwZ6D%P#yb+Yv z&QT8(TtY2YnO2-0>EBU1S3JF77?O88BXQ_qh#Go>X~+<#SszkvTcfL3S0Y{Cpz=-Z zZ1n!U;wPP36&?LJb3LJ_ioMN0W1Ni(B{0~vcYdn77*9D!9y+fmc7|oF-9_#YepBu? zxti;u`I+fD&n@;`~QJUn5Y=jIMF35Zf)0_W)M<9L{_?mJ4 z^z9;r8J307%el~BK^dMAWxG&Pa0y)yQ(X#EU%G}r42d1p%ps93*&+PaUy(#Ar`A55 zQLRVWshte&^K14;v&=13XM1nEm$2~Zuu=XgQZAInqtop27uG?y)&%EHU`wg|ZpP)% z3@6!pE`tGg+s=bh@VgA|*FJCy(PgBYts?cc^Or~Ofs#)z)o!VBj+6q?-`Ka|RZQoTvc2hD z=hUidqdmk(ZGN5OA)A}3VA7j9BOXEVMr8u2)Cf*yOpV5V*OU7(Mf^1`gU6L+Hm%$$ z%(VG`=}kCeJE!Bvs;a)QBx2X^aYN^&Tioj@_amN%_C|$3HM_@`nmg(!`O&QmR=f2H-! 
zSC4l)@+N8RlHbpZ^Xg&z1=(ybnXE}WCkAApwJZ0 zFNZo(_DkS{UYooH5!qEVsH3IK7uENMH!t-^_3-moz|I8EeJ5jfj0Ld%h8{r1mGdhM zrhAq8R|4FY`U4}KlI-Foo_Ot~u9&pN30+BSGH-ikF+8T`>?)foY1m#J&rsg+1X}#cK_*}JGy*Du_4XH zF(Ihojh-n>?5hIT5G2%InA1dVr`*#(r&5k9d0&^`$ELoldtH2-<9u7>hhI*j zwH}BBiuG$YC7#(J!gNqm2Ql~RQaGg+lL19H#dGT=4~i4o>$H^1RJi2HuT0CZiBtGHWdttR^FoWgAS6w}(my`%_y(tARFK?TWf32m2BmP>J5-EVlL_$Lm z>Cc#0^wK@@qV^Dh?l!d#$m?_{eNIA+T>P6(ljrh`e~(RU%XjnMs6-_v1`=OAt#!*- zV~n$TPaSZZbp?kqkb75MsCyc2*kt&(82gEq_sJ`REuX*|1kneo>HRe4LCfuD?5}R= zD-tFn$VzS;Q^j2>QBQ?DDV_OzKuXmV(w6O91Q8ld&>s_+S`6}Fj-!wp9U1z*-Vn>& zi3%f^(PBXI7btqr9j^OSx}1h&FOaYpM6YZ%k8KTaY92&2be~cvVC))K0G{b=FBUT% zmHgSUa=(0V+irs}`rg0IR^Y0_^P&+!zHEkXjKXhVBHbR`02TuLJbOdznfojsXh8e$ zj!0Xojtu@(s7|SlW)1Wh*c~i*2FoIdb3--}4;4|GoUvZyYxk4u4byKI(Yh^lUQ=3B z+en^*h9@Bw1Phz;w)5?1^+7&|ZJx+SC!{8~9~88hEHO`tDk-W1T6U>qXQOBCP(-nE zz`B9?F%oyyf%wwM7Q%lpkhoaEY8!^C3qkfDq&6`tk+UI?y2YhDnO*^XI;I!T|)6n_YJR*W|umByF^DY;Ea%#&euwrE)TY$<_juo zZN^vsxM<`p33x4P->9x$KYzLA88{ED+2X}FkS6(a1np}Slo6s?w%rCBntkoBHk>oqz9Lt=(wHr6~sXnoD zgZgfhr6>|&)K_95?2&ejef>1;N7d7u?a5PWFsrF&&BezG{)Wy4!(OdX)6rA=o&79{ z9nUPO`ppAo=(Z8kIap`h^FhQN`pUWR+y?&3YDi?T(ky%+;OAt*>O;?5w*Cbhg6$Z}= z+`K57_!a>sNHSW^ObSQJ&#m`NM}px;-d6SQ6AFJh1h4V>Uq_3*=J)t~ zwtnVx4IL=`M5c`2Qn$Wp>FCCkOLh6;V&CAs;=#B=1lk@U~zCO zkNi&xmigi(XItCRE6hj%E|LmhIkdSc?W(MV`>V;v%&CahZgP}4*^g%k{XDdjqOxQ$ zTaNj4u0wue-)5xEt%Gw?Nq02KyxaqlA!4b!k@7xiVYGE4Y2+G8FA>yojK1!cF9mb5 z^6bqirkI!wJsX#f`UMn}A^FZ9)rDR>=&sx-ZeGmLWbUL@x}xA3YwLMvDgzhg-oz=- z&7e<(G-EugmCt)On=tq!(0R2nacz90J5UV1sB}U9WSj12ZkOD2EG#(l1**#4q?#nF zg(xQw9=j|-_4-eb%DY9%JGrrwyDRC_y1&NF^e+PxR79CMIfUPx0@6}2GVX4Z`a9bw z`0J_Bl}_z4&9=KbrVA>ogWB+Jl8hiySkpmet4^wIV&l^cnr&HS+o1s(YNw%x*QSzm z!Y+@TOI}g3=Fr;d;;#}@r5!1ZF%Q7RFf_;I=vNbyQi>cF02)aHOF!ic=FN0rF;!cI zjr~B1GOXDCNa6UteXy9f6u!*XPlx8D+*_IQG}N1{5V{uNE11KR|JM9x$Ig=qdfn(^ z5k9TuM&Ztq$iX6(7!Csd_Tj58e^#?uyL07mY9gfD$+?%VU}=>K{yJ$9I$I@g>FIb& zZ{!#zqs!EXYBS+jVumDZ8Fh4RXz0}-;M-&~+CuKd{5IR@$S+{o$GtgaF%6-Ar;k4n_^nyT=c2%yQZtSr}BG8O*pPFJNf{}OMMKzig` 
z|0=>FeyuO9ZrsT=CpTS!n(lNR)>pDXeO7-CKzoA0BZB-tfQA_nd89!0WJjWf+j6j} z9xnK`BGG^d8DGe{@4d)7b}$>7O~3v3qm}po)BVpsvmxYwS{SgbZj|rVW>4cII8j=A zbM&%JZM-@k?Z>*Ae5cpNBEu_8`y0NG5t1l{lR`D)DWWA(t*DV{o0EQBFsGKm;@an- ztM*Ngr}OgGo=00Lf*_cp%q>fN8! zB4!*&|MKw)wJ6N2Bv@W=Dq4*1gyWw3az{)r?Y(`6ndgEs=fLs5p#?kam^&nepR8urnq-8I~x1J%YMM+?y zA}Nt1S9v3VQ-s~H>A>H3N*&~(-lv3nkqIM8`7Hi3=U!A*`pOXA@ZHMLkET}qjqJd{ zz)n64_I|16>Sg9%g;f}NS;BSmi3(?%)N`-CVo~%+;|`%P-IdY%JH!`HxED`au8dqB zTK-@e^(+GlXb_Xt7gBL#F9_26YHZTxmJ_n0Ytr71RzS!ZF)2X#@jD`MCP% z4ntquonkS&Vheps{oT+!1{px_Ds&5~%!V-g>ktd%i{{Ph#HLi^$!Eo`UQtw?_y0H| z1X4np*TtqTn{0!LI~3m`C~|eQ9%eltY%+1~iU=CX7K+i7nHqb! z2v!L4oon~rq9(vd_T&rj-g(8&w8$qzh&8SfO4d~xRFlAi3oNHdn=-70Xez5D6H)SM zrdh$@mYc_S#gl<#o66NRN(KaGWjTcVuz{?ux-Bb7=SlaI^;;_S`kOA&%qrr~QaRc4 ztEvF6y(x-%+TwwY_4?cTV4Oeyc%w#!J*jCMjg?^E@~iLLc<`KiKK3*qlW-7zH*{{fcx8*nWcJdjkZRG$nva0s(1Sj~rZ1$|tW~sOg8JjAP z)DLPXZo>p0(CgOvNlB4$ktyM=(6M`C*vG__?S zJb7$mjAD^RyY+eKJ>ZY|H=eJ)3`nJJ`F?UhDrNj7F2dgzB$Imn-COy3blau+hP>N{ z91JHWk3K9dqkbmBqnTzn9<8VMZARz&`k|XB_5R`#GzGC5G4{c+}28%Lv zjSSaxMy2XKg-N^*2L^YJ7oJ~9tnqr~w|*LiyB9MCsIl6W)ZyVrh~t54`ka?91#y#! 
z=s1sAa{_Sykw!}Kb)v?m(n_aV2l3!7t&br)?Qp@BQzh%=T1$(N0=*sME$CcP6bD-* zsE-EDV@~imYM;}6l6Vof3c!s{#i0fYXXDB_qUgOOS1DrwsApQR6`ZJOxFIewL`3nI z$LY7G^z$DCV|Ai9Xte1E^Mb*Ur)8!@Z9SeOIJKT_$0^l#F3;kE3gwn9Ru1qFX}Ogh zuQ@R|r^AQlaybpVCaLeqqIyvyU2RTKlR-2LoS&s{Pp8Kzow$Puu1`D1Wc_Jt7Tw1vEtj%h+Vnk#~NXO+`lw(ZVxCnz}k&Q49eFWl&FwmizHKHP}1 z0J4H4==bUvR!M-ey^3Q+5(tFmd$9jE!Y*Q9Rb?erDZw9^}|5hn8{Z4l* z6_QW9@qXgdSf>$7kW6rkC5_#zkpR%j>c*F0UQurx^@i*vMeLKB&3S)0KRA8;k#I{< zlD1<`lKnQRzbQc247+a-?%~Aic~BHX;!;>hbK<3sjWzijg~g$Bdo59!mJ~QhvX;%eRi=#Mu~YcF5+^!GM5jGwi+dY4BR;88 z*{bkop?8HmWzNlM6SXoU6&EQMz4!^-gw9BO6|pe)M(^cRP!@Ok%69ffip#EY=g|s9 z_7aj(*{F0Tzd4Gca;AG{oWW#c$Iy4~OJy^ztmXU{9bqd_<;U{0Nh})F)GQ`5^D15s zX(YhTDo?S^y*gE7r<&0ap|_`{c8ykA<`dG3Y`JuhcWtoa0R`vMzXuR5QcXlZ^(M%YAjwKiKPR!lc0YkcK4`|vlbGQ49M zgJMga-A9923R!##9nE*wxFla_V%~a2pvI4DdQekt0TUVbo6_d*dbr5G9`=Gl4666wKJIX1RRd3p@+uVb?3Q=S^Ki@*%^4 z&n!ijr|EB#Ua+%VXaJ%@ONo#Yd5vDv*}4~V9i0>H26y>VnGi#S$RPO@-2h!*H;jnF8yP6>;0#)KUs41J%}NQU!8fLyw!`lh@xkuAmmhD!sj75^yD zZ#S0IrSnd+6mcF#i`Jyh>XS}rN0oC8jXXjf23T`~3$`%tychEHoJkku&kg<^JoPf!M|JQKTq)b-#~ABcQ_3l|MiH_fn1D2 zcqKAB_q)^}!)gYk6Hsk2fFDP&pwzzr$|JICB>NWMNb~x0i~5nKYu&3w9#425WH=Yy z+W8Dd)$x5s774!NaSo&R;x?=3WVjb&{d!%Fv!>+D#kiEy6-V%dyiqZ|JEMhE9r)ti z&h4oYViA8#Fw9}dLat}iR0`AKP!yE&{6!mBp+qq2K--L2zNZiS8a?#&sPckCpfN!v4?#Wc3{U8e2Z zyLj>aze&hn52Aq){4cCLYe9Bezozma@d2U{A_-!o6YDywJ_+Xh>mu6E?{BLbeZq3^ zkPsx@5KUvHT&YB0WsGEHfg*{u=vY`y_B0)ZR;+!0# zO{FXOS-jlHe6{(R*EiWE$~Dx-ISunE)oH3U9`6WruA9(X2B&xXSL4AQeeK%%rNpq| z=C#v!F>M2eL@wvIo;%Cs+4_pZm9Hu(PfmlgcJ?z>vW-7!zWYt@DmCQi{1?SY+nq~> zSKhoP-gwhT`IhMUhreEcl6YPU3(@$*bcgV=mQ%B7`G>5v({D~J#kJYl9PWny{Z7eT zgp>0V3!in{5TiJ^Yz{_k;NGLZoKg2JAA1@!q|ABj^LEV^4Q!Yinzn zS03^x8_JSj@<*?6j%^^AovG@*S{=f=dU6;Z7px+UqLyZj?Tu}FmT3@|-nVcH4L=gj zI&aJ`dQxW84fVze+!~%JbIa%zsc5eI9;v$a=Gno^Rzshy7-0uA=|2DU=OI`+*El*L zy?s;rkx*?3&C8}@Kr-)|EAls(cff*8a*`8u^GAxlvwEMZ(BnqLNKHG$@KVwmkFk9yo6L=EMIRQ z9d~0!QCaBxwTo3YV{GJJddHeK!jxamRVa9tnuDO zqr2X9)sg)jnA%^pYo?xs)o-(D*JiRNaY(4yJoWZ>!(`A(ndwW1; 
zzgtu_q0;KM)08EfWKL1?=z@+w&WMQ0h;4gF^J#_FgGrUEc8xuj%IGnpM$@_a;zZ|- z>7BB^MLPJGI`!g3N3L_=i(3~zJDT8JEKQ_MdE&H3ePln~ zdmyv$Kxk}Gna=X)bnsvi2+aPx3_#7XpL!2ZsA}@Y*t_Aj@9J!&*}eRq7vS5&!LE7r zG}nRpvI$Y@Z{sByxpU)@inU|bBh9vVyEyFWmKn9O`FPO@>u@ASmu6aVe5-9p7 zC-?Sk>DRBD8mpaeLk1{bDBlNPJAj|7@iH|7Y=Zp1_ll zf2AzOP>Fb}@`~VvTLhTN#Fl z9ImP@A+b4D&D$B854vGt;~w89@r+b~as5T5yU_9#vCLk24>{P>gv_pnb)n0XAfYB) z<})(|x`!-3DC-b4oVCWEDqc70-sEjtiNP~(F4>}NyE7~^-gVPiE}^l&xnjxxWV?@b zvHl315W@gO4;^$aNaE!&uY(DdBmOelvE?5WC(U(T2mtM|y!Zwu+;20= z+F$=u!UpXF+0uLs6Gw`pF;I;T{~=bBmO98zyZ;Q+LW!|S@u%lBjvH_>zD_(orb0Ii z%tGUn?H2*YqHj=2{>=RDt-24dVT>o%FL$sTk@elrEfyz-@c*ROcR5$Udp%~?q_djj z3az@sz(JnCKuxD&pW`QuUzq2QKiEiqyNL_2Ky8f=Sots|tp+nj3r`du=QCt1a(L=Vna5P|5dp!yD9?_kC2(S z^m!Q`5r~#%WH?n5SJ=*Y+uXU@Q8?3VpYbM2IUI4#Nd|B4S-6HrMN~zFq+NJ+mli>j`DGq{j{)BWfHE}?a`~<8UjBcKeR({T?H4!GC|enO$iA;5 zls%KR?E6m0o=REDh>(5Bmh3xOE7?M}P_pkuMXPNrk%kc7>&Emvzo)$K+dt;>JTv!w z-RC;zd(QcubFTXeDo|^ja#N;X3Pu?E3a3hr1G#;Kl2Za-M6CW>y@S-CMs1&HR@$;u zl1lH~gSj%-nMY3U23L36h;EEn%|>2=U*GXMpXGek?#^~$d|S9OeMq&rD4~>1MDuIG zRJ#_xE~pd|j6I6J<}dakKT7bjE)mR-0HRElFT7JSM3$VYuiB?yow4_z5cxkFiEuRr zqHg^t2HvP~o*QHovkxMVvHPLw3AzK&Aal!Kt1-%3*3|eiPL287&&zb69+r8B&v2}Z zqUMw0-L-3-)C?{Gt+QH`d`jU(WfP#oFK+DuA+u{I;3mx_-EG1hh1JE0Z{f&j3;w;f zc<;Y3JkXNQl{Tl9KM{H65^2c2ThVMk5mu>OU_cZEZ3PDi6%%bKm2Z%vShitMmsLdW zx0t>O-|Vv&p^(p9`ZJx=MDgWc(()vKI(|G@HOToZ65=}*+jdPbhMW+{A$poEty$}W zuYl-F(il)L_56AL!{Asshy+mzEUd6p@zVGsXeOlD>8oNca^K2tpIgsGTIjFFowq>cVGbG?^WgrwXd1g~o z+4U+8wDcW@|EYc`mh(7|?>erfrA4MlPXj2cbk~r&%ey@;3awem|LpKN9eiqfMjwsQ z3k{x#+Pt0P;F$JkDg&F*Kc$8G4#~PvGPfP9x73gQtn8g22iC&r`M{w+ryBQU9TNw` zryCS~XGC#j9*))h9#&@8zw7Vb4W8#)zK0siK`jPuue+7d=$}4rWM6QCm)>NtXjudt zu6)kYU$Rd|prRo1^oiz2-UVF?l>8im4|TpDyO;4tT!jbY<;Matooy#bB5+dm?qn+Vf_IPZ!g(I2?ApO^b{ z&@S!k=9S}Vg^6F|Q=1oei{+Wk*#nhoP-BR@%?u5UPz)hVg0VnP-<-YYEp+|Q+XMXL z5-#)hUSPVho5pVvo|U~ULJa!TjzF!CRupuwKV`LNqJB?+m@;B+67KQ=%+Y7`9Ilh7 zTTt=vE@aBp0Q4U9C(Sg7e}5=%4d^1BexQfQRqYiyaZ;oA)a#E9)^B%txV@P@uQ`?Wo`%uz}m+ycATQ 
zz=gKH)+o*4l1)0#;^dQ-ZhIpzJ>IA=eF#RKMSE|OpS$9Jp@gPmc=HO>DH3qVsbO>C z`b&>PUqMR@_geFe?4NRoX@L#Eu~dlzrWV9*BJMzQvVcg)H2D<%9Jbis%280_XVOxX z0hk7AB+)9FMWVG}w>Z~+4S2(KbyyHfUC0kUyn?3R_%6R&NDPNZ(*0=(OOdFN7FAxs z-ZxBIGm5evcjQ>Sg2oy@;%k`$QTinkP%sUbJbXAlaYS=?<$LbS(f_(XoEo(e{#4v` z@Uf=TiVbbu5wxaNyJQ53a^!PMsF@0_%kbxO@9%#jcvv5PRW<Id#0hbkcMbG|x(X~ekm;G;+njn97%Isa{! zWkQG;R@n+4&e$to6=mo0-Y-Ah8*7nZ*S>(djFx=n;<7kje$x6{a9=V0e@{LAN6`K? zgx$Oh{IO>;kG-+lL%i*^MD^Emp9ih^pu1DBd_1S!H^Uk7_U+pRA5bBM%ixRs_o!-| z3f{D6w*Az{BE#wANJc~SNL9S0xd0836UkwBc zUHZKO(2E&bXb`z9L(Q_xM$QL{!!-iz(9`a~=?&{$uQJ_yT)P|G%Z4itb|AFy5U>$o z>!??wZVpZ?nVfq5dOtPgNu7dNbQULo3!FXr13Cv@!4duozk|PfRY`_Ani89%Gs(oc zS559D90tJFE(-dV{xo^$zs2wXoePUAumG-w3*U$U`#^~JUZ~b#Zc3JeHyuc)PayM? z!zYlG$lkuPn7%v6bcIRUM=i2>_=P_zkCs(GT3(ux`@xOMzeOs$>ip5pf6#Ybw<@dr zIl@D8SWi6p;h5Lh-6yO-yFt-bExp!0@9&HDL#Yxc32u-Nt97)u+&{Yu!&lX~i`w*n zsAW#2lR^!fe<4&6=lJ~3ooxW}(KStD_7w0CPZBk&P~#(&g8Y*r0!S+NKuLXc!mm)0 z%gxFNB`tdRP;glcaVNrbH!NB&lzthNK@Dac$BOK%4p_3^9D&VI%=#WL>AHoQWdYJh z-0>?3K-)#YY60=d(0a3m`hbOl@N;3m4KstB=&USLk|V($o>gCXKMj>1C(_N7|D2rP zTTG3G6_`M&(|oceiayp6XYO7lRFawLM=p6heD5T-L0 z85wEb^g&KsJ{hVH1+QJg-jUCs+GAwm9ZC6|{mYg7--~5-r|X?Iv`>s8GK)sj1%R5` zUw0e`?k3^!pk>v~NwtxH&ci*$L#ravQ@^&R>E}F`XT&|Av`LegLkyg(w$A75?EGE+ z&@n+_DI%2Ffr4S8={H_tFP#3g9=+1Pw+n7-MQ1u7GaU;Y0H*Xe-9t5BgtYT?Ht7*C z#KZ>B3@Ixsi+HpK$9@(m&_B>j5MX$=y#k@K=Rv=WdK7EQgZ(});pF}oN40zBrO(1uu`q4b<*-&g0Tz5i_OUlal3{2c#0 z)-qUIf%#0y%T^faflFo40WzlG)OrSx%>{gnyZmwnN+kZ#(0_gMtA=1~cKTylVL{LN z=jgrI^3YurWKS|=2mA^T7Dkb@r+QBPxAt6>(B8bQ-JQp@QGAKrcO~ELm^&;8>0Hc% zl%e?;{tzwIyd%f%elunE8R_Wh#T%_%0-;pP{iifD*tV7pMp0<39Jwg@J7_kMyy|MtkVFogpJ(Ut z`S1G@e+PH2v3@~8X8>`S@{|KZ4ASKk0W$Xlz?gBxvi{wq0M>&BB;|FUW?|#0!)T`o4b?1U4Mdt3<`fc7fD2)6Ya3Iq2{Pu$y^~w(G0- z>wD4rA_J=|lyS^M<$-ASs)M9KSfh?>_0)ViI5czzKWQVeZ@Q$gsL1+>_K}kWUEs-| zpV#^r1IlTL%m;rI_V)I|pNjVQXoHWj|ECJ)fZLkd$QR6rU_jBoBQU2O<~O5yrLods zKJ{m#DNZ^3je($F{N*rJo*#*AIcdF_waYTytWB>*#|SqDM1{w4?+ zuI&iO-oJMJ>SxFw_nx6P<0?!HujoxWK(_cdpbo)ynSK(`A=rd^e?bU@J=-CEOb5ZL 
zB@`FEF!6Yk1akgMXX@Xl?gu0W7fYcg))(v6A!cI5<{-?tnH&JZSsYsYM@A9?h*`JU zJ^w5-%8)LSePwe$JiK`v;z``1_o zy6;jN(2+ikBK{*_f;(a8bB8AR@XFUp5%AuRzy5}W7w)cbGxl^c1L)%pxkfRp7VcCK z4}#!)(9zEGspdeZ-rNAR_Pw;YC5G;-EVAdpvvyD<8edWP_V@eX;Uh_)T(Y9;rPy(s zJ?GcA3IXV3fgjioKgg9>@OB=!F&xn-RPc}P)r1*O3EhvX;}^`#7}oj*JvPrv-T+*m z+nZZAYsLRx`J>Qe~RUfkJ`Dho}5CUCYQ|Uc4C1c>cFeqGy{SA z)9J(k@c584=dpQzY0%mO8+>udAX)`(Hj;sORqn+jI+nAtZ`27G4>k^mKJ@GX%ApFOF=SJuY$0{mlyWlCW81taxsKtoZqN*k}gQ^X;+Lu|F# zt85FZ>e_upblC?*Nl;_Sp__G+_sjvO_r5})NK|3BvxEb&#(ZUVc9xS(MB;tc(B1?C zKj!wsZJXTG=3Q3)x2XS(dw!b)r9z8zKKCv5jA|UIO4bUD>PxLFpq(=G2BGtUaJc@z zpR_Nq3{G8B%I)N=xY)b;8IHZ_2?0QCO_Moztaj4>^A#zVW{&(Rv3z&H`Sh2qe*qQE^Yp?f4A-s44(zj$w{AH?c|=d}l>1;W-kM##jp z^NulsJ4>nF`_Crf?gK}Jh5+chS*Q!zPHq}BCval`f?~eBtT_nS9H4mVOB&`Oh#Iim z&9%#v!*0DPP@(Ye5AR783RGrq)`Z|S5UbxCI0Ze#R33eI1~j9}+D9G~0=Q~6bL;BF zOm=7v@L6G?CgLT_SFP>C3`IsmE(1mf{1^}44JhuE@;w}IPDsK}uU znN+yuEf_c=1c^VxJIVoCE7RPABmZYL|17gcv03$Juc`^u6Jk{dJZm*cn`|MiQ>;UU^Ks6D2V6)B z0d5R9+UO!w)cKp}P`3!s_Ui(-SJPu-W9Qa3r-SEyr|7r5H5NLcUTHc9!SxMwZRGk? 
zB-23--IAPVArWan!pA^bupdu+>U#*s4o3}?&U(E_xCWs527y3RsZ|2oAloUBi^cu5 zt73P1`pHA>b)vH}?h4ppfE|8Gc7Sdd*&{@=EP3*OPsNWIt+^fshjDngM;1gXu$$3eHSbaYm|U{FcNdD<-LJeq zkU~K`gMM(d2p=5Drv|?3+#{a*ztpXapmtT6?xkCDg8lpj2FVe`3mA!I8$?h`-#Az6PI6R&vDtSBw1!{9T{d zZm`xC2QeqVHl7@?N}p}EFlQO!ph_BBzWHz>HHd_W-1(&LE+3M07rN^aZr~9li_+^Z z114g;*5NOmxw~>h5fKnjw-K&sxrxyA0w^YQ&>Ksd{jUU84q?1YJL|t<+}puWR*r+ z8i)A&=Q|g+)~}ka9h3e!X|_hR?Unb~+bnWGi9gHt!o!^wjXXTFtq&%1O%fDny0)SZ z1|GNH+Ca?=4;&pc_bQkQefs<4wF{A z0WdJb7d*^Sz$zSvYe3xmUdW2-UIf+Mp`%T;oZVBal3BMu5p^dc>%J1~q(S(nF4vyKOo)B+Po(2jxYg zcw%ncT+zC7@fgAPc|7gs4cd;67k4hJj1R}A!Ln)n(l*ko7QYo6`f73dC8qV!Hb8uC z4!N*5zV=hibXo!rOga1eq*5f~>)SdKDe3gw>c7E|H*yvEW8G!4;MCE&a))?3UCE|TnIZg}YYq*cy4t-Xhoey<#DSZ) znrA+2eA?d1DBWSwdbys{x4SVN>{;G%*K5d^Z(RD`Eh@W<8KId#Y(K|zMM0)x1U$-( z>DRHboNP%Bc2uoLQG%ZsUoEMs zfuM+j@JD+VTV}y>A1~L$abULTw!%dsY?tcIOcgd&uq8Sz*UZF#FUZmP!c+qOmWJyv z;a?>TBpDw0@U`?xax{iuZq>P5pKtojqMuHa`b@;9mKzh=wp5@)29DWbkREJGooix$ zE=?{Y2Sfnc?~)pftXC%##J;hYDbue@MohYW*M4Z9nHX|ue7>Q>b#2L80kc(a1^V-1 zAWl}-;Sn7#IPr}8J#xotGgwu_ldxu)(?R+r^<$~hUl+$Pa^WNL<~*ZT`8ys8 z2+SF2Ey41^=$rQq@FQ=UXHzEI6iL5r92@Wy*qWMX0;(7j84^DhlIB@W*K_2-lghr3 zhfikmCmJx{>ia)f`&^M2JB=srU9D4Bo+D{@QEAKN%XAcfL5Sv_xMIG;JhUbjOh>sV zWnBmW^zm~i&_VG9(KTFr;kO0D-tqK!^R)}l8eCH^r^MKUFZ>F z7_#~~PQ0m_cdFNaeCKNGcAfY!fXFc?gFPDb(Qr#s&DoL1Q6j{QEwIsTu!6`K^9tFyxN z((q0ig6l)K20tc;O-&L?ZzwPl<`tvHtt`fyEw&PK=bDZ)F2bedJ&QJ|PF{f{3i-9y z2)102pWgV4Wzb^00EyAu1&j5~B&99;VnK(LBZ};~43n~!A7{xn361&S@&5GY4}dlp z62%phN>F{-9l34aAdF@LwgizT1rLU{pD*Q9Y&{spjq9wE{Al=e&nn+Dy?Sl(L2u4> zdx8w?*F=hpo03*K^|`hO^u2e5z0AJ6=>D26jlK(NWqg$dG)pFVPlYtK`|9X!D1!JN ziI!d*1}>stT9FrC3Pg8C1K3rJhC` zjSQ#C!(ufXAEjNmRESvo5nLy}6z4=Nt4MmviD38}A7+h8b8IM69-OsccrOx+^jOXZ z4}%xHgP07Ce-S^IazP>}uwa)A$!he|c@?eq3!;8l(oo|c#1#s--|Wc7FEREVgAwMX z_?b9rd(FU)^k_2tdX|x<|3#x?IryDG*!qpWT!87aZfCW?FTFtU!r5moIXGl|2#uj# zy6O89f%X#rL2+fA_Lx^~f2>NN@?LCtjvgQ8eO}Z+=Z5~)qN!=x)%ETEke3!Do2P$j zi@*L5B6x#KQ+y_P!oBtKK;S50!3(GBMKt&!plb+jAl@w>P)vIEYzTMclnk<&*Bh;; zuC3J4bR%)^;C-_Tj^aS 
z-|otu5&L@Ge}lEY4qwLioBR4#lbH<7EFN7CnIkk=yx}MqL3d-Hhy0!5P5b9(eM8Dm zcVss?EF7x~IbB2AL$_jc*)oThQA7K2 z&2-Jq*uc-ILoSpw=}|I1cq&@fK8s%TQh&I;SnS(JO53f+eqAKZXwAE1doBLTaiNqC zQRAX+KcL!4sAV&cnDJA?bUf?v@AjGa$FULvP1A~xXD6gRTBklKXEC7OB;S78_mFu0 zT8rh^>7=X-@22*KQPBXjWtzc_sIaVzXH$130%?Vb-)?Pb{)`q*c+*_OvZaG;usW91+ zu6afO$j-|OK6S6hByAfNVqbGtL`MYW6cpqh4@-Z2-WNclFD2Czxc0dVWcH1X!=RgH z0;)gtcvXqpE3i!RMX{1zqCNMIm2*ux3A6yiL@f zy_6pR;ZpS-Jys~@^1BGL&*YSNP0m37z4L>Q=C5NVz8AtgpT{*@{tN-gbo4`;el}<~ z)JhEx(18eeio5UV31{lxU)Xuje|P1R9?n-yVKawh*t+YB^VQliso>lL4z0B|_{~!}96rV4tr3>RK*SDHvNB8O8F zyuqP5tCBf~0fi1C64|w$uv}T)of)XJ(^>SrTCbs&h|bX<8dH6VK7Qn7Cw#8Lm@ze9 zDgU!0K<|lCP$|Gkr~aa)t(5m#WDR@u=H@g0b@+`ZcYKjk;R+E!#W`a;?=I^< z)ono^%?B()&i4Cv&NHevGI&6LS4p4x{+S2k)ph$z z^@h$v7G$SOhFltI7PsIWJ!0!punF>yZbaX@?dnx1W&@>tFN`Fzt-9{1#R5XxdGC!y_+n1aXkcX$cBPU?#?QkxOr znH8d%6@ot@{5nz)c8#?x*Ke$gID60@DL7Mquld8pUKyW@J6pbG**AmN^sOHE?Py}7 z?Df+g(MFlNtjz@eXz{6`>G}5P6Wa7fD-qVim#OV^=KH;ZwO=B9{_QQcAI~Ber#(H6 zn%Mr5i@FKNDz+`HxFBWwB46z1$kjps&+ANz$H~!NuDv@g+w~u5o4A|j-5gI}ipZ2b z<$VF=z$lrQ^(@2ONW1#@-I_diGqdE@0i6o`G}=#3$d9 zTj`4`h7Nf5IaRWYkdRTFh%~&vse9h9s&dBuo$8FxaKMPSjmf)+EBd>cd7BZ7_1T8~ z?(6V#t?Y%gAHv>vb(mnY9QD2K+@3e_(cyWW`0)$>^3JFtv2?p)MrlDnK#XC^hmDN_D!GBD{#yFR{P|b|81h5>3F*AvmE}NF9Y)sC1Q;5 zQlS$ZD;=uv#8DL$#8bV^(_^#E%}Mxdh^C+ON>y>`Z)~b-ioQn}BRwJ%9mq^QzhQz< z$nDX<6yBEl;TNf~EO_{YC_I=oH{$lqmf7u7I%NgMGlpZi;e!|W+ciY`&bbF^Bj=kE zw%4Pz`{b&Pw^cRNnS8_BC6rxp^72T3jrURE$_ zE0`-VNI>=SyPqEv0`>CwG4`uQ?ggdq7QS#jb+~%EG8hle5(5}8tZu8McJ55*k2nj= z6Yl{!f6J_vyP}9t^8WKDYR}qn&wjN}EBsN#Z#a}Jo=E7E+j z4u7X=o{{G_=NlhPJrC;#tjawA!w8Tl5U)63R}{U&!sJV2b<%ikWCPL3KQ^fYYuScL zj( z?Gd?%_0<%iYH;?*?&bYZ&V+>x&Ts0;^@T>7FN23uf4gu;TZaz^ahH;i&t zh(ur0aUkB^FX91>DQqZ-=kpGuUL97el1g0ZGGyq@lc+gArK5w4Hw+hdQ!1Q3s#1EK z5>?oaadn%BRYeEX?fu>btw z>8aG!s9Ng+Q*E15jN>Z=$SKtq6iA*29}XL=3^E^*fffsd8W@HeBhK*Tzw(k_9P_p9 zy{q+OXN_-|FXi#4MY@#OhAXC2Nr3d;!VwrH=|<_EUQArYz#N|8*5^B}wUT#eDD{H& zE(E{lHx$tqwwWBjdx){;*pm!)mj~&ktJ%w~A_cUsN1M>hq5|^!LdsHUKA#|qEmMrL 
zFE|(Vz~_Z?Ce<37-U~D#_Qz}1hxM#}+I0(HLHPoJd+mu)F8UDlB6kI>M;p(iu1x%^Kd8aNVVQjG4nuB={7s zZ=9yUL?s*9j*@3vPHiPWk7UL3PbXp-?uwzd_V8+VjL+8d*J5?o^XF2KZx&vA2>=XhRRMk_FLTxeV>%Kqw(U!3yi}p#Ap~Wa+<&oU_;h5B#c+L z8sUshR{27@KtO3}cL*X*OyG3!W4n^0$KuBjSL9GHUrUYD)J{=>FH2 z;v3K&o}XM3CVUb%cSza&nDcN|xXhA;Dc|n5^JBTg71BDrwHMEJ{ z14BQ?F)f9%2Thi?Xie5b>$cU}m$sD~aQs^)nIlFf=HyT}} zE2=FwlbpSmRn@M%#HzN<1o`Ro93sz6bsVIHNmG_lG$u?^KR)j8Vn7vPR4q3xwqqFR zX-4;!=k#*;faf=AUhjFmEhpj{%DHv{+W_P5q~wjU-Mw(fk$)`39DUU%QtGvMue~B-1Mt3llXrwxbrN z*u+|7c0LE#?K_QoL_ZDev`$vI&DIKB6gpdQW2AO0k!wXps8>%;4<_5@H5+x+SXrE6 zopa+VCzZNO33!PEgY?C~pv$9M`nQ8?w$j4yEZkjiKk;-WR}P+Yt~uK0Qd;~->!&c8 zv4U+ic53+saH2}u8?j3|RBy!G0`TA>@jy8U+kG&TwZRF>aTeVJV}7n5OMjgLJYa`>0j=P1R#E5?K4PP29vo45>iwo-L zHz?H3UU^$}>1aZqs9}O3hcZM!zhfX!)z4Q$DXN194w6hr6#m;;YR+Q;|5Io1w2*Nc*UYk>DyhMu|22c?Jk5 ztlHb}-Y7Qh7>=#=tRr+_tacqZzArTVRCv9wyY8e{X)kA_xnik<=uM#%X|VMSLPGs| z2`j@YSQ*;0+5CGjj|aOCucujOAP8V}HVxa;Q>re^QWxt#Rd!{NDf?^PI!1`iq3Ut%20AVAnaUnXjtP=P z-(G(tR3VUb((-jF&u~VhR7j`OJJj{JwwKnjS#gU;xT-djJ}4@c9nxa_Mi3qk@`mza?bT&U)i!^bjIj!9~eN`-}~ z2wgmk+X2G7Ak^Z=bDsll5$2Hbu;m2(~N0ud4l?+r2C%5@7#QO1ubpH7vY7)ApIuwM)<-soBQ{hiYB)a zGwXD$E;z}43`E`fhVdjTskquRENT>p)->eufp#d=IDGOw%K5jhy08qpDZH+EJZ6s{ zn<3Zzgc#?;#0Y1?v1u|k!rnt49Z}z}(Rh~3>L~TKTUpWL%(0oWAGkTd1O!y-U^5CE zb%8-Fb%|d;#x@*be(sPsYarU6B&=FNiuTs$m&1}O&+qxatp;c6;!y7w$H#oDzTS*; zJ-nUKB9g;`G4y057_zfATddfUbG?o*n?mHKeY%(PDZ z{uL3HQQ)?ArO)yZKEG)9a8)zqm4w5^AVCWQ;T{5 zZ-vI$&%w>kD}LvY={mM9xaST~L`|2w4Sc_JxmMkv;d_mH@$3_os;X{BoH?U{Df-8n zxD+UBZFAJJGju$}0?~9E5&0)C2cqd?96yHxTrm2n``YBGDUrhoUJ{~1IMn$_iT^Ay zExy*;;|A}9O4!Tq?KD5d;%8DuCe7~qNwLdk(T;mO@5U(-;z>o~Dd}%rc|OG!&n<4H zk1d2~M}N=;Ig>M#GjZfd-Oe^)Y>PP&`O3zMes9b132(}0xthd&o$3>-vU}S3E8hembA+ z`C8&gXZ4EPdMb}hi!);^-de7l&E7}z%1V7hj7AULAb*_g(Mhyl(<#$ zHp-t?J*9WAN{xTrz|Q^F<^Hdxb@2){owwJ;zB&ju{J=jT#4!Je|Qx6AA7? 
zvWh9IE2Uhsl>H%Qb52DO{rnjB#P8kn^XnyW5Ur3h2&4QaE9rVNw{GU^L?=uKCYung zkd>&T7Sw^uLzl$yOSS|Iu#wD>6>kgJhrhfb1r&ojATZd70sm+JU0tHCl9q3de9C`K%LChaPuLJpb!(_tmnnbT$J6vTUX%a$YC_deJQE z0uq(yM{59e0I>RbI$;KG#h{NCP(|Wq$#2N#bPW2!bG%+$&Yk%BxwLi=hQo+%7OTpF zDyc(}!wN}_u2U(MGg5XKrGXu2&jpq9J`Mt1Dw+}rsJ3wc>tG{{a|`H z9m0cclfPn=(IZZzp*vc#q*kh|q?OJyc0W zKX~?7_lWe$(;uWW0$Vq__)K2SrEjiZ@)d`qq8#?wpSDpG-*5R(S4`h$+f#2hg z8$GWK1i17%h%q)}(L`s-K1>DoASh6wlY4>IA5mkXl-|^{d*1scx7QHA0e4M5R1Fcw zyP*_W0rF(9_!fIJ*R`jN@3zdR)>;}cOH2U6*#MZaMi-|#qs;@3c`bp7oqg!PM}bpy zS9R&Ihw2ZD9XJQ|T)mvmNIL%vr0nk>9>y;qGc;YODlAUC<&z{kVObrLZf-AT$X@Y}TJ#ce z5U+~V7H3?3KK|j=M|-O8c;+zeu?xC1 zY=?gXx;(s;FK))NrBg9G_6e%=Ro$__o)FBYYU8PZ*EPNv zpUN=CI~1yh`yxC8^*Q79yno=iuWv|mtSeQXrc?}g6@O}+0QRvIL39L=Hj(TxG})Gh zhOcwLzh1*ihlfIfRnGApa^h01VLP1m)x7-;iWwcZlBIE~iQrLdE@lh664y0*DaZP5 zXm3P@xh;uX;PQSxjpxro*-t&E^m#C~OOXjAijTiqZ3Js0ldgq@8&w!4+>H~cJvnx( zTWC@_N8Y(PpwR76w)vC`yOn<{BMwalRhz?%k$OnfyQ5TWvd`Ka$$rOgY2=|FmOF=4 zI1w00Gw~mgr~!<$U1}~BC{4Mqb4oTYkoB0ht@SAW&*}#r{Ra?5jdI z92c<=lH)Tz$&fO08D`^Ftl;~?M~!V^cou`ctBuS9k?%XeFfsFtyB(e6B}3%PS>D8X z8I&+lTS?3e+QIN4$B${6I)wV$*+rdN_9=W9^?w!)huM2o7cT0NW0v`$UjiEsMEBO@ zHzbac=#GC^*498SkhlDnU5>g{%U4d2dD*n`(x@}?NM0`2Ah!+I)>>22vP_TuewEbi zch(7}w3*VieS^=$hH4{ux&$x&LKP;S%+~BuOee<5?>KmwIwmsfufZxY9kt4+fMAB- z^1koK_=E{~qt?P_DoS~@qZbHH{Jv@%y!*S0&cp3Upadi|Z-7cy0<%vv{MCCzT& zW!)N_uBxg^b6+WYb7QG=BtQRCa}pZqfs}2R{es}ap0rWK-7rU^852;^yM1Z(!SmRl zyi@^wJa=Oy8Y;?vZt28gg`H7Mq6`DtU6v(s>l}z#8}V*soROE!#AisFR%e%ePe+aO z9^GtK3GAYEqYMR9Qxzlyv9D|Sm3w=5EjDm&5f5J$X213#=Vw;n0MjsLKr0Z9o(nSs zi+3l2n-+?Dd(5`z#%d0{_~v477^2frdL8j4^3*?`&5sVhf}~gSaz(yXCrZ!H@cb*` zp*QDgUQV}ntytd`Ys8@s!Vq%540RKNR|DO97??VXz&_iJ$}fn?H)w|-mwMTU)qFw` zv-ug4?6^27CK>L|HP zxIKg%r<6dRXDN$1+(({6O#swCOhshiMHV?#7v|nm5T8A1l%R%ym-1;%*!`rss=`*5?&8jl- z=!%_qAlgu8dfE~Sg-_|sb5bNV)VW(1ohz`_@4l|2^NcjE*uYwR=cUq&zEbctoFGF* z{1ck0$*~v8ul5`X67jHo^y!DF-G`D2ttzyPSm&kC+TZHpuTLrLJu2&*kV%foM|+%P z0`jKbmG}7y+T2dff}Pr0k*S=QtZAG=xV^{ocSV#H7dv=5bz$=RT@j;WYU#GtHe?hb 
z){!D-aNh!#vG*PW)+#Tl5s1zvH7iV4E~LdQ5lnamqKl4_lab}$7tj4?ArkoPPcjmK zjLD9Ug*6&1MyZ@!=-(wL_OQ)Wt689lrb?DiUUqGcfCOl?Jw#f!)Wq|UTKBvDuCbk? zgaFXh*IvGohZs9A!^WXxso41ugYfFrq}@MP9SDAbha?D z&d=DwS7?rCnyx6~C_8R-F?8_AHnic5yq1mrnDj!^lU7%qL%1&E^uy0j9De;s-@TUn z4(5i)K_=?#4DmSvez3$T@+E`9H(6)z=&)VCXi|#HxXnIgRMk}u7i(~BzsKNl^(q#9 zr#;huijw@K!*896(S_NEAonb>Id5B01Jnu1olFSrU3BSDhyMW(I`w8pB*SY+neCQr zG>3-4MiSi$CXLWWdicNYY2b}6G>)sQ5vIUBeX^3(z6Va{x{O8H8Z*-mMwO1-p_te3 zI6sJEhe5)s);#~n_}*$EA-3K^9O76-LQtk4hq$f>=sb$$WM4vT=a(mL>$UqjVwkl4 zn-N6#`KiHzy23fcZ$HSU&P^1^eb{>uD>+!dQ6AHS_=hb)Nf0GjxC$jnX#)~8 zG1YiD5bbH1X(Sw36X-_p@n2~%H-3ep&2i*2fOpeUq!n)nu`MSWK5pLC?%O$TV)098 z4nO+XIy0p*F5G6RAS(S;6;-(NIdep|NLkR0b!s(tlsCw3BiDQq&6{T49j-iK9)AKg9@ zJdz=}PpC=Ww67IsiU~EZNk-tV&dOoMHJsX5wv^eLZ@nT2H`+A8%_n)P0h6VN^&k`cd@@Poj#daaz#1@P@b-;PE8Hpo|rlN!`*N(i!;? zOg0PRRNE&nz^?7xsYR=~v^YbDpjMq4Lq|m0+C-d$j}se$kQu+?2zLwDDnQ#tEUjlW zs2|7%JYQwb&S+YGQrrzTr1<_%2C`Bd^5glJ!)aV*sY~Q-JdF6lkrf%|Pu}1i^irME(58{cjq`rmMb!^qx3xAT5gNaMbYE zWK!In5rjne72Y-&nI~RN?=F&Q1!5}%tJRVUp~bhKimXy-y9*uO3K% zI@`Hi8p+tR_U>|#K2Ems-HJGK3nlcQf89|sRrE|}O}$Qj>gp`r+vK-2?>M`MJud*C zZNV)8itrpzSY*rJs#;he)cam`mKZP_5&VldB?W@U#Z>9qBL1dSfoSt95xpCBTsVlb zs%Ew^?i7|*9zy)fzC9? 
z_!Sq6+jUsdaSn@mvxtu6W(oWgd*S*2AQ$d@^5R=)(7I*PCjloor~x>uI(93!Tm0D*9yg*Af89-WG0p$C+f9Un1A0%3CH@?= zdq9l6xiHi<5-j+886PwJObi_DWYrVL5t!NwWXAg}TV?eg+gA6`%k^9$)dEgM{7}M# zg}2_4F9fGZ)vRZA%m%qm z7j%GuEbkgf-mZbvC@z7G-Q^!_yMRWm?c*wID)n!L!t-|oBRhh;06203vvu^n7DP?w z(#L)qwlBcu?I>$c&(#+2zDUiE9Mf$1px#Sw9$8Rrhl(CT0_a$YZ|mrt-WY)C5TP66jTsi-$20cEZOyZn&uajBQ2BZ$M>12!iNMIP9@SF>QRyhO@3s zTj^?H*k$NhF8%9Sj_y7SJMhenr`xpnah;sHcKL8Ln+MM@e8 zkyQGIwVCcAe&N04w?_WOlGxP4f*00ujH8BtuLA7b< zfuHZ}DBdak?IGSk+s5sBkeNQ@Z`vNxm_G**_xSTzK?D!bF$S_1I}2wSDWKZcyt;WC3-S41_z|Kr(vzTrN-@6g(2@<-A zIPC~NZ8I_VdG$XR;AMg+u<80>A_V`bDLbFCg#PEhA4L1aG|D-h;?he(rFRgZqr%M} zNXA(L^#p||UeE$`T|ba@Y`oXrD+84(e=O0`_UCR<17=`IsF+PP8glXE<-5NZ>H&C0 zmn8I{l4+LvI^br4OyYe$vwLti3Y7v~ovHiF5v*=|m=FKw>~8-tA}VEvv6@}}0oxBb zGs!(hZlGZh{Jj&nsLx&79HvcWfv*1qIJ!y>z!K>xUE%`l3B0WXfr^Vk##mdiG_X>Xpq)AE_io{7{QinQ2%=Q<4*VpD0sWnhq;*Qs6PE%A7 z+f#YArZo$FNsh1ZPdjW~zH-oj5XQk|(5R z5jwA-{l4;vn3`m5#>q+%w67919M?|g1@=%t%*=fxh+@jp$HJvcc=+%mzr%~=#vHc_ zca{8C{(bM;L#Y;NP#{w(Uii!>?tZJwcOKtG(&M`Ahp-S^HfA_cP_Y^Z3j% z%UC|nHnk&8)kh}_w>{iQ+7*8dzNr5k)rWjf1Is>=JDA+F3aqiwn7|d7CW#2>-d$w3+rv9RZbFX*!xHCM)o4~F9R zg;MX(pcM4lX4Y*0l{{4T@u)D(D3W(1r7`_@g@zGA1-%>@FQAn}&A#Gv$pK8GR z?=#kf4zNe}rF$4w$D@V)K7wyWUjmE-Y4sGjld1nQo-iET4=UQlE$3%`?(@YM*v1X3 zLEL}N!y+$A#d|{MDJ%IS^JcvlL0+H^wiF6=)sXnapN5EpK;Em4&4urIesJPH6WzJu zdf;|2SLLu4sg<5-Ub?qCd^rhTMy!A?JEMM1!?hpE67Wf8v14Yv`+Y={j;*`dkqwwj*g98}6V`ra+vKew`r5Y-f& z18L;0UD`&VVO6vlen%U*#JwCZn#mSxXUSKua-jX3kk;m!;uFg!%DvPG@nVW zG2H55x7#^K{&l~*&edM{sdr*LF*oeR;BNnl-#}oOvJ_7&tX^&3gfvu)vJXvx{P1H# zjf39sBf*0od=4juh*m)%&p=U1Oieu(tT9O-SNZB1DZAXx)Q`{WsaqpMK8JA+070(wfQca0V>g4af=^k_i-tye#Mnj%3IBp%? zzhD;zNvwC!G~+8>N{?Ek$Y@*aO^-|<@Q!v;5US&eI`2#;PChNZK4Pfd?l3cH@7&5! 
z?}wEIzn#z>mbBc_Mr80=UoB~9o&MzAKiYq#3_)X7?~9)HP)i^lgSAapTE(#P*$_WX zKKQV;-jlkn!4J2pQOzI^?2z#2thA%$w%UGGBd|ktPZ~)E4cA>GqX@`Jy4^S*VPwaA z@#~f~zF27wU<*ys@D@o?@R9EXV%g+3i7^gPD#NVL*8Pv;H@lqTVhf+DIbVx7cfY0KcnT%(mt zg*aR$ro&loqGh_oJF;nA@Bto#-W!IShP#t*H*+M?dnuSSgf(ExuAw*EepIJAb_nQ< znh%4s9n*V<8JYU=0knYpqlBN7iv%c~nffE|;a#$~PQCAj>0ubNfD3aSF#p0SM_cVz zYLLKWTg>pgG!mT{o*+q-h_SQ3I~-LWS#?ss{BPBvK~)FpwGdaV-wk{ODJvGg|JyHh zcGX>+>|V5cMY;@j4fX2PwsZ8Z^Rqe%J5z#a^UKu#6)n*S&@lH?@4+{V$h3iKI0R%1 zcH`HQP=fNeOMjwQM2f#6AaXra+nB9w(pq{>vKqHGZmOcYt=Uq*%s{eGm>9C>!_U{3~@dgDpF@GC2c&#VZ%fqLpo`~NUd}7{c*lV zO?)xm#Lj5*3cZ4lc!&J$=PgMIevfl5MIW^@VAC zBhMGy^p6`Dh+p^b>goT0=o=`V0Nyw|^@WGudx7rTZdVl5kQtGP-DtK$JY2o%?Vyy^ zxbh8Lu$1_7>AA-4pd1?gP$C=DnVRcIam*T_N3=;~Xv07;E+MWH)IpWI9} z-m{rvD{Fz49jy3D_1UQ0vH}n-J;ETUQCQ!hwy3t&##+tdTajMg zCHiW0TuOY&ts8l5AC~;hkS*Fu;Z})sj8uSTFH$e}F4M*d|Qx})&%E$kV&iT6(k-S9*sR>`v2YUga+ZkrQ z#07aGu;dxkR?94`Z?t#CAHMh_Tv29W5R!FpStik(8evhly!!3V=z=?w67P02;32B+ zSvl`uO$YVZNcWE4!6iv`l=f|?TXF0a{#A8m%hOW+iOFq#RHLYyI4V(NgdJ^xH(+lx zuf7nW?!(M*lwn#Bu@Hu1RURQIaxPe8s3yl=x0av%PJpgs>sxVVrA=)!mt9@NcB7g6 zl+m%J0OaVbB3sS->eTggzhKW~wR54%gL$G?M>gbD&^`7hwL%8=i?)A2(*}gN@u<&# z*#25*_Rcc{4AGBwKmndlbdy~AzG@^XbpIJhQjbOAf+Tegn&Wfqcif1nYPvQbC zoF1mI9BZmDr5z-jvhF{oO+>~jBGtk384g~!T6``hqz_)57)}0%UVz};FAd0{RZ<$7 zM{njb#Y!e76el6@;{(UEVY>GOQCVB~?nK!H?IZo;ER%!uxs;ylnKSKb$C1T>bAnzy z)p%O&*Zp*QCS4y%dDm7<*QIcS0aHVM~f=J(jdE8XAVx2G;*MseVjy%hZqW`gA2<&M$Bm^sm(D-dW(#JmEd^9-IwRVz*qst6x5T zA0Fj+5dF0J4-S3niYzFH*tz1eKGri-Gu4-m>c?l>GyNA`QxGc-waZQIWpLG?4EG27 zKP}9p5tec9MS^a;fs$8Sj~+EQxedckn;cY*bCF2dZE}#ehY*|dcU_`Q6H5pWGm1kX zVXqzNz|4bqndvoFcdUTK`x#&tER6=V|F7ZzUUFMugDEyqV${4Od4xyjcJ3XyQ|*)T z6OT=&eXr@|>3qSw=t%U{S(u4VPy@2AsO#i~^+k^;ia1q6jUlim*M(~t1%cxwwYz3~ z3IK!keB_))mc*oLFAuk})Yn&cT6q*jJgY1D`-G(ED7_BSs`f{*D3GvC`^;`sl+G`2 z_I2{Mjva3&&jyC?N3bgZ-tYD@a&^y1X+6LInKGuy)URo{KMd4mr8xyS0l*FZ%(w#} zsx3iylP^fCKt@VBe%5>?b*Q-o;^F7sLB+lNa4#;rGXS*~%+I8VvM)=D3J@^CI`C3`Q z+f6!Enz{uV1=N>{Hgw!Djk3FAw&~+XH!#VtEh`fa`IeM6UA9{U_t*cS^`b$VM|t#T 
zOW&iS9_mRkCCbrrp?!+A3K2JWZ1k=#gQH3Jj?)1UI>ahGUq3)rLZIbRd_1$j%}md5 zD!cnf^1q;ooO5?lSw3C9ovj=)3K2N43wU(`m8gpj44FR>`{%C5=y;CEf;o7~+>FR|v*PzjlhRlw!-u@RNrf6U<097!yd>V`PLQA{dGb%iv4!pJdaHx( z@?8HYsNB`QdezKIgH6orGsvG+lsaE8DQDk>?kh(JKKA*-$%qmDqf>B8_rPbKc$f!D zcRj+F4!C*;7$;{XQ`_6y{y8~0Y0)nm^FM!n6N|sARWK$(a*B&dnBR+)iyfQFGxgwArv>T zyD6Lxy{pgTT!3Q>Hh4Y9VQ|H}Wuw8<TTcd8frGOOc3T`8gt(ZtB0!PDqE-dO|EDCbTzxEB+7^sF7s}=5tUWr_6Fk zS~s?-=QYwWp+2n#f6xR923myZsK0yg=b^?@r#;o8ec_5d3Q8%8sc%D{l=08_k7{dl z!*i8gY?2&0H$TC0a=xFtEDsLv`qkeJ!36dT|pI>B8v-tD4t7@my@6S@%*@k0(^dSM=AGG zrQLN z)T_Tffdo=;J``dc`qk=}-l=>oYgaU2^w`mAJ^DoAf>3>O3|?-3F65*rvgvf_TfH;A z^W%q+?CP0L7{v{;H`-Q2Uh+8grg~Kd=VLey1MLw#$)EQ0rha~^aDS{KL~5~4;=W4i zHjpirNv2~v4GWYX`RXKvy#BuAKTeB{!WbV_ev$0ys?amU4=aE?Xq zY#umMjkft~vBip*CgTrm;um}tsN}Jybu7nUuvLfY=6=zH%`iF;o<>d$>%3T{2-Bb=dS1N+%T{)q3d}q4>z@#(Di4dzXs!Hx=2hM zBjVNRVJ+po`u^q@GrO)@0VRqN;;Xwu3_8`a)}J}qj?@Tf}uM!w`w0TRd*K=Dw z@is+1YbEFAdOd?+kZxARcIEUdY9eVm*K$wglUHkR`REDgO4;a~hX$*wjI1PE?XHE} zT6;w0bis4@MH8GY&MT#&hZWi0Q{7~kksYntfXh#m5Q*;o`Hvn6hpM)kVW7o}E|pVv z_h)DgH2+GFyu9Rb>@&9uR$`A`YO?@^x*e3M9Kz2jOMUvMa_hdKDSSf&e_)0&DsXzU z?&c8zkoXNIvVQnk3Q1^o=&HBlmU7v>VQs@4jr|7LtO6U(G0iN3--|kueV*i z;r=+Q(;V!?!Qo)mQ;n!XKzN5L61=h!j0JkRpJi6$Kn}bbc|wFR{J>C7qfe^|ZQWUY zJ96MXOAKE(6_Ex|#KL{E?Hj-f!)hG5g~$D>Wz;u%B*1d@YN+#XPR?F|Qf8gt`Ga?i zqZPOwv;*zCT3hE7#R;bTQ~cEUbUYS6_6NTS!F(%i%SyOro2y6X8tjdzv&ERdkeaZYwgTeCq|n@(?=e zCP;aj9d9=3bbVHC99|+e5~C~!&npgx6(8iMXOa2okI%j|`!oi>_~mOFBcfQ@xs;Sk zJP$)YX0yIW!{;Za z0bY=t3cPd03X^lVY+aWRpweTVu%8q&E-wm-m+#)vMXD4W6KUqasTXNJ_T^oz_6Ws- zF~3g5?LIw440uq+Z*ZOu$CWBUzSgDjOgWsvlG9USKxht&;!Jg_rK#6$q9yS4B|^LHj3 zfCYQWFB|(6PxwvKc~QwVu-(5+ZP;q^@A=DnLzS*PCQyr}$i|H+++5abr{C2fp>b=m zw@mL*L+3xeRAI;CmRs3F;|4X-71&01>%FgzAS$&Ea9An#)>HMZ`%(;!&eQ8bn`Yhj z1NZEv^)R;wnm$wJLv2^SDMU-uhg1H}0)R`XHkT7kLr{Hj=ponebp14Lf34n%Z0dun z3SZTuWd%Gs;+6NHxPrS0`jQ$DiR>Qa4Wv>=@&Gc~lO+XBs9x>*0*DP*Qag$zde2D@ zXl0K`XmS3pP+fNTFBdd|&>{2=#W08SH_4R|_NJKKJjK^sYeZh!J(DCJ#7B(>SM3fH 
zv2@ke27HATx^D)QtIssKy|3aR4oh*4cT$AxHHw`!CntO4D7|glM|ZIb73jGy>dYh{ zC7kC26yFqzn+sF$4vNo1(<)!kD|oaH55Rd#A}t}zI@MVyXoVO^xKZxis;J_e&{H;H zZMCC41`8P<9Fwi1C_Yo?v7Kik;kjY~#f(q=@^EI)JKlNWVg-r|8~;Kd29g7GFRmHy zi9(O<9^^v_{PEMK`6x$aS@-0K?}BK}5KnJSl+{6m!2jfGbWoEhM>%_Ha#W$b-_L3} z3}bim{M=cSHzDcu9`G%PPOlP+q^rFuA>%LkMOD-M7HXdhH60GjXm{ML&!6xA$mUE- zbe>)BqpHvi86w5e&o#>0o_?}eIqtC5U6lPij48m>^+JnUoXtaQjs1$nB1J)pf)U1) zL6FhIEO^|*lyte`prv})XcWU6y3MwZ5E*{$!!Je&MMA*M6Gv8P;SVfSeUJp!NT&r= zN}a$3WCkB26Q-Y#5q9NL`@g{8LY}f7VXpd68GLrG%z4?8;&>{W45v!$F|6QTeIhok zf3CIRlUC6WO3sJkcbM7WEaHO*(35|Bohr}%AuBHOm3C5vy$nX671o6FP>;TTZ2vi} zF;~YZkzFtA@_I@Kq8uwc4ruL@!HpVW3p%w5NaH1^mN&ir#Zf_XiirTRD5b*7k+q`z z)9L+*UlVSlya`-YaH&gyu*-(rFoM<1`ZlJg1%iLqf)73hUpeDC&ek zm}VeBH<3$i5^pCvk%ilMK#nYa6Z$WJS})1=w7yZ@LxB3rvs#&SZfix8b!&cM+fhTcj%DT%Q8P1OCWc?St75z(_;6!dQT2-A2(t% zt`oht1>IePsH{?LW$w5+`d!{a0z>-cm-`X3F*pf2#r)4;YSMb`5>nyAn5aBR9H>pM zZLl`*v@<|58K}SNpv3OTb(tGjRk~7qYukvA$9MBat`(t71iy2OlTErY+1;(UP|MKM zjav`>36j3yvkl+BE57aN30dbMYsVc#h_rKjvMCYKiu(@jEL zO@Jczd#o~oVSV`$A6=R~?6#PJEn-vr~ycI-*BdcdekU04=Mes;SHw#1W74^ElPdwBfIMV zsn0`Hk(gpa_^eqyG@G9ZawU}R&LoP+XuJ560ax73b6QIq>$lKrSyg4E(fd-2hI^3B zYmc+A?`Y-l+3?Yi%k5<6v8w`Sk)EHIxjn#@)SY@XqXik_FWeLcP_ItKq8r_pS%7`wC|~HhhShj{D&i!i;84?CLYB#VB=WEQOem zVgf-2xZwA2K|b3o_T$5-e?5expUGvgN>zUrEkd!}rdn&Kdw3L&$s-^#5o_d2E4+iub@ zWW1Rau~GZ(0<6LPW-YtD*dO%=YxBd$*=yP*A#gdlnq>XPx~l9S?S}mH7BRkPv4oJI zgY=Z+GRC3Q6h%yPjRyC8mrwea#g&`FR6Eo{2-H&d{F;`g`ramlOAHVX$C`A@jHH5Q z-<@V?SRf{ShKtYE0I`Hm1w7PwQDJD)9} z-&;KVY%!i!u!}V|f1fTjY*nD6qcb%dj$fZ1(xvL_2?Z!_JkS8#wNuIQ1{vvpAsqco zb4%Ipk!_@3nTXux09KHADir-|i4c&L`3_x{I&DLo9T>V%&xWF-R4O{ya?7fjQ4bFc zJ8KdMdWqHtA0Hkc)RYVh82F<-{Tk$ZAL`Q-;Boz}0t*(?CewbL*)tsImm_Pt`6az%$Bpro5x9imO=oBp zqxOmdw%emeovh96ewlcAjhQCPhx4q)vrv6)8_vw30MPd*=5=Y1$}lTV5j$)0Sg)T& zLdr6-<<0DBnV;a6sxtm2IYqk}KDE+y>HJy+MH*qTCq?4*0Z3D7xIwEik+B5M=F^C= zy;AQs%lyRVNMN-NR>kew!Iw*4b8%QN$m0rDo9t>U69|Xui>mt^JMB4@9&QXv98FgQ z;PJY`RaI1oNe;&7GCJFm3`(wlW`q#KpZI+r-d?O5&VEgXOsKd(o=t=sQbwW`9u*mS 
zXucA~GjjmxU|-#IUssSwSQLMtjSfQZB>Y*ugZaX(KV0KAk&6fZ3v>@rN6zuPAqQ@* z+O>|UTFS`-cbR?W1Mn-6FPIX zSDbWT3o6X+@=XWTT|nmo*S8hik5Z3U;5z!H+ecbU5RR6Jg^XAG_AM$&kDY^D z{Dqp^L%j!y&hLG|`*t6m&l%Y;q9d#BUpPqXs3*GQRp5$O32zl}Is-Ch= z#(EcP7J^hqwI1?p+8(4HU7>+{jlWJdW;+3FWHYo9j1k13Wuhg!PBVEX01X|o59I@6u1T~l3) ztz72tn2K|M5cG|?FoZIsiz z0DW%wtD^DuL*7cfK-gYy#A+uRuP5~u?*%Ooz8CV}&HT$b^z94P3Ks{JyX>0n2;vS^ zf(jyprzmwm#X6eKs^hX&(n;o>lM1)+`KVT+&S$cu9CS`%>!G#AdN!KCksv6*rJkg) z?y;5=vGUz2ecS>L4Z}?%8#T96zAivZ3Z$qm83ow}8lC59$UK8)(2vc)!Fct+9i8HW zGp}_1Wt&>F>8)z-1pJ9w&cQe#mx48OZ1{dl%%B(Lb;@aVwWuncN(spjYLnY%y#*E1Mh&b;``01)l({uO^8PGtlYc4 zjA)}@xyt5=EtP8oc5Q+j@Z2nGT+-FewT=>G_AgujW&(rcfQm=lDHgx2((V?=nROce*}J&U z&D*4w!hMS0{}_2hU2*@RKai)(N7BM@A-E{;`GeV)kfy4sNUd%$$w4&x zQQQ!SRdu0bvB${7F`0*Drld|hIdY|G-*2;(&oLT6L82d*I(|zldd4PO3Eu8OtC|=+ zgg_A+Ez)q7TITbrI!-%oite>dv^wj_PZF2Mfzf0YKh<}b=CWVmP5TO*?|Fl=tWWxH zChYlz02@^}8zLt`j%aY>@t?guomiN9`FoBX1w#V7&UKzul?k46W++z$D;x+7RZn*h zl@mHLJz6z5`@9plb}?C?m~k0C%e9LyuCd^GMLktBZ|i$ar&^nLbNXuHo1$~^_0Q;D zue88jnuHP|$IPRf3Htoju>wp29cjGB@v0s?#}4KoNwSJYFQUeH-Dveprt|Uy(d*Ug zlj}qc-b^2e)^{XtI9EbEcWb0}9V9lF#Qx1Kg11^A#Bh6Vang5P5OtIsLoJk)*=bvl z^HBep_)p%wO>2uAKzaTxGfB^0*7>-^2~?^%d|u3X^TRa%r~0Kn7oU<+s-2?gw% zxK^qSyHcBgku5)dimyvc@?$7(K7sjq?=-s!N;`9o#TLf|Vd6jH9wVFZs*nulxA?SF zS~74NK{{!#xG-1l&H>^3(bu}(wG=oOar(qzv||5c7RU2g*6i(kCvObE+GMcLipf{G zv%)`1YTVT5O5y{@V+N#n;{mMC;xrlIS+Kp(nohHK9pTWyWoR9~w5@W3?uT?%jTo4n0^E7YKq)5?;stIF7BL7%etU=u=J$VF|ia!*9G+(4@q{ zXF(XxBTdpuOZ_I9j|?3#gvtponr1FS3PJ4c)bjG-spVXR${_-0f5vVmTf2%03Czpa z$+=&Z2o)rR9?uu#ZbRzbx35*-83-SPK3i-kOR1tfc$xg=oiK=AofVo}fFzWVN5ghC zk3c8of@uQ%&L7qJ?KRq`H$SI7BoFCxkDc=pXul~|uAg=52Cx6w<@**b$ow+kMcp7?9a41O6W7?i z>~q%cx<7VCVY2Mn+yy{SlwI?=7E&OE&Yqv$711hotZ|eb2Ln$OYGp+9sHW#*#*me- zj{#UP{D3PAnC3IO#U&BFUd5-bpqU2JK0^)JHiC1x~87B0m`(IzT zg`0XaM~z(TIlp6A&hF`|-}9l%OFJ)Jvd?&^zbV^ZRen<&!lbriz|=(mv61nzjULf2 zZ#)#~?2-gw!(b|0SYqSihX2zIfT4Q%5*`q~@-}}3Y>QFf;;$cJIeOkBj{Eyx6?LbS zzfXhMO$6qV<@jWK9BWm8{+`!GJAHC7j6TRbEeIR392U3D-pX+qBn$;j*0Szr! 
zE;h4?)zk^9j`0eDsE+BP>pqrMGZEl504h!Sk9@33IYz@pkj0G2+4CtH)6a{-trF%v z(9LgFWKCNOpMlaM0O*dMVPFfUi`?x%*CExnfGrwShb4mScpq=4_^mx-V zxa>oJ&xXxJ2JNK+pP)s>@1EErQ40X$nDIS(U(j>W+kL5WwrNv|NdzLyji~dtRk!&UJck}OFC^#!7wL@=>eb@Q%2F$ z3bYxX?WfR>T}Q1{3$Q&aKsG}YDJ%EEcF z$zegiGxR0$TMp&j4R#k zUv9)HcBkwWUv|bJ={&FQxIBmlPGBA10n1CzvE*9=?h3@k#u8rB{enjZ?|pxM19~Gt zU8H}9A)p-rJ#x^5aI9}v0}uZmU&;$0HVC`^I}~%(b1AXJ34z`+g}r~ zTFvp#0M**l`kc)dHkt2m?&4>|RP*Ev2pxdU`Q8ri(NQFFO?EPOeOqk+LtUR(u#OTs zhO+fzPjwzFxY|gWW1!$*Y6?9RHcg*5+9R)9nFiHx0)qqaFXP9)Uv47`aU+u+a#r{m zrrr{3l+(fQ`|V$W01l(BI&JMKXVqMY80rwC1(Y(|A;W$Ao0J?gPuhIFe<;Zq^Fxv} zxX1E!NevYKLgOl<_M;*lNd_`c(dXf0gNwKH`BDuc0-0l<%Jbt!xWuMH>;!*%lsVn+9eM`K()B6O>$DWz|uA7m3E$^ zDylAB8~53NQhT44zbvG7+t2&jQSDULOxTqDP71=uWxms42y$dVu{+h6pE_OPPwH$e zZqTM_;-xNaPv?+!+O|I#fmAH(C_ExbN1Bx7vIz-LwTtlB3u57Il2ytjOeBy(0c1RO z1i!QUmlih4Z8513L`U&iK!cc~G$Fs}?!qOD*MR%C8U*y8435n{P0xl4komCM9#ov` zY2`PfSU9~0!Uq=o@?I8MYlkZp6m1g8KM5Qlug(Ti+`BRLlFh>`@D-~fl+H1mn+h1b z*(&l#G-x2p0grrtCFkfCxRL;6S)aHaQmqGe!-m`Oo0-KiaGtxq(KIkuptA!5E!q8= ze~YdV7#V(&6=^0*``|(5Jctu^cfTihA{BHNK5F^e+XGQJw>@AVu0Gcb-v$DiYQ#s@ zM;A}>*~362=4&3C{fZese~~UC?2z>Foc)HH(2?8n&cRBH)B^V*%@j>qfye^m#MN=-6msNrX z-=&ICE`S@@t7TR>R(&uzi|=p>U9X%uA5yxUnlIP#>Yn$(nQFe;Ex&H6mj!w3R}nsN zPdj$@@XIC!Vc}^YX08NTytfWQsNoOU@mw87h^nk69>etXEXVEDbCf!d+jwL&Q&+ck zJnG4ZgfYNsASaw@pIIQdw(>*A zxxMOSXAk9w80SE;eNv(E!!d}K=(74aPkPAhNe4h0zMj=B`>^>8KYrw*e@(L*5H+Xw zdSvI~G$AE>E5Cz7&E=txmoGBO-?tJ@Mpp(8Sg<~kcEtd}XZMReZU&iinsU7vnFL=B z*G8sek;C}S9~lkKIm|QW& z51Hoqu7ugQEI@{f$4DO8n0N8B6I&Uz=zEDz?3^Ve&SvI1774+%>(#ldvZ7neQbk%z zdkm%qn56livP)qLWfyyDUAz~({2usZ`KL*IreisIf2envOGT>Cwz49#N=K)_dLPXS z$g6a~MCwjQGqx7M)hT?Z=YM-mDi6N*u0X1pGD4{0t9{qMimZ+8lF|(q4mz%x4$MM%t3cQ?ATZdysql+E377!bJ-IdIQ6NYpCgeI);Z8XM z{rbLMdGl{Fj&jlO?byvE<*o`k2acJoJckT^i?bAm^EQ74u_MI{KAbzzJgmnv-2o)^ z1`Qqv5uB!4hwVMVL0_qG4lYgya|R-p`61gtX)T|)dL-^>{Wm5;ZCQ8jTx)2r5%N_RBqo5F$L4#%N29Wc$=*>7G zr%bsCy*T_jA(DU~@s5U%F?Ev1RH&j1tl9h86pydl;1MJVKY+KL{x*De-G5dodQjgR 
zC0CM&{jdv=L&Pe5GH(F`Vr#s7#4E2FzBG$>CNG~Xs~T|(lkyA`u6np|ZJyR}b)*dr z&U!$E6%)q~>A7SXkQ`mzpdkUQtSl?|T9w_KanGw%VcZ+>-t%2Y7=Lo{JFwbspr9kg zu($EyY#;~xEREwYMd3tEq95ht*9{tx1shSAHnXBdOc$hNpYI+9PyhD$9&R*yJbKpe z14dN(Bgo8{zMXteE9h+`wlsSQabt8XeVx;BOA#A$wPg>pR`ohxSS>zqP6KV{2g}S+ z*4RP^uCJ-VRNL(Ln(-I&-=t>m6pU#EJsYI3&rY4WexZTNQ`F*Vn@z*$aXFP)HbIc) zt!@Opoe1Jk6V<2rob&NI9VypDMl;Z7fn{0lJw%Nv1ZPY&id^+L$(^QQa$et{n`j_$;Ugb9CBZb~%3ZtYgXH&_yTe`1h3b#U5bA~*XiX6${K47U6GoZnf1?JRQI5FZ9a zhUqC~n)^NHA6k44p6lY32gV}Sr}ApMikRY%LT8aYK3ctVL#uzaC7*vfoBuiAWxFQv z2_PMQW4Dm|iHU{H>)Hw#TT+g=&hI=uDyT^zjnvZ#@3x>tr`TASG-1eTGgNFWRVl7V z&>1+8Fd5|SyVMzU>UTt(!KWWv<~b0_yzqG0+L%4n;`|D=vNd^^rd}<7W&BX&(!gy; zmCvVbS$gMDR66b!-8nMlsr!1wQQNJo?MenMzb#Us83K|i78MIoo$KBoZNwnUYCgxc z-Og4x&s)C$;92AN0(cX)LEpiKl*m_rNvYhK?Y37Z$XMo4bw-))Cf)d-O@cvhhn7O7 zOY&Q{%MFGn-ie|0SvDsJd%24>ad>5r>;WPr<@I2$)UJ8btJ;`?A&hXl8a~DlMqv0e6x%202Qs1_&=6jqS z_CKP_ldz|IWE$_)INmCcD`<*()_VJn!PzKDqbGCRhDr(U5i+R^<^d+YlIMB9nH!W* z%8eG?Yw`>lh3?83XQ9bSk^9tG2&z?;dvv6}va&?F7(0ro=F9`HGb17X+%!_;X@W}i z?w`goeZCXQtYozbKfCUW4%>VYyYMfN_Bm~Cb$$w~dh8`uyi#fLab&hU3?`G$CZOpB z64si#4Yf7f+KvPxxh8l!CL}(e-(=_9Wi}XHFo5rIunW`at4n{U^EaU9P5>cx(*=<) zM_9c&-e-E+PIXzS{eTrptBti8XB59Y-}aX3lGh+j80`SMMFb|jvxo>AhRv+LL2>#? 
zt%+pgV?B4~M{xsg@)Zv^9l~nFp=Jq$JIFEO!kf(?ZE_H@ZQw%Cm-zC`)!;vmtzoS1 zGAGda_pPCf)CSM@YBde!1>>JkUq3ZnKo4PyS$51z4&LnvE|2}8h^*o*?dNm9xjWNgx(8-A}%42*5uC?02o8lu;D)s4*eg zU&WEP7dCyxD%hCtJBy$XQRu zqh3T-X0>LaS~?6~|Awti7YuYb?RxkO^T8IbL5D{;T`{f0271?lWIt*P!~}<)p#Ig* z1L(3~gisJcHI%5GF%jlA^q63&{fo>G&#SjC)45K5MbjNu-lg^+N|G%b*a2eWUV;YV zCSGp)wQ&xccRSvFH-;E3yfG`fQ@>`>**wDhP@mYtdSu{u%Wh_M;9Q@x2;0)c4vcRQ z37;ft(#bS|WxNm+ZDkB2d~>iI+{qyT3|1KJaLR(^B#FS)yH|)^5Hk4lQTBIKCk*?g zUqgYQV!aSRp=t4xNnc${DER;sPOmN}ww{Ql+`v16Mb5P?NNu?9jb{=KL%7Yt3-WX) z)55)L(?**!hmAHJ#VGwwZbzpR-sDhpNC z;Kp8>DS6bMMYC1&bqFLjwh`Jny6-)FVynikW2D(ALBczp*O}>nXxGagn6JO0!Bu&; zxW{uz4ZrnGD#P92yhW6-ES|s^*jy|#?IfvFHIP2Vk{IYTwmBD< zEDvl!9C4o5qxch&n&!za62GTaev)!mL|P%jgJn+1r+U`?Qd;{}Rmu1zBbmGWlgiV9 zUIN*f ze=`4V?XZ5}Ls06DCq?m<`e>c(5na90M>f&aCmYX@gZ#aQq5AKD)W|dl7||Y}-bV3% zD<(>HFAC#LwT|3mtC_U%bVX!K)P(XNM50j$o!!a}C*HU_HuarGbsm*b&B(i)qi);y zvT@p=rdA&1Z_}zz9#4+d+Lx9*I>p=G!H01rNnf1Qx=LD(RF*D1rFr%8`hJSVBO1lG zv+c1_&bqh{>sYAGdT<}$N<4e^qAO@?WfN~P`m=;WDSM;6j1g1noU zaETCT6~?BjOvbk$jo1uq!J}=R zxfaYxsmU1b7f{4Jo=Ec;WK{1TZy>fN0)LpY%7{KEe06_OGnS>&&o6lC znPak)s9#SGctps*@cW6~LQ#h(dKMte4W7v0A|w_HiWwP)a?c%>)-E&U^|o>K2U45G zxl;yto$f1=E2qHKdUL`gokgs}D^~JsS25#8Lc6vlsSdYR()uzxU7fNXbmJ0nkYyT) z>cVEbB8Xa7jc#&@1u#p*Iz7K5_ljPQV;@5A4?t>8zXxVl;AtpYc*J^$tHIXli||mHbn3;k&w#J+uMgiOEq_+ zJ5qFqPwe=)(`zo-H**kHVdBv~c_Z(9s>Wg{aTt7v<&-%);|H;Em|n#x4qVtgZS?T< zjZYbNd_4*0I)>YwigVd#MFbo>pHe<9_ywqVv&sm5n(sMfg(}ULqH`V6e%QH>h$JnV zom8Cg3ei+%eRG@~UbjZCqAt;RO_j|L4=R3$aP`1o{FGkDmg@hOQL#gyfrJRh>nWC0 zyq@c1uRS6)PUPRa^!q_Y;iD*{Je3^f^f8f=!&q~{-M7?y`}_`6Ov<}iA#8d-NZql6 zmogGxqMgLV^tGj{q71BpGg_#g`W;C1>OF$UfNYWR+N)}Xm?POlK3F=LjB0-kpFPd>zpxI1%q&NK_PdKeN z(PFrm)d??+|M*6crcXyC`^sQ_C97?Zz_`*41wyB#l6t3$LwTlDb~2Ox)UTUlG?HW= zIoi8En-zR;SNQxiP8xiJl+%wWM*nH`rU_qOui^Y6Sm-^L>JiH$3#&*R)x7XyF@CqBcw6Wo#fG>6 zZD>K5fa^~RSjR@&`(_W1hqU+H4tvgVDDox3Hv3>U<8cg#>2Px|{Upq`OnPQ$qUO_&o3T zo%fvI^9N?n+Uu&dcFBqUvM-ydRPU7g_3_I$Z=1EQU%6dhQ6XMStKxwjC@#4|_D;WxZJL 
zms8bz$XBFlEXCBxfk`96#{jj*`7A+snQ%iB8HQGwg4LYftkVuUJ(g(;66Ww0G4|ga zKkTU1_w$==S)lX!pdH&4y2%;A3PY~z4nAU*v1OebNts&o*9_qo+A9|U)>hm-cV_LB)OaXJ-B zsEGj;vQ&&K!-QNGaiT5dk2@_U~jbt__=_TDL%atRsH5HD93JC^c; zwB4n^GNIl66=Z04zlYbteyZNo!`(Tn}5K1Y$TF{A6c3=afbwP#CNvGqeV1q zc6F4!^ul_BMMuPBovybc<$rA z5DDOn$UWdG@54|9b>xcja^R9>#Q(Q!fJPsOacbR|Y#+U%&KSb3&IZP~3}->%9salx zPqUjDNN>nE@3_n6?y6P5012vhm~0~B5Q@T}iVMfwZ#a64Te#Hj4!$w?`{Q%r9~1tXg+i59ar)t)t!G6BT!C)aWsnwA@@5w=Pgj{_(#SGt zc{EUl`m$f1QDXUu=T$dxx99HGU}KAsFIy(KoiDK0U zr4kJi0e5j6f}xL?FPN^rPIeb(#vk3qx9FatPH3X65l6Vzmz_d98dfnn<(K8zafxC% zgsKMt zhp5feC`$uecq4&27?VPrOSdn;KoE_!AR*3-nn*a&x-# zgx2%yy&RUuH4bq|PgylB>L34~c5;qZc=dT+LSL*B`6@C5#keNUoeNWVXKsWxrv|1s zc3{(SK1dPfzEYoV5s;8rb8h|a74BqJ+bcp%<6xnJcgtD-e!Yynbp~fBP5ofuk35pM z-g^=L3MC(0E_0(3?QgZE&T*ou)pF_s$;Vbnx(eJ;6W*sDM=7C8N?U9~N_0_jn_yP& zWZP`$0m$x~m?VuWfGtF?6%&b8$K@%8Ls#%>K095~<=_lP;K%vMk>qJa8)Q(qI~3$x z!j#K#b(;NtH|0IXt8!$EYK`@=*{BUGG96*MZzWh!_bJ`@ha*ArKqV#LzTXQJ@+twk zqnHr;4r=N3KLzWrjM+c}yW$PCA4_sG_mV={ik@*81%Q!(Hp6)!Zcf<3z-delqe~C^ zubT}c^hFp1Z*w+16jw?yw8;GUyv^$}5lJnRkuu*8CisofY0)?u8Q^Fq>ArXH` zyRDYV+V>I1wtBafIR}lIQi?Dkx%0+^O6Mj9Qc)ptp``g=)4MTMmBgvkC>(zEsaejv zSF+qKtG-0k`)MLxS9~l5Z|t;Hve7P_d~vW`{Nbj7i45o5An8OU>f1qcFuq%mrRtDE zhai;`R$-psWR7ahw$t&$h9#6_g^NcfJRuW?o;mltMvI)>M(5tiN+TBdZzThN7jCEr z=yo3QU%soB{`08&p}tyzZIYpgJ8YnqUqpN!$5ChZ%eHS_a_f-W$HRtA?r|bFkbu(< zaaY;C;YGz_w}?;VjvpUwiNg-H&xyEYtQ+c?WT$F%)|BxunN?jd!7LKtZ#_lD#3E`3 z^bmXud*k3K$keGe|Femv&@CKF^X6%Sf~^ zd*Fsg>vWUwC=&JSePg!kTw2HbL4h89F%Yv_sgm0O;}N_!@$<4{r8A)dHQmD@0^t>$ zz-Dc!MxSkye^!O`JS~NWUw4K9b?8q@Xh;jLKL3=)!Nm@}$hUEV;cfQioVQe=2MX!! 
zif@%Gtc&Zs?9RIJX}`41X$LE8+K~u;$^Yng9Vw?Fy*cRhx(wsiA66=cSO+(Ef2$VU z9q{?nwQpy=f(yT2bI<9!I}fYRDP+4|d>>~7Q+cny;%|4)b+vB@mA{8Ekc6A1nIX*k zk^-CC^klE@ul56Hm1NOx0kQR=->GLRO(?!c#HpamND|``a=1k(^aWa@KX1$(^`4@2|wt=PIg=z1yZG3_?ZY;xsERg-*pq2w?n;;K$0+ zEY(xnyowuexLadW5@3{SXZAuOW#q~JXQ$wXFq|FVrfYY{b z>@bc~6p48TyTd;L?59iCuVk`#XvKIc!nc6WDm@2Q1da-z`cBRgot?EUtmhRt6vohZ zHI}`?T^LZ3UdaZrcNO~17+|!srWWpi()UV_|3m-(lF?UiCa3}o&`fuRkP+(}2D!~+ zgB&q%Wn}($?(#81PQ8}%E_+f^A)>{F2z47?G2pr~UZ_5t#CHLK+F(=aOSkjrC=R%d z1Vwv~=kD<8kE;Fl785#c0N5Z+tKJ(6APTG&TRz@o;Cp!h`+}Xb+348>IpG_|<+ov~%am|FW9a98)`=R7rQ<2r zno~|C3S`#zmpBV+(LVJ55g;0TtnZ~Z)qJWf7!b=lV@H1qU#(rlD22)L51rM+vZ{cx613l zcq@0NXe9g_#oAT)%p+W^3iy~m;mA8VU{&QdDO-O)P0Mw=(XzWuNlwd$ori6P+nkjj=Y5 zkzf+K3PT{|$l@r|s^OX~habC_S+inU>cMclP0N$$P77mF6wTDpV%$LdRl$Fk+v4id z_3XrR>R{)Mavo!%9lBpj9h1V1gZIHH|K(fZO9TDDEi0N=`_$2UXLgDjpx%b?`Kl(4 zt&x}ouxr-OEFK?8irg0M8D>lAn=ezH$833`xuYtEFO)eTMQm)2asnmlBFr&;6#Mno zx{#o&a**S_n8jvZ23y7V-LJqqB)Tt4RxM+u9yDt#B#y5GY^EKSVdhbk09p?d%dE$H zsP)v{t%`9;)V=^+aihrN1B!;k!BT?lnd+m+^g08rh}*?7@4pln4RTW1DbE{kyq3F> zHUS=uk?|P~kjoU7bDCJ$)zzB^*vK(R$hV-6R^vrqT_8`RohK^!5{}6(WAbR8_Y&|X zqR~iNP%aL6Tzeh^<&GnoO>o=KNyrca~sF#iFYzwsY8G!%}dKch&@Y zgT$+L?>pZbY+@UeBFp)I+G)OInN9oyxb1G0fJ1v zSLE9>%l=L{SDQT8&aHZEwR8!zv0&FBZnUqCFf)GUJta3Q@syvvTCeuqSn}e`?GU|* zSJq`ebs!qRU99-yspg;Q?wJR&LvBKg@V?w=-c{M9;2~a16u|=MF%{Nhcs6M48 zB@Xo8gP0S4K^$Pl!YemxCzLZ_`elq$D3@SQn5kW~ivUKwrb_Zod8*x8TAqU*5XgwL zGsLI(oDK;%K8;r5B6P>KkGPgG;AJ>y-!gAdN3$lXeYEPEJE}7*en9>FHh5UWx_bbj zB*&D2GpO`dLrN$VkEes-2<_q&50fy%~Lo_nO8FSI}iHRpa%---K zs0a$Ek6r30=FJ+NLl#sEwV6M8yJ}z)7YYAtvQzddwjzM$1q&E^U>?m)v8aAxV=vH7 za2%TgJ3IGSD{5`NHtt0&7KyOj8b3l_zBIcF9!vhz>Ks*1@?t&OZV^+J(uX(run!eC zrk#BKh9(rzQEcK}2sL~D&tI2y9+O(^@<=vsj@5h@I2nLP4b-?j&V$rrQfd4>)Je0p zZ*9I)m)PT$3d-{`3xsfzfYDxmRibNC2n(F(D98Ey_C+wF$;`ND_m0|C&=H`9LN(tH z&o3lU?SHe&+Z(wE3E~_i@z?A{U%?_*k0cz4CH(3w-)NA&xoO*eWa9lA!E&WcVc9Zhur@99jc=As3Q4JbK>n-wI_ z)X#ab_XYM~`9iXR9EQi;j4SlwQ=H7*3$j{31q#@KSF~qQAW{#LaW}b@>+7yvJDc>` 
zTY1>g#+VZyRFuMU5Y~kI45sw>jn>p5fmR|xdemHcxbVbPzmNL^1&v2xlUI+f4Ys(A z`&p!X1~7z$x$g(D!9DeFKNRvjB99=Z%0=G3EUm5mHFF1L&G5jDy2qKqZ$ULHRhuR z&q#MjL4m@jL}r+9vypG!ru8Y$rCptkWoW?EC!1%3P)e_fT&D-gG&{dGO?uuzgK9>i zZ9c=#Wc%qajKfbeK_qOePdf9ZcU^v6Nny=7D04DmS|oe04knM`w(ZqV<>j)zS(1tE47|NR#f0&+A9XFYg@kZe(nXpCm3GI> zact(@V@mk|gixZ@-Ih%qU5gR(nNJWVpd{u6}+e{&2?RUy)XG9VDDcUf|m2oA? zWAolu@It)1shf-+N)lTgg4KgZLW<6dLCctRaG8R_XR{(g0H{%pKdO(GX-&^Tku7hrOTzYX8fHF1>Y;z-mY__%MHa5;?mBz01aPPx^dl zA{mM*%x5Je#wnOz_Ge_9WH#9DJIZ~+!1%8M;*J-~omGh{{ zkm~|wc>w&{ZX;v*VjVna1(PA@n#LmsB3dHy^5wnKD<5RKvg3N)kdLz3AJ^-|W6y31 zJ_AegDFzXtO=q@~kEa!4Y;~-m@vJBO*0u;dar0Y3gk*HeCom|7-1DDbkmGaOz8*&A zr&mV)eNBKUB-nWA-oF%*ZIbX4vX~3W!iOoXE$h@ueYVCQNPQe%}eZ!_8t3Z<=nzuv5vc? z(r?K%?|{$4eDD*G(WOJtoUa>lo`oc&6!EEA*ADoT?HXru>7CU>j7w0nwEAi7&*?(? zt(v`5LPf3ZG@K2FziR2S{O<#6v9j(Xwg53eF`X~XJ$xpq$B|#j_nU8!=%ATtEYV2+ zuq9f!WP~AIw`2*$d7R^uxuLo0He&Czg$K^!DF=EuBJr>FXHedKjzTE>5S3FO^(U9g zW8oAw%E|V+BB@ugeSxyqf>Gj7^!_qSoEr9{c0hb^h<`hH&zI@zb9lrguC%*k$6;ea zIsK~(0TIlFu0ZrcbTcad=i=qp5j}toIYX~c3us2?6|u~YHf|5+bFiJQt_@t}a?jT` zXb6Oi(d4syq!!@l;U0ObEc1>tA^p%?Lc5hT*hM8~V#0^lk-ES-)tC97>u9fFZmQ)! 
z7k}PGT>=!CisKdM3_0mXX8jw!H}}rSlSx{6!m2n+)z>WLcQh?o_&DfS9?pzMb+9NM zFK5PXZFW-Dh?nNXT_=7oMEq23xeB-EM6=$NTlKz6Ku0P=n+c)Wxw?MT4OF&J5*3MO z1aSM!`J5>v(Kmc$1StXP3*?z<7M7EyRQPP`s9w72z=cO2 zhlhjvBo&{PhnoXt-WTQ19qr|o$p2DVKzcy%y||GKRh7cH3_zEryCA9YRWB3zKArKe z_E!-({?Pi5FN>LF6B_DFygZZ8A$^uVm0p&RSScj$9b|11sk5^hdcG}_yI8@8^2=HD zB5f?1KB2nPYy{G7y`17ib%8PIv~1aIF#%c7ou!5DPMgxjof_G(URj(mj!mu&c0jC$BosLHe zPOCtcOC*QBqqflGREh~g|8l_=G`|l$u^2B)qBX>3U2-q4^dvk{^=CLZY8|TWLnC8$ zJ6?&>D;{EWyWOhXqmKPa@l2uLr3{UPdcfyRC3Xxpnec|mM>~PtxQDqy@|PjJM1H#w z>}(zZKowmtq2rX-U0Ss=L#@*BT*D;bE!>{yJqmiK6K>1`0WVsG_)@NQH=gX70v%xh zt%l2heBN>*5Ih2B$V?~ zFJSkW2|T$hagTywd{IVGs#0=E@T>99`kp+kLb@g@Q$&&HyApBR zH|L-Gf{zTNyI?gk@&LQ;%_7rLjP!wSNnI z&&RY=*-F08^UnDHsf}EOJ|d3>>>(7du@SJV?R!D;4wV?kL4%K&u8-V?J`Uz+W?+#+ zv>BV`Xobx7d}K)-*q|bvWDHSaqtdzP<>JwX(yR)^CSSIh$ONuwSuY1C-Zo7`!sVX@ zPwI-h)EC>v`%|tQA6M1Yt40Ovl*I;;4s_j{FE*~a0^+FikVoH>j-bCZj;q;C#1v@G6434H2 z1^;2-x?)dHW9PeO_Tbc^PNh=>pW_n(Ytu1`4@)9FhHLPdC@?YHzrV(rjA>S7--cpSN|5GtYFhpg&Fjk&MCX>wCzJ=6; zpD+w<+BEGrfrhj($guyx{OoImz%)Zm>naaPh&OUAUf@_|oGaTf+P;>rS1R6GizFm?#e7vg$Tx+V@;6MVu7 zm(|FcDp9I|(>oqJQjqXK(N`?x%18jeDaS}G7+%gTom&T?y=>|aW% z7L3RhFgOu{W<|^qczuVj(8gO1@e>~P2Enbk;?7iG-8w*dP1DG2#lo3S&hM~#ynkP* z(CZtgimeuuMULa<*3<7~7w7JWijH4i8}wQi1`ZxphP7Ma2OA3W>vtU2#b@#JOez7O zp~!5D=ozOe8*9p-k#yw2;>-uTUsR;@jK&o{DW{PPBL3jWeKt2fO4{Xu1So|*KZGG% zmCc>$vVbX#n}nVmPk3;O*#ddCzV68v#YMthL}GR{XbRRC6Zgk=zUf2Q zlrH05Ke3C&sbyUYfX#{ed#e4~Pru#ZoVKkNCNRRL`;p^mg@Hjrjan&wS@J}C^fEL! 
zD$tTRUUQ<=5&zsqMK?*_6^y%u&GL?rR1Gp!b_2PRW=MMQ-43@S-8u$b#t*jJB%;S^ zQQ?HjaRf61<>pd(6I(GNN~i25VU;;IU&j`*hslfE|H$oi7A#xw%G`;(@38LawZ9xa z+!L3^B&2v7>JW|l6OabHN~P=#hIQ>B(WPwUq+FN#nxRVRocc=;io^0%mrSqMRmeZr zQzFe<8N}nCY9rx$l#@m*3&JRS(~v99N+KS1yYoGl=yF-jBSLwKKrtvw5RI+Z-mX;1oMgcDEe;9=X&a)HzsHp1pz7f@T(3l(r*kK|I`q zK3Hrh7LG2@2-qFK07!zf9JzpeKhPkQ!CbVjveCRrXpVvbAh2RdH3Ve`3qV1k{*Ax5sr3AR1*dn~=Hh_Ga;qY~`C47kNUp;Ak&)a%gDQa|}xDcY8!~ zBFRgmXoD_{)YeVN#akoRlq45x&17;V%ZJEV$e>-|Fg#QM&gdk z@z$iEgJx8}NsJ?P*j(Xh2RNsQ5zrdoa~Ej^Qw`l|;T`~5kusq6HtkD5Jh;QF+&}5a zpbJHkL03*66N<$t?xrn;zTAWGdGfrYUy5KlsRCvKqaScMyEr%xy@X`T`1n{4(Q53k z{+8B(u+s}|z&-7JHQ@hW(i{o!vH3e<_TRRrr;iVk^tz^@RYkwHyEv7bLja#s1`o5g zQHYv6vF;yLH1A-0E#{SBAJBZ5H!#YE4x~)tiHw|NNWYoTvc~eY>Gq#|`nGu*eC6(c zMbsvjaa%bv%~1?6VeQX-$-FJH_!*51KSNlVf!vz*`^vzaXp_)XZIpxI6hAqY!7%yt z%d)tn-h16;rKLm>THMfDIQ|G|;mZik@d#f0pj`g03+ zIE^scLf$G*_i}_n22=3Z=i8e`8)aq@+wYD;j68o;tWHo8Pjpxqwv^;26`Gs2AkxfN zC#{qZ6*?d4-}~MdO4?WY|6wMFI@=k_ET#k?02vjKpLe94H^=6YO@ z^%Bv05@c?vIl>4M(9coX6ArQ1h@H7Gc5w{lrv*m*H*vshHuB6yiVn>|~KeCwK6Iq|Nr;9Cc-4ZEk`;ru1? 
zTN2wxTG2t5rPdA?73@DLgvH?yV`gmz32-D!cxuMPF}>#g5C{AS2)D?%H2XlMni?T@ zetO0*VDN>UYqOuiQYFK(IhnWdm_I)>V%t8hdY6~$^Nb*qNO!DCtEch=C1Sv7LK%-9 z1#dsn%qzztP2SSO=ur=H%(sxQvgTZqHnV*%RKLQ373>nU|Aed6*P9Bs1S-0 zp%HWXVA2nG0Y&1uuoHrAIJ~9|L!o{C2?HinorrJpjbp)}n%HaWXaORMjX^Lm`b&ln z$-^IBDld+x5$B;v+}~V#TTXloIsJ;>x4d|L-y<DuebAM){@tji{9=|p)y zYqv6qM$-9Nz7P9Dz4DpX`^a9x-Zn$orAD zhQvxf`JxXD41+r!;F$Y8lj&vAC`n$CuToW&>9gI;DNm=>N|uw3JV|-MwtBk@9M(&y zsd#SjJm&KOS^Db{Js-boH-7?*&5Hy$d~{=z*kB1Po=C=f|6mr5?vgY#G`SS zMNA<3O)Oq*q4wW<97%{Z@R<3i({Xv_U){Kmuj9}lOtZTi71#7njLS}0o#@4X*;jw8 zLsV9K7h5$Jt1IW=Yj7!a6qOzD!xz~c^mVx3tK>9PRC;|KhxIUkQ3xG3mI2xW_ z7uRrf1Eeai>s_L!7HeNn-TFPHDd!ZSLy!=5srHvWlY;!RW*`?TfJ&Y!KZUhb_hJ_U zqt;iNe4@h-Q#z4=`TK^G3RX?!Xyx6+Urc{(OF|@r&PV!A;nxUoAqj!TySJ68P)|6^( z;2AT}e!Kf!W(}@s99hIRig0KzyN3D&;G>8m`fD`MIPE#A+E;F5nuk!_?b9;P{oc6@ zYCRyM#=bwCs!6`$myhM{$eqNA#EWSk`jUn4k{s$USqYv*U9WJV)ej{vFzf%<^*2k2 ztMM1j(Mhc5xVRVD5psHAf@p20H>1vrXJaL4C)tuwx*&`9X$#|W=C&~p|2Ntvc|oB(`^Mfe3aTDGG_0O);B3TFFjZyP^f ziNkmXbvH(Y9r(f7%S4zsnE-V7wT)n?vU_;rvl|xhnN}sp1YE+FkS2EGvbJW=kJ?|m z`Ky;Xo>LyPF-!+O51lpi1%sAFYcoC$-+oAk*6<&R@)f7di-Jb>lG43)W+iY`5+$Zu z760W=o04TC-#Q)Z?b*s3iicz-j-JSecg^w=f?oG|G0P+ZU-B^mmAntPGYSLtpRT#U6-`kR=qgA`_}Z1+Lz zH!9k06wI!d@Dh2=M){ygzIY?2xW%VqpqXWBBdd%&3Ja8L5o{QTx$|(%Tmn_x1E({W zUgwP_U!7WuCbbHxjW5wbqvpGhXb07%DQG=Wc#C6O{iU#F0!j2{h4`J*aW&rOQ97!H zJ)#A+fTZ8oDE~Q!Y1}!GL#VSeYAlij9v_y)tGkg80tpX$`*}BCqmT`o!EDY(dI{N$ zOYV%1lmlnT-BXG0hzq_Kw_>O=-b%Kt))^XH^cuQ+g>%%sw8(aX9yVGoBB(riJe@Y8 z41}9f+=%`U}W0EP0z^`}YKq~oawpY$5 zu*frm%5M_bCQJSD*LHCvG~zNLy2Uop~-g3QbcNmH7Eo7fLYJ$*=x!}y)|;7HhWHXNwK#@R{4eP3Uf zWP1|Oer>($TGe*t-84tqzKLA;z`z)I0O-y|p){R_?;ThHmuyBYUgHB|3Q>JuP-f?b zRxq?+zGF7)CmxP~1W}b9siL+3nfDCNmf;cp(ZA`lttiQ5j1ljeHS#Sk|0-ueJruVRM zJBNjcwO4^5=*yU?GsBX5litL~qy>bE;(=zcus|r2oCw=IgZqYz1hvV;V=daW}LR{$6&wJ7zyG;+-BNQX)nPp>~>=y*Kr` zz1YM0SbM)$I~yRcN7;Y8>?7iOEHKMExrbrh)nKC^Aj?~0j;5EWRDRfZGW%&SRYjTd zgTUuAr^`0GXl!AJuQVgn``@mg#f7M>Sy?K#=%Oy7PN`iIO5*Vro>FbH9#?cR7w@j$(K&v+>TS2ENbIEGtPHv-mT9(*r!sbkkPFLu5gVB1 
z&oZ_|C7FNR11KNLN?k~{r=8F?=~T0qf(@oJf%|cT z$x+CzHq3323rHhtY9N6W29-Tc3c~;=bCYmU0T1$Q-hCdQWpQFt96)Ke%I5zU_PluK z3yh1fVZZ&PA*~uM@v-!|8&-uHEQ}1NHsSus*2~Lk*)2=fS}g{iuo<_`WMwf5_+zXE z_q0W0ak^XnaVGC$`O<~T^|^MAVB#E~r%^fId11*^o5+_n?I-6OK{adc;8L0a9E1!| zRW}g(P7Xu`A7T)1=~CIPe#vGOmqcFP+k^D61s$8&crq3O=}UwM5tzI0$b}yYx^Ku? z=&*jf-JCOhWJ#r}38ZJ*c;>2^z$z-{WRI=$YM2S}m!awz1H&+GGySs0Un(lN;975? z?Y9xK+LKZ9#5C~wt!K%J59*UZ5{k4rw5kFJl(&HeXR01Y5F-X9Qo7g}0fmAfea;Vckk85j|_hY7f+fEe4D))>C zaQ8h>&EvC%_`ReKx-*8aBvD-{z1w*bcr~6`FmDvk(RON?{g+(Zt2{SPdAwI2`&B{p zZ&t&_#%xhSkmLb##NHT=(^^OoNUVO8fXG*#Q*{&)Bsf@XhQOU|G@V(iT9}cNF7UhN zwM5~c(GeoS)ILkdoY2TGxlkRe8EB_8Q%dF=Q9A!u3Bq*)DlV|DA5M56Zzg~Y0LSif z^Y^N1{?D3B=d|&edwr?Nu0ZBhte?2nj7-e7=8xs1ceDCU6io%`{x>0r?vZCXcJIq& zKxVKcE+G$tl+5mD1honw>vWYT9LgITEu$t!XAutS* zwkc{lSpk?@5(?{Y=KK!nDPR~B#(;sFWe}^oFl3wVR0U)aRgXQLp(GM%4*9Yu}@=mO6*(_@3d8Bb!BJL zNBFJ%_VY={`N%y&JR17hCB67O%l3; z`ZR`A)~X;F?YCfUVUkK~70B6o`hMvh3wf|^BbGnvmf!b<*`anweSskpU1uMp za|k9a@IAz3v9+Ew5`!7(PAFtOB2x_YDkfROkxjVa#Mv^y7}S1BjAxl`^ytRi->zg0 zNtvxje*}-Yew#4HuDgmKBawwMx%ei)z24Quv?3ajY@1=E`;*J2;MQoZ@$0g>aS1&xcU`hNF>MKRlZQv*>^qIDv zonL#d=h9hL|3Ocpzc|ZKD0V1Off;1mmQDWw$Tmp|7rcz%fGd(xt_|;InEFo^KpB-@ zpVGfL4FycJEyW?Azr`tM(mzz73YF@W;waq3WGtcS4aH&aM)Q#5Y}4xRChgo-t?PP1 zkY^M~kwoX;mu83~{NXj=pYG`{z6VcIbxLP#Z&yN+IZN`NNx-r9*j#8(kLq zSRuV8p|d_FB?&w>bRI1~=J@8pDn!Y*VG1#hNkpINucSt7v=-D4;`^~wHAl598I#~3 zn{T8M8wwu15=?54V#8 zF-yslv0X0Ut;~_Wga+Hi0HUu-_LHXZ9;h-g$fs z>?FAHJ_{O7oh))?-A};ig;0A)g_6j*5yrephrBy+E|V`@rFG?&>QF482Pl0DJ%lDV zj8bkwfU&>A48mwFS!~yGUFy=AJr45*Rn%YslS7vXRtv@OvXq{Y)L|c`Ymoz#pR;-T zf&>CZsrGF^+W6eHRoyvT@AV_@w$Ty15&)&o30I@nd*2vb;EHR4!>2Ph`%{8uUe{rc zNauCYAC3^{DKe=+Z{Jb{d=fK;|pnlj|f@PfsTVNkMZjPeMiLa zoi+#kb~X?diIeq5A5q3PC=8nsW+R4@_VX5OT-10-(C{;aK4!13& z1695vG;3>Tef7FXzf`=1Z%}J-Fo6k2{-GX~U5mp=l=X(mEN(aY!;L@_9~P3+;LEch zs@mR}=a9*G5gp&BEAqP@r=cz_&W{MO$_+-wdAfePQw3TdgOSa$F&%)3&?dpN@UGf! 
z2n3>772;36>?NbT&ayFCPZ(8aKGqSXcMW&@_tdx0RbRA)PX(qBq_HqLV&JXXzf%Q=q+MbjNcKz zC*!OIffTa1ieqN_hck6MuA%{Xpx~g9{ac$eSuq&>xm=d5p_LioYY#M3=6>wMbe*LF$g({}$yRhnnXfhgv;J z7jzV`%hZ;nYTpHF{Z$n3@uI_Q>JSGOBcuy3l$2Zb8HkZ!gs2bHhsCGqO$y23C-xq} zd(n;@b53v|wUWdUTZ!F$DCIhauy{8x8#BL5`N8oe>$>^=5(3at1v_Zjsy#*AgTMLw z`Gd2?V)sNiXVj*+!#0&K|2fz@#q?MIqz|3qJff~+(Fi;gK{gSHc09l@rwxV9LNb8S zH(6i=ZYl_BWV$VSzh5BX^_JLw&R(F3bNXmr4;Xg3LDl^36k?YkbajO~DL!W5<8JatVd$U4 ze>591=6=`lRz1k2Ecv$KQPfV5nsV6nxHAvO{#Spz!GY2dJ@st&jeJ+SDv zV)|K$%1-3oF4ecZG#!Z^s6yC5z1D#idXEe+@${(b|A_`i zQgl)5_mc#<@FSUukfFavfS{i1KtpXo0z-?tH>}Z)+OB9&z_iN8y_`DEjQ=1FOQ%&T=7^5QgcKuf4`f#J{!AR6>KQ54p%G$Gf>OL6fJ}T5Fx63D93Zfu zzMSY7W}wYC&3o{sg!ypD!Xq1%fJjgrdhbfB;^QD)TrvF(SaB`8%k%!z((&F!16`WqAFY-P} zM1X-?j+#l^nV#L;uk9z0$;uP+1bE}xVmwt_Tg>I#@jGuvLqZkaNiDBXhmNwgm=CPf z4C`&M4zY|WoGr}LW$|JI3?ug8DGfTBPPnt(Eba7|nQQzpd#Oc9MZrt^-Eq+);V-V$A?{feH5hrzM z_ti%Uxq%=-IFL}AxKRA1yUIc&=~q|XtMx-vehl!Z(U=?1l;(^X)CL;vJhou2i*7nV z%)|HgoZd54eCm4(v#v|{7yR1gV@&_}Dl0Y5<+RI@4t=-jkzYCmlksv~BIcw1rmG&m zAA!E}!2tx4Uh`T7uifv_`W)L83m?}7RRjVf17PBlz#+P?e_o4Pfcl<#rJi!I<+j;P1-qKJH_9#*x&MlQ4YyN~^8$x}hRqzWKU8OK%qtHn>2 zWS`}!ZKGo(-jo%D@xN|mBi_|~JR_1%<3;q2Bgde+CC^Muegp&{??0y@E1gfmb}L^< z(Nii`J;*aQgIFXXRwG^&U-KY-a2{~Mngg?0JxAEHjj~$%OD)MHwaP70yA6BQ=`Kf4 zIx_EMc6s*zAYEgkprFM@Jv%e4UR6M&usTJrvO&;$JAlMHnDd=T7RW4PQeT*Ot#$m= z#_6Mw)LwS|gXHe4Kfv*@v-SmuCmq`Jy1o9TXQ=pvP&b2Mc6hL!5O7-~Qfm%=JHEKa zX?a23e*iIz7z>=r%$A|0Zy>+{iZM(VP_bNnZZAS^YN9YGdNaqy#%d(a)Hk3cu>Y8a z0-ICeCeJ_@Kb~k33=;Txs*?L3w}J!V6qW7n!_&!&M$ABARdHdb=v6kmMO4SchseIr zu%cX&ZPS3FU?aQd{Rl3_wY?~m)J{zyv^@5k^c)9o`<$Z4T}!t;9lsy&+B20eIz*QG zLtQq4rm}(7C(%-CxQia3Ai^k8Eswol+Xw6z*?at+kT869ptNqyGFebu2}0|Pw1TNGHBfW5^w0^ zy~M&%1Q`DLKSL29taT%uI9tO#%y|q)a>^J-GFK!mWjOa!o&^>Djj!MAZoe(Yfq&)x zW%T$+Jft(8VLJph{CQ;wdlD6Y$jS+@xCLwF$Sg$e+Zr$o6=1bM3^$A9u*4{`f)b!Y z(oZzI49r|%@($`FdQ+z3p0DhFNGV)`v#rpfK4IWxu^>hnNjub?e`LP`DF449`<)R* z)uPFXVMPQ?NfDq0`vbft_uoYT+lrB8Q3R!b!{Fpd0CEna-K4s&!3@A@^N8LrqJhZ{ 
zq31>qMw612-Fuz_I{qPn&1K7o)lRE|;71}X7!H@KziHOwV6rBG4%G$js`KZ{(L6u>&)y@4Bi6)uZxNDZ3H$9ULuiyVl5hc z0)k$I_sIerTm31iHRd-#4kiylZ02%iGO%y@i&KYpusS|j!&}vTmL_cdfm*0|`$Y6r zc(O%Dsp-!c4>CL|d1;&5O;Ev*Kv(%@@IXpt{clTYd;#L_F^&ciLLcfgGhh(}^^GKG zO6Q;Pr1vO?jVFSWEo=kSUu|YlmUnm?=@m!O{Oty7v$t90~`ivI94C7r!A-d%_| zUN%EQu--jpOmnNditvVsuF~?OugC9@wKQwWvRNZms*cH8bA`bSr;DS3a(_0$MBSS6 zoY_Ov1x@)g{R;ffT2x~RQmSrrJXmkZYNw!n6=(3`?kmzkBLc+5U&7`Mp5^np5@SQc zV%IlU(Wh;X*Q<^3%r?rCNTcCt)ky^-0_e;Q0# zm#c&{Q1AbQ40xDERZB)G0=A@o;spFZ6;2B{hR<^Pz4P-JwQ<>By|ovk>jVj#v)fKC z;T?OVvFh7?zP>{(8kUHgW)Icmf>eJ|QX*I~8qxkq^J;81U zy6@ok_rIU}a36e8AI_OGv-g^{)?QPtyxx!$bG#!#8&7Gp*YE6BjVwV_=#BJ-PVZep zT}-iLtO3mGYZJfcxFnq?z8oF5j7eaVWHVYk!>OS4nz5^a$*A*y=l3afF-2O+IwAep z;&T_4<*kj`3acb_>18rdys}#m_gOiKHO#XEp^%a<=8%3*P3l~*N*>IfGd9`uvb^_EDF@Dlj# z@~|2?>BuG%k%_QZXy}IMX=ruez|WjOYOvcQs(MiK-xO$rezp9>H(IH%w3RhHxt+U6 z!?EN)Hz9U~%u|zwW%T5>v)V}dj(ZQcl zze$p0fOJB$y!%SE+7adUwj|JV#Cs;BY<1nuqOJ_S{XvA|j%7gG?O3p&^XnYpMbg+s zHrdToPy~`19;Yp@!qgTqCKWF;u9_u%(tr#*{U$0Eo;b;$_d-ME!rGkfZfFbPt)A;V zH+>AnQ(u;Ll`Hru^5f2L=%isQtV&wxEAtN`>G&6lO zdLjl7YZ~hL{`Bv(&R+u`5nfDEQLa9I{{Xb8{SEdOc`5gbxz9=VG~G#D3)P~udo$&^ zOLMD@tiVel5@ZnZN6UY3(Trk^Pz#Ad1LqyT1PQ4&+}A)H*}8T~n*9$~!H0=@qk=!l^=N_o%qrFYZ~(NJI73$@v| zIX5-W@5Bg2*;B)bz1%26Dr!_&q{9yK!$aT_fh^qAP%U}mIKZ(=ZZoMU4)ou=$AkGw z>FR9QwQMUEu`>+auNJkxWf~mT+UDUe>7oGZl^FClFULe0D}H^A+)KPXy;V zNbz}c$@MxB88Y?Lt&sFPWb&p{Zqt3*cc$hg+IIN4(&+R=Id7717F%m5rZ9FVp?ZMw zm@WsY9(DkFskR$G9v*vPibQVYyGbyRq7CA|rn)X{biR1H(LK!Bc5yr!I_B4*RL(~8 zyJY}%S!mt-$c1YLu(^uB7RL5hck*fIl;p{NquWKhX)8BL6bJF>FujCFarpba@f6K( z3(Fm-)SZvPE#i0B;r-dA@m^O!%E#1F#!LNx% zWd3Wnvq3{nmYwmx5x+=U%k=+^_cq*?AJz50dU3N1gGDW!-rT_+l^wS`MV05|`uU@0Vw{n3W^s_|hP3}vrPhWsIE3c5* zeq+5I0p~!Y-|{>0y|8Y0hzMrbnx(Wjs%o1|nqRyR2!Oi2{KYv{y={lEA@UnWdiPRr zY-cY@(v{O+$=^XLQ^wFJspyC*g@UJ`_KHr1+eZp3>)33ZGJvHURuIitkz+h@lg~A&yxsVBzp31^`D1EjpS9tYBpWg1C;JB`J5h6^jaT7 zfn_#V&0a2nj+yf57U{4}_VfZLkwu)$w$^~$>LV%5>$k0HbpFQOAqTkpUy(gHCgNMP_D*N>SI^_IE4QOn8)TAujekY7{LSr{f7WN+JZWd| 
zwc~wPUquay=RlkMjygQ_iBG3)+dWGL?O05q?Ehg*(Q{ozFg5!B%hDk$G4)q)V=;Uz zU9UYe|4-9ZdJVnte&3S`S#+{Gog77ZrZ6#}`0JQNGKJ5I7<**)!sq3-2O+KSXft_y zZk4B9xorG*pV}ti0C-(zs);*>3^-j*1!~>pmOuOZpH=Q$=k%h$atp4N9`p$(`=fYr zw$Cl6@*wLlRig{$S^CwD_YeB+c74r0@A9WgWpt%~7>y$D0iS26aB>dGtx%%+MpsV* z?E)1pGzwVf;=>=-vf6BGafXrJ=CZ{a4r6+y!#h0v5iklHuJuQ8tb2`KdbqlX>bO#l zifINv7en20l(R#`bur76|2c$w7Qf~aGb*h{h09m+t<0Z;Wk}=8?ewN37ak*&a z(^lH(zic};pI~Ml^b3>zU~quWk5t#6gTV^FJo|P}WvW6wbLv9S_w6L;s2Z68i5*-H z(?t!p|A}}M$!PN)TieT>a#Z{e2rIC9*)zdFp@08ddpj1FIdr+jLqQ?4bp~aO-~BrN zNU(GD9OwMc#D~MD_N$XT+Ei7NLT8KJteN=^mUXDrs@ejJ+k4Uc(jXgz!sy8&HRfV9 z?t+C_vz696p@Pw+e90IC%fRuqpvirKmQPTh>Iul^H5)zesm0&fCw#>~yHM=0@D`%y z44d)p0{7B8D_|rL^Fq<1I%XVdz+nm{yqj^EY~pFo`4fxUR&kPku`@t_OJ>LiBMl4p zLgyD1brV(Ae2TbY5b{QZ?wrcRn`9Nm;N!L#1br0o22f^e~0cy94kJ!3mwn%zm zi)yigsr==_xK+>@52AfGF^WbR4Eml@tUGL)g_9!*MHW{slsqlKJI1%ekI^%R?xl zbtj9I#s@Qw+J#MwZI~H5e#!R8xzQfj_75CPHuO;;JA8O)$=-tDuWGcZLf!r}i8Iu8 z#-P#mi+ZYN(os0PjY7LxuHqiqU4Te;CVyWi&PJ2+=-e-+7KIqMzErp4`N038V*B<* zB_^B-|Ch6#n;``g6vV%0+1Y@WX2z4?q{Fz#9dLCWPav}Tagx8z>JWu5(msvd=6N6j08#DLlteguE`IyqZx2_$ZJ0vKcffBp;XA zE{NRg7_Eus!aPTB<*^8r>r0parzdSOAaCsJz>ti+L@|*S zPW@lL1i8W~Y;5PtsCf}!>SV0tc$WtfZ&gya#1FQV#l#s5o&M9(h|G|?-o>(i{y#53 z@8J)CpAR3G9I-bCYeOJEZ2YGBJ%i4?C6V&ol|eiNV&(IENDa+SgW?-_jNps11N(B+ zE7>WO4}sg{Z|pAf*ox4R)~nPyxa&YbB|@Kv7j`b)rq8qY`%^me>ATzYHNj2?MNQyX z8T^_kh(RU_agz9@s`O2L$ix8!&EzMH$E3`NL9X**ud#^0F<@zHo@^I=iuLyUi(KtA z;s-IutY^FbUKA9K#YT~xH9bYIpgvB$WyOEfANv-Kw?Fan6>t52i34lBJ!D49Kq}=c zu8OG7#yH77ZPcg;XyszTmH%JgI z)wsmo(0K=Va-9!v zjhmb|1?oj)1L5y|qcw)Dgs)G_ySZR?QmVbjGHY!2xka)&Wq(oigU63jFGaOD>gw$T z6&WgSoI%{&R%{~%zH-l`mZ_YCC;!A&37JphEP6z!=kJd|@on~kKQx=5$tG$Jhy_7U zZk5^gV%5?9&bNy;7v=WVcN#%Cs2@rFWP%!pBF$lHY#QGH7rtpOG^+IISNyT@&L8XY zMQx#Qy6Ze1Mb@Xs=@O%@o)2g)yY0crxMIk!KW*e!0UK5z;+j}WEi<{IbGE_H16&3@ z?@45*z=!C+lF4Uvsqw3R%ZSO4I(bKo@Y7jqIf$SWPgOJ>KPKGIAandt#}4G&ZEn>UZX|Z zMD{$gi1t+d1U3 zZa=m`vu~eMNf=z1iWws|Vd?B5;_KZZvCosAI>Is7KqzjxrMi;N1Zz&Foz@#DJ`FO;~40SBx9w7K!~`rukdTH-sdL 
zO5^W-z1aj4zG!rd4I+xti1;7)e&ug&b<{QawltIZm|vr!wX3d;;}zo9(`qwKLn}p= zk?jG4z%?DIC&*qlH1=IP@Lq)ohw?X6Ry%(Y>dl;do_r<{h3XOac{f6}cc?p^(Khyd zk(nCp{oAGT(`Au~Nb+`r0@1$m-DsGwUSgJlAJ$s!6R(mtK+0I&SOu^~j}zlOPM|RM zh7$%0OOG<^oy)108Nse$4jDNcCwo)$$_0+SQo1s+K-a<|dEyfXorK-!q5m zn4;Z}eA2K6H2YgO2KuiH!knY=wfU@LE`Zw)9)F0WB)RiDH&(FjTf>{S`%N|9zvD^C z|LbXG_s#5lKeQU+N|$G#uNFEbRQdv&6b6&QZFim!L6J72GBk)=J=`tTy!?$H}>K+%Vkl%GK zwRLIqmz*g#XF;_;^YHiX2M;8R#H~Kds8k`Z$(Bxh)M$Wm-!wC-jpwDr(9D-jNji$(S-h|iy;INL zdTQTnY~|QD`(6JVgi*$+x(Qjo&+1eHwzCoEH3?*)8McHjYLp;{soAwr9KFCY_7A&@6? znm3~b{qhZgv^DJpIVTKPO)nj;nVnf&(O|(|Ax>k(NI!b(Kj0V#{|2j|@%hL#y3bc{ z4L`Rk0Y^^&^+m|34grBp5TG#*Fce|t;sG}@NMs|hT22S&`m zzpdIVBQMI8%#3iiRsHa0s&JsWGx?4_*C3aX>PvOJasFY})VA$H(5s$N9X2{Cl9pWA z@)(){dMc%Pa*P`VyG~sqBBdC$4W0Z{oFURtN7wVq@zDs-5|@cj`MYCu>k9vNWFSaJiSiwnBG#XFKqHC8XCK~ z@L7PqD93nGI?n6HU)4Sq?J`ZgnAt<7wfRoT6nZnwIO4`g8p%dD-+*`6B7MaauJLvp zKZaH*+dWahMmd>OB)$16Q)SMxx^rY;VvF#3D#TM9f}tNT^Blqxx`rTz@c$+S4^{o# zq$JL{_WYcYu>+sSvv*nnHBJiT<&<+4K~*mYpQ+Y#zlE7Q9=W#yDRjNGzK z;G8U9TH5*}e|mB>4W^aIotft_Kl@*X=&}=V#1jFKSMM1^6v8bN{k82#$s0v&C%frS zh|TJt%J(Vf%X}HtzS)DD0c*C5g_TrvZ!~~?m3qN}q0QD$Z2Ueg_RP49A%rf+c+I+m zb1cPAn`Qx3R!k4h-s12iO!4pjAuYQ!ptz2$KKK_VW@G+Oho>EfnB<}%G@KO44&&A= zEJNYNUlbl@O}1s1WDi5{{4@AMd99E)8JY7RgG8hW;Gmk^w0ct;4aFrDlue9?-ks&{ zU}Q-MH$H%*Hs$qZ4)q0#(l&fhg$PL#ncw~Xmpnd_it{R39Zni(f! 
zD|vUG#}$s!^t@$u1UilfOU!>a5VA3y(9CZBZPb6xDKymF$*6w|XU3sV6y7&W-%9OHd zqHr&{+YBr_@j&#BaKf>WcO9+Mf!5g%0D#^u> ziT~tgpxPTSroMvi5#whLhDbDtc-wizJ{1TH5~R$yNXFe4#Zj4lrjWsf^)>K~L`!ge zf(U4h{%Ci@z_kA+coJ8=j^JCcF&;$!Mksw;-ImYJK@C+hl)GD!r$d{guPaWMqozVN zJ-8V1J>hZ$DckNFaUcf|*Zj{zbhA4+3jw&@=}NeHb|#CjN1mp8!_0a@igXXnNfHZY zy(MA=5MN49NNtY&*If~{@J#!U%r9UZ#o+$N4_vjDkC4W5`?RXJ*tVUGzjFY z=12ROopCW?o}FQeo7w74?BoqA+er4oB(UU~C1Iv?aA@U37BAIF&I-#mx@e1p#i^ zGZ-krDg8e^kv97m*clLF<{M=R8MPwApjwT2&$Y;%%Z-Obc9x_y$rrv0b60AB?Eq$p z$h^$@4o=KsOfmG_v2XtFQ(hq*F9|;pC?$SYs7)-~wrCg+deUYX#^4b#$~sO=z1OCF z92IuUv|aJ`$FI*=n~y4J@Oq*IIpW5*R|h>_pCA8@nD8ifpM<`p{8EG~E;jWP6YB1e zO=g>B5sFVuXVe$|rJ*v1RDGCO+^*Bg>3LfOlhl|PRG8b8WEw7>EuveTb` zRix5=ccpONI+WA8q5ua|yDy%3_|TLHgZ8ao&Mo_2hPJL|!dN!?MPOdJf zudjFej!&G6Fi3nBcYdtBJ5Ns#s6NN^q~}|g5F5Q z(d+fMHFK~OIekQb*GXhHh(qX|zf)0?hS;7PQ;A{AmLIbO%IcIBsJgtpooH~gxyxMq zL2R!&J-EOL)i<`NbZ9gIQ^ag)^13?)U+zM)2s)8O#S71$u3Iq=*BqYLi;aXe!*q$siwryGV&sP{1P{(h_&7Z&~ys;lpX z+n5XU`N9^YOO|-XQ$O5Ux=1?2`1hHD;*rJ|m6#5A#^;z^G8Db27`2t?s%E9L@4j5P zeX$xA!dc(>!i%^$kS!MG7mC)oes^{3e)q_l(e54tj$Sh3gpxq)qY_$ApTcqqW6D&J z(}zIDi2BR^ko=29H1gY+4+rlC(YTyWiCVm4b}ttnc$^ZNjx6ThGfn1(S1i)fTPbe` z1EDcFGz)QET+-(%5!=PDvZ`5~RRL$!>#0%8A~QTg`BVkln1nUiH6Rpqiu^RHh+J;A9;iw+3mkNu8 z+LafSJ$I)lI?_&EVrPWff1QU8soA_&8oM6^>bZzS(a2`wp1^to9<)TqDeBL?5zC{( z0&me!6g|GaJ94>8n2*`|n$}?*Q!QtX1jr4vNPG{W$^3P76svxbpYMnw+qyVG*0@s@ zhxF>7-e{>fwrDavNo5{ao_Pj7p4URV_NxzYY1}X%ji(ENEhD$~|fB6LNv2 z?hZMmuBC;HqSzA{238B&ReNkNBIoJSs7_3LSDTb-bHTuHYN*$zz{{R50~fKZR_N?q z6fOhb2BWr0swBO>E#r;Sf%2*ZOV^p>YHR*_Wpzzi1d{W~>!JCpMD{haJ-GfZB755M z6gFXB!+m@5q42N}&up$V5))3-RQi)aj_2u0V_TeiAlnJLrjp&>@N%KPA3ZA1l+U&% zF0_USiCgkOC(0Vw_<89al`Jr$5l|QxKKeRTd)&wR_39?tQf}J4nnS*IR@QLe3(X%c zlpLW`!aSeKVD;W*RQ`D%8u+6C%*!B`;IK_iQ7pg+eVaSxcMn3+Ao`Br>0>pfDO)!M z`M%YEt4{_eTN1G3@lf!7X~*D^Dxx$HR7s+ag@}`Lh?!9Pur5!I$@+ z{@SKBI~&a9K21%g(zx#Yma93{&f9x=H`mH9o2zV&$(>C^ogG=bCll)F=_yOdsxe~s ziAH!@6TwDtF*PRA*( z!FwBdGrnHCDkup&%Swi|URMUIqX8NXPREo02^f;?XwAcqX*(!=3p91&D>YrFJeZs( 
z=EVwRXfFSFI1|xS;WBpFGcib>StL}bGZ{({SrHrEcIxP$TXx4EW_i|9U-`=CSHsVX5XP)CbFQuC^{a24WBPJj89`6S z^Uakj?9Z>J@xDLD!ml&vEDe%Gb-!z5G@LkIi4U5}wr^AIzm_zzWy?Y;?qvms_soi{ zF8~3rhwvh@zF>|V{6^z1p{e$E19FbkHw1n(pW%?U! z5s%?SF{{a;25l-LE5S}*j8&%_Zcv2Ir&!FJcAKp7#q)WbZncshFr=j^T#Eh z(bM5bDDn=39lz&N^+GAS@a0EMMa6MN!XM@s@?ZS0C)1OdZKZa$G;CrMH&oq3mJ4w{&N(?b4NbTDStQ&Uu=V{YeLqx=-Qd>fl8mN(6KQ>xE7e zP;LT73ubLvL%-sW?wfu8yXVdOjm>2o0Y|<-EG5v*nYK=GbGIrTMz1Wkgy3|@t`0o0 z!vs)CtPYd;yL)T7jpRb=CGcQrm}kv4qn>vtC?z4gZ8E<2%k_LIGegEBWdn7fF$qz^ z80Q}`yX3G^Pt%+Ig2b__YO@9NJgDb4M^TS1{{rtVoPY;N)S@S@4v{S}zBJ7!3BS4` z7>FgaBv9jcr89DmZh_<3V{X#=IJ^O4z1+H~(Q1kD8 zywUw|BnL)3rj1KqUVGq5 zRilkuV&Pkp_K1Nef%A6Y-z5sM@E>EL2&-M(9Vq5Fc+j)LemwOfy6)Z99iyr>K}Tzf z^=BFt0&6!zSMH~Ku%Rd7Vr8J%kjj@9hT?}jii>`6pO76v)+B(1rlWH__9Z+BIBM+x znmQV@2$xti0W{*e2Yxw$VnEW(ZE&8=_PNIpTha18dqm zuVXa)p!i=hG*|pY!c!P*-E?}J^HG1h(Y=fCq9G^aOJAHCU)(Z26_BY;43&V7&V z8Sy{USq-ofJ&dd{pz@s%!-zq0&KS9?B>NGl_8cVneSCZ*{_DTJOYHwRf?2%*cngiA zLdl@Ez1qv#Pt>jBC9>(Tbrlu?7D{^6e=yL>VC3PGi!Z0N)-O5=W_%G2xB{xv-z=cX zz`g8(T*OPIx4(+x@~oi`CiJQ>1dX?tNBI7el~Dbp#crI!m9BO_-_VR^L}mH^c>&Ux zH!l+ruy&SKS6&RncdKU~jq@mr=hFNPz~|+0X`7h9R){W~tu!S)Kk6PKODw0yc>cbV zt(~6WD}A(v%M~Cl@;VDNREr{n^6L9{dXNDr!ukdkVLvi-qhmD=E1&eXlu+p#u-exkxd)z9|=r^HBuVdYR=@@*yy~^&Kqoq6eZP>37WADA$zklIkdnSN9 zZora=+L%ha47|kfZfHs;1QRV3DD$j zr!7>$2Pz%v0ISX^qKN4B)rHuBrjqOupL}NfOX4StA0jE*fjsqcZ}ne4qZQMJjqika z<-e-?WO`h~7Mni_t5C?}s+{6VsvbOSn_XA%(l@t7Y|(DI3ZP;!tdLaCr->E4Q&h8+IDCs4%f<~eOHF#5UgT1f)8N~aQzRDbxAa!#0Le&w-V>%?E?tvmyKP* z=rbcx>b^^o|FsUv9DqbfFy@SlqETiM!=rLb@d}3ry(@=@RV;k8&!{M^i3YueE1JuZ?FoNlGpa?t(WUk_H#=bpkGD5lm7XuKIokH?2NPJ?qp7^JPK8S0B|pVAej-$ zAHns8cW>??%k1Sd3yTA{bq)87=FM!Fl@xG7=Mccs{l)`$G1&l$YH!ZN`0x31U2=mf z^)%`eDd39v4qF*2IVFIUk{LIc^H7s%$o%T&)UYi@)l4+s^8Scs1;okNpL1jd zw8!hzNIc&{oR8{J7Oox{tc)%j=6~>y7~#PPJW?}XR~)Jv=s000u7zhb^1pog^>shS z;p*|jRJCM0l_{Ue(0%i6a*mse&lM)N%VP~P<@e=2Roa`KFr9|0KUhbLzKEX3lRZx> z=sjYc*AsHbGmexjBm;I^OiiJmHS4yJZ=-)36-5;HswlVkdC8!1GkI;T$OZCg$e^+U 
z{>9ZWJEOQS^y_nT{)hlxC57gek;n9`b&&orWln?ymi#?TbhuwN#sx%u4P29Rj!f=u z%KnwU2eOUi!?5Sl7bAdiF>K6ddL%5H<}EY?U0u&uI*)8Ej& zLM_^oXI>_|+}y`%^y9?e1oxAj+ALk_HN7TP#8^6U-CU`+IAQ2wA*reO+2T=w*MOSw zX|c#JH$7bwGj>eRQa=(j&vh(5Pph+1A1~cLUiR$;)%v|_ufTL*>(=YPMjx*xLuo`I zs<2ui0#>RRnyu6`;*IFOg~`2H&UvAp=Lh-%zlXBc99+#eLp?DCu>=|81=*Kd9!v%e zmK$y`pB6s{x7+dc0v7kF^}`9hIw{5OZ)S4x-Q!B7b_w~A1}4LSlzo7-oMQAUiZ$1Q z^?K=UGWNoI77wd%|DA2k$c~f|oN@2*7uBwNsw2^eT@4GRJtWotiUR$=ZYhXyanQuz z;Rwpad#LBX=^(q`>$F`NFe}k?l5EY4Q=X)?nBPe$RM!_|nFZX+wMzcp_A>h+@*HhH z!D_kcg!6a479K#Zlig6D{;JsbyD*fJSDg+<|0y;+u2XoXM(1{x9wvhF^WtE+p#BNc z6D4I)V_wKph)!xWoQmVv0lHKL^9?w z{#DDJEBbHBuIL*3h2Sw@Iro~Ygq|Qw2zF~~{W)&10_rxb2vwe~73))RUjIu;<>v@k zY(`!~sc*Y2`Vh2k*4czg7XKW(Yr@2_=nYA%uZL#xUEBhv?kPuu2 z)TN_B^TO8yqt6!(GdL82*KF*F@2P9)%)4?|r`Z;^{Qn^gTSEx%B^XwqA>oBPSdTy81j)t0p7-Xk`9_@hrS+~Z>x zjtL!~dT7@d+6%=d3(qBoDhP?o-|WGJE~*1n+3`CU`Qlry26HV6g|~gmZpecA{h${2>U$;~4Bn??>l1p^)< zfa-gEJ!$$GaFMJ?|MXg0E^0--(h@O z<|(m|)q5rRkNR3C|3#z?3l>=mb~@9BMd{d_ukg2C(CzXMGGoW=ya22@CN1Nn13X3- zkn0a1%$JUx7mJAF4Wo&a?7Jt|!tRjt0-1Eh#em^uXkDih%_|I?c1>cSAnb z`K4W6sfSOhShI(`TCLVHEkM5V2zv@PAw(rf}mqozfRwjprNfW#aIVhEMU& z5@@aQc#6bKyut}!`mPh(aC~;b*86>EqMJG>FNYpbLKGr)`=4Hg0mP z@iG^k&1__wg?-%IVhK@oN`aHM$g>(qd-NO_;U@RnVje-r`}nI{vjwa=hmkatfOV$pyoIF ztD~*o+)u9AJ428{G?{Zq^wSEZPz8C=yYEh8ng%IlP(?N3?N`ur0t2P;;0!6fz7>~5 zr?lXM*88<^Xs`6l5~Uc*>h+xW36Hj`URK2J|4_NKPDD~>2kJFgNx)nJFwy zBlO){Q)i3}(WgB!j0@zS7ARX^{!w5kFOZr`$YHWSC)31WdhSi|z7s5%Va60qG0I*Z z8zQ5-zTRJx`yXRblMm>pU!~$(NG(Ae4Q_J0xy+i&GAbQU_%+A6FLb8(7yL7OO-j zABFMx++r(aw8Af$gH|53>kB$E?FX^?;QD8+TumuN4xBaPt4g_l`!A7PsE1@Hqkuar z`we6@_5VXyZ)qqXDjj>hm*E)Ls2hu$&Gddg!xBP)eygFL%2*UJ8`- z63ild2<5FQ$|+B85TPO$(Q95V8KHu^Z%wt>>vdi`AcR;p{ErxK57)t1a+*!xoTTVgCWJk{ACfdUq(LnEl@_d+RZ)>|p9+h|RD6=uU!8YL z*72m;7!_IgF12%IHcE<}(i9N*I@h6|XN1If7QLeZEw|USZ#1M8^&aS%gZTqWP+mAm zt}~R|4bNnRIO-3iC}&z33ak(2-3Jb}dT_5%62Os9Lj%?Bp}rW1N4NZ`}W2$Pw^`!o`50F76TTJEMI;7%e9?YjBx^jHxPJeU}yq``~B~d z(mjtt`=d#2bSq;F~J#8mZ0Tb$`k! 
z=Sa7qxGB-Uf()?OSgY2Y(E#a^ta2t;R)KRu)&%6GwYC1UiK$wwHr?|hfAQlTF5GKO)?=5n2Qz{9H0r7Tk z>=74kGulKlqr@O8DVj4?eqdy~xBn5V7TG0~dFiFiGk^?iBU)L855wrKk$DHsl*A#n zz>ne|%2jg}Uuh@IO&Xte*v*Co-${F_?g^qz!_+y~H6)m?qxf*)+D{$Scv7H*1xh%< zE&G(SvGsVWp1LfpVPm(ti7IgPQA6eAAE@)SzwPnS`;etq{iTys_)0%p-^zegE5^u1 zk+p!fNN!^gybX$E1D-A*WM@|&RL4jl-w#L8zi-9Q@$(w=6~#ciC1{*%-7&()xFGP5 ztUd;X5A{pOf?H;6-j2JpP~DB!o;G{N#J^MAk%W+FU5W3%4TC8+KzQ-~9Sw~|A=yvO z0ZPmP(~UZ^8WPWMT+}E!j_1ZUjzj6IHHfXz=82*cQB!K?eR{53GJD9(fge!Ov8iqH?Q^#M;bF9XI_S?ib1N zrxjKTHAEpFmEkskw}H9E4xXVrxtGuL?~CbF#>q03G;Yby z=4@h>Uo6?~a9QjR&Rf4%{t4}I%|#x+OI=vnWCv3TE+{lw*OC{TZZ52^WQ!jVP+lbZPN~=|XT6Zp`+PLjNWoJ>W8%QKU#Bak0iRB4X<(x_ zGx#Zpivgwk7qV1YnMNP_&SLRTIdaFIZTa$(QU+{4o|60z2_0gKR7U#68>fmbAK!{a z;x1!PE!ad4IHA1a-Jal&G7xt*-|O^p8yA#BKPudoD(Pus0XGD6xD;32;0H2WiqYzc|Jyxw3gF$#vv z(*6R2kl$;YkGdmq@zwL7)4Wjt>fwBSiU96SIo%e^h-I{oMf{(O0@s{}cRK%JW z^6zK2#?XYTo#)Y{FOwMSZ4;gbv<)hxW_R!G@>9ZlA@fPcl~o`)dNqxYPU=U(fdEA# z_seRfX^fU`a8+J9j3IC#vIRWAJ`8)uPM1WW_nMePPz}759At12*hwB-361+UZ$j?x zQ%(`Az3dsd7CMv7OZ+l-`Kd3%eRnpwHQb{6MY?%<&BEMPCw#OHpi}IEme3CLR~mDT zgfMn6ZFEhhd>6OWgHa?M*IjK%E#S<91y*e1!X}!0&l;qak$X^APguwrEoW>V6V=b` zwL-5208&?eD5KWsv5v-f&}cu1x!81uGl!Iwx(Hcxu~N2x{Y>ICmfRRgBtu*8*ajbR$iDZK zJykNMyaRw6NYv{6*5`yp<4rqtm9mTrbsS3MWeUev$b*psn-CtZn5n^aBB|8*udxio zgZX%B>sea*@)tnR;2&nX8-=&aTM49T0T#$cc<Q(`a2655;#g42@rMYU6zQ&S=dVz!?55FqBMEi(WXMwYpF7b_EDqxVw>L?%H@+4Lz)~X|^v2NtJiE1V-A5c|8hu(^5D9O}y&yC+2|5xia@A3aY3i@lq z5Lx!MOWoZVl+~BFFa`1~a`%-Q_!*M@7=c+_JP0+v8x;gWOMf!}pLL>MN5IvWpVSc5 zn;O$#6ciN9P0?;}ZfC=lz{fNP&G)CU9l(U&jT-$;%wnG3sd$~SNHpJ(>-h2Y>L8NV zA3Z7?r^OiNVqMV?M^~5)P_i6nzZ(>5$=p862rqEoZu!L^iH}d^u+oZEsyqClKNhmR zQ#vpUE)OD6t7!ceI6hXX?ckoJ$sXV~+kQ4KruTX8T=jxL@M`G?fz)pA1$f^q1+|MV zTJTan9{oxXn^Od>>Z9l<{xv~LyZ2gLDU7#5aUsXkZJ~7yu{UX%8tc8nMec=vpG1#J zBKnfm_I$fQ97ANDTWF6rTsx_$x90RZkdP)S$`w3&3S81uGJ@8}NNua&ymkMUAkSr*k* zDQITlq)wltSx>O*Hm_U!_zuRE4@5_4jZ&c!xEKNRQ7+b!P zPt2@+$C|`A)LhSZ)=@aqj$zHjiFp1(ML`IsH@=~=DJd1&yfu$2N^UlCtEgtlYA({hF@q&C7lmjaCblG=t}!(|O;*rM^Y?J7;61+18) 
z*n}EGs^`8Yk-Be}hUoi(vegw_-#lsJ{B8qpn!N6tCRVDm)eRW5O?pVxtp7WCSK?Hy zT_S5kh;?*gc7SE9B~XOv1#88V-?z8CS#DEt#dpUSEVqomlgInO4#Zr**N4BLk}kk!DH%NaB$0@( zo!SZjxo|nR-U%0k&EJ0v3wbplOmV$mTjft;sEsA%PwwHYj(SP*wotXI6<=t>+FD@M za$;?w*~Tj&pc*3-Ip(!*#c|{mfbKqX{hFM0q$wzY)DPERCc`61(+j3B?Oh0PNUCsV0C zU>zs9k#(Hux&B}M(GXz^wrD)IXhd;&shoRutn=jzWF)Cc z^jyK^J-^E{oCMv=(;Hz6ndk^Ot}D^HU3jNV#12HhXh(=&PgH>^0}-U00Epj2W%q6p z)f^0hgsT0?w;8ly20PGA#h0Bx6~T8e*RCuwKp4hq zqfjbPLO@wbu~WggQr^>08wq{{xF_ijs;(jt?^tN4Wc}H-dB_P=(0(*_C33EVVWdpw zYb-23r-Bx<_gh{5MXq(i7B1-zClr{7!9=oeerpdt?=6N?5iri$OVp;PMV`8|X3)6< z^Xu{5-1ZryI_D1nTl;3WEt8Yz{xOY4cQ&73)S_h-ALgUzQA_eY@39E!*4*BBLenF7 zsSJK0^{()aZ|ie!CtaC}qDO41aL$wMn3Y*c`QmhokTM{i|Ok3u|Jx1y8w7D}d zlo5OKQsrOqG_VqCVFTD_BS`!n;J6=@er9BVe?A{? z(bYr9+MxVDNkOx?p{EMJ#`0QubhzVf@;8kkpU6;Ks9roy>2pC3Iecka(Ha`r>vn?i zN3k^0YGh{s3ySGj)0#)GvHbkzE6=Crg@w6KK5CgaTOj?hM%kBNNJ|P|{Q9c?D23C( zO=QGGz6rW4+iC~)nO5g=SDIZkOBy^`MXlnz08M-EpMiloIc z)yG_0#U?G;e}d!*_{lxAnm%(cCak_Q4{?VR$ITKpvoU4h{YUr2e&sV)CzT3x5K~kb z^IYqYORe-ys}P>-)+`KT+3)6*+`*@^3vhqm&J}~sqa5}>eBy){;thQ&@!_zWP|u0{ zk0c=&$vHDCIruF@1M|D0u!qIegC4`!{CCRo13L79Ef2Sh>EPZCbaIUl?m=RGsSTmL zaR90;)R#tg!$oWtN|j2L>G<83 z4ZbQb;G)Yr7Sh=QKndiT71+eJQ-Hx!LwH($C-@|$Xwg!CQf@>+pVJHKE@j?eKSF#= zUM;;H4@x41&}h(;|J?gkq`9p`Kz$UEw9>a|ViN7qfMu0Sx>?t@0h)@uCmk>S`hnR!rTq?50 zV=v?e`=11M8(Z_}RI>BX_V=-#OFpZj#C z6)P((oal|hGNbaWz0>m1La;Ds6{5L@|IN)9E}5_&X`c!TwMF;uZ0`Oz=}D&NTbK@4 zofG0bJB>;L#{VIM^&B_(?8#Fc7Ev0di~+Hs-hfP!j{Zt2Zj4buJqyV&_@-OW=k!oi zHjJb;@EZu-2@tzV(6!U4ZeMH(e60RQdvKpbAHYfOS?NNDG4_te@?>c@2JbUD6`@h6 zO>g}kj5{&Y3)F!lt*FEQy~CxoWC zsfyDAB~)hBx5WAKS`^NVPUkftHh9d6KD6fD9)yb0t(@5`r4(}@zAc#cGCr9%*f$H9*7yP!^aQ52w7RbS=&aog*s-M-(O%>^2;kxx}X`9E;wCI3q{ zkq;e(`-$W$b(zA!XBAXs)`}=iqhVk$B?HWu4m&u-U3RO?Q*>MTFo%RtZEuM)P?=;` zx@bt!DnK-a@_7E6#suqpG4P0>G?;mrGen1*NM9ih6-D2eEkOuBla>1kHh%Xu5?@)q z&Q^TbCX}ePs2=rr88rEDmA+0nzdl~L8Db=0WlDP6bb(i3H-^2SHT619E_~YS7{Dnx zqL5bdh41RGd37K=6DU3xnUT~vw9z^2OHP( zCSg%Ao-4DYh9}nRBnA}m5Y>4mNzJGjxjCdqiuU)`gQPu5iXk`<67ax1XJ5<(gKxhT 
z@lwPQv}`c#u~h+t&qe>b#Sd^`1S`UvB4yEXz#Vcl&F;*p9-Wn)^-u8)Hup4!7l>{3 z*0~fM7L#mmuEGb9qmt6HA+}md%Cz2i($CahlN9@;$n9xcYQW@UVFo!oTjp&He?qJV zk?LR#l3b>iB~NF!53EX5y(2qZZmOB0AvpCHPgwmi$=XJsF7a{m>$Yj(Y$WQ%wTgg& zr^|Ax_s12ClZ#72-Be>rW{z9`(L(iF^5mir%SYjFw84wd$CXn3tx%bYC70^Y=Y=|V zlF^z+#MN(l6!Dg=lK#?yeBy72CHZRXL&GePxj9Q?nFzYa@6 zhS*43|bC{XdX(6%f~uirst0-68w`l(e961V43~D-~&kf zXn9@r`XxZ(AE}_JvtNpFj~9IV&e0o2wITMU`cj_UKG3KpN8Z>oV%z%JWD4rq-(<>c z+jW#K{Li$nA#=n`$N35^f%7h9{n&+j68&WVhBFKmXq1pu`0Oj__ReegSk{8t5|~R- zu$gugo72;mm&pID%fxHC_IG^QhD8Ge3Hwq7x%}f*8y;@=Gn7@(nJ{YUEA`U8Wux4q z6OP1FKU`d<2*njHEsw+s`fuD^b1Bt2m|GC85=H%xuu32Z0M`Qxre7L?0HVxXCX1dh zp1;u{M}Mo2SzpFTlkBi^#lLWDL?`St$0x1cDT;EmsY0#q;Pvfv2whsahn+fXe`#2%AO;R0hc89T$CSNI=X-U@ISyRa|+JzbN%uEJP8KA>zt$S z!AIp^q7;DF#`i+MQg|_p+&Gp|;_)u)htZ@0R2v@c%=CfJ-V)p&UnX8HLxUcbJjjWK zh8w-Y>*{}}ZdeVpI_b~qU(;18!n4hm6W-k2ayRTXhfCiwSfPn{V+Z32+Pt7^@^uAZ zD93u0fzC;III)hcfNG66WK->|+_8*VmSBws-z);aacjL{g+8OsVBq+BuX>XCTuRLborq8ZR-99c%f91bjpu4AY!tsG z0i3rv9r(6rv<&kHYocE-FBpdYEXi1kq4IZ=*POojJCy7c$0_eevGYeiU$#8TLP|Av z`9N3u&-Tz^6pj!uSaK-#M8U;rbmVPU9;vS#w6a=NpHMVqXVuITwLu{a#%f}Sqbk$5 zOK5+|4x@SIF;EG`ql$~_#tUzBKZ*V`i<2S-)4;;wq87SbTh&3w6RpetHDB;i7xLUc zo@V;b2b~>G^6o7bF>QhHCl}F9yB|=oyGrWpeNHsD4sroqzuFDd38Yxny8fz+m&PXyt+WuVPGV9ZQ27Y#xYs^S=_ zVTuhO84$|)mED#D{SfdTrcAiDSa1~wJ3c5Hd%z0uB&O-6cX#{?Q!|>t1IMTIe$tv} z8c8neJhhL)=2>^OT7eNg*!51j)*E%{)OBj`6SsWHS6zKF2Iq8_@Qd2ah1!7h2ewS< z@!B@7$BDgVMa#hT5c?&)F^>Y9Fs!p50)JiMdQbqU)&B20EDIGs=-(t}a>b&VLaf?^W5Thw}_jIxu+UahoJU3xGTd{5rA8?)Gp@Von^ zHI#UcG}SjfUGa(99lzipcQ%0l^RGpyt{{l`#Cpqk`}`X}#!`#hI0k2K8N=5M0iQ?i z25dTb9_vC)%62cK>}8E#H)AeP&4;Num=BMusB=y=XSUb$@JmHgYnaFSmSD5Iv~0cm zjdB&Ws3x*(?TwVcnBN*&geng4$3d7*=TKLjXt>e^;5Sg=mZHqz-X&*bWbeBs;@jO( zbmM}tBQWY$AlUW@6qpA_Iyl?`ZYlpZP;(fJA;`k7K67}_{M7>m_auQ;4&c3TcT~<% zV~)03mBACM-o0XcXJ*tBQNJLH2y!FhX>HXCqo=;%T<*UZDP}A6C60GXMVNBGe?|#6 z^e~pVT3$nc4<&-E#Z&|e?WoI&8ivy)(mCuD(YcNolf8?LFTnl$k78IGBuyH=+x-rR zFAnor{{bXwI`WfzA(T?D%BSQlXwj$sanu^g+=pTAyxLfPz`x;=lwOP~S*p?E>EI*X 
z?BYL`q4o%L!7=r8x+prc?_^MTEeB(mz|zt8**@$++%JI?)9=_Fy2|Tqbz|-lxRHyZ z@nT?DX4bR&y*03J#|x$ZA!EpL=4KWFuu(C~H-Anz>;k{oz#pn;_sbX*_r2CEwZU45Gu3HyI_#lLu$gSavE4EDJ!! zK#o?$e4J|pg&ta@?IstVy$nc{z#f)-mw(-T4RjQGZC`nN01^es>?fhS13{9=_C12; z`*H7zo{N`;9=KW>(=lu&eNl9(pYS$QTmLcNtOmhOVAABW#x{H=>=>I7^A#@3fG~of zK#mHOXM>f%r9zGt{sZCUBQizbhUCszqwl-l(<1}7WEk9! z{8SPvgmu+d4M~))THV*L8BLF(vp6m;PlQ42c7R|z!gtr-lmmK6qqc#N3r)jThWtz< z2&IwGxFYWBDT_X`@pr&o0YzDSYb;}ZaJ%=;Fv%ExGohdw{S)frr$y?jY2aFPCrR1; zLpFPS|1y$;H!|X8zyy5u?gl<{y7|m63}l`J)_$;|p7YhPM-CTux<{pb0H9}?xL+|; z0I)g7-0*eRD==DB0|dGDvU(pM~C>JJ$S`GEXx(Wy`DQ7J{#`uos} zv)ofqpEO_m@R7G_Xdv?g&AzV!)p^V{XJzorm(Q;*(0X2YRyo?iu$UNJ^l++UN;3!L z&m>4czx<0L$ZLG3vLszHR7~P^xktV@q>@BtA>X6M+pd?(qU*k(GnPxo`Q4|S4%)`0 z+??kY?|Aw|;_tQ6^weYWr*KjCe|LK40T@Lb_Td0k;J|Q#n*3jXMg7&h$Xx;)pv|4H zWNagt7+>cRwCV6&ICr$v}{T44o{1mB{5_~ZUBhVIUb4j zaKS~qan2z|rLs7oZ5~mWzd$eB%qMmWCGPr3te@u2NhBPfbAc_1S-yu4fe| zSQ;Z=0zh*TO@hq0?fpzm{)jf<#I< zSn-P!ak{<(cdX^maSP;a!BOrAZ0l>5-ylm3ZZ6brI{zEj0GZ9J0ArDl*>-8UaT%VB zk92D1Ax2HNdfP+cUdl0Ip6J@`shSkr?oWZ$nz>GD$=HUnx}Sz>uth!+4|~qz9;!*$ zxA7V)ulfq4@fX9~|K7sv!^&fF9a`P!W0`42?ENjOv0?bZ@J<1F!@uRmc68z(H0&VI zE9y)(b?+CcGvlfx711{lt|m-vH2$)C-SQ^~e1K%8SpUP5%i|)r!i@OnYr4Q1l9Apr zY5XrT)b*x3x{+t^O$Mmnz87KQ%r>;K9J62i)&M+{6Zz|0)OZF$NR{`jd9bp9+^X?^ z%~c%Tgx?+r2N*B^>yl$^c&W5RTJkZE)#{$EeqPI3R5xVw9vnLPN+<>{dmE#baztXm zGrkFm7Qss8b!!LbDOa2h%ha{E8X8_{9YyxfT<{|YI#Yv2fNejeoYfWGg^+VNQhi~- zG-%dyM+Ors~OlA zll9Q;43KG&BdEycp!R4!*7=9n_r(rgl@)*V3fB72zI3F472L zZZ004Pod#>CR4QkBx_vDK4w-820J-fl#(d#r1T2|Sndj#lJZ-8%qOm=6&39I1t3?( z1pdT=N>ayGqo|MQiGf+RUJy5gZJffZ`8U@g_h+^w~LGzOrH1qT(Xv%uY!je=0(^G2bA#Wp3m@83bp ziabs5$r(t!;Q7stnzw-e`niWn6jl7?n3+7fxw0Soq()j8W%9LqF2z0;fcmNd1u;Le zCY(ndj;};hX8knCPu4M8pMjIuAyikt&OSl4+HQKLL@zR>E5JFHT0ueGWl+xL{G64? 
zmf?U;ZmE%R)du5_u@^evg>hB1+bthXeQh_f=!RCBRNvTc?am#_rKx5{@Y;$?znLC& zz~>JYhZ-{`hECd+xfqL$9quw2?r4ryG0~Lf1wWow#>2&~v914&K<-}>izmNsc6a%t zW=4?zu=T?ezgbk}=zG{ufkAR3E@*mb|K&J_h|jHxGOFa#6IS#b&!V_ed2h?#f2O|l z8#;#PQ!;qCN#`QZ(xkx<5g(K&l4>o*m-?0_p8R0TQM_{(!_0%#w&;A30#2YC=F(Yh zN=9N)E8AjGcmR!pK+hp-$#SrN@T=XEko(F@kkjHTAGi!nS})&qX%{N~ z;%1_(2J`z+%R*?o??tfAl8)Kkgm_Lf_MEJ|y^GMl3jcYWZG&&JdS4CtNx^`bf(yL{ zRFqjM4F^|0lLs3ne3C^^ugIX-|Hz{Nhq&?HoR#c!m0@MHjkqqdQ`kTP*is zV$J+5c@pB~1iUAOR9I;2j~npYBpdtshEqZEDp!;Oj!r#=i0&sBqgvvl!_lNmU-v0_LEe0kd#Y7ejLbeE{d|mtvb^rlh5n}-qx+d+-Gf*_kQB#$cD`)8n z>()ttTrci-|CQ3{SZZk^Zd#KevNHVhuFmu@rtci(I1dr-&uk*^Iccyp4Og4SovSU!e|9u{>$}U(N8=586PiEg{?l8{QHIt4V?rONsG=8SVLs(e zWs6Hc=nQ@$m-4Jq*$5)3_f=sEo^-J43u>k0ks&$k6A;uI*)THgev>dH8X{m=CvH1T zq^a2O^X2x>gl?rvt;gKjE}OgKR-Hn?;5zUu%GQ<7rAaXry1#@>NBeTHf?kzG_Rlih zsuQ|R&tU6^S?vj6ooR>tE2rmK&iP}Hn6t}uDV)Fzxsa?*uHn31GAW{CAgKTC9VKFJ zD`_lCN#1tV+Tjur%A#iQ#bFe5T9NDz0^74ghlY1oRqsov_l^oJ;6)<%F^n(eny-7V zo&1O{bR1|CPJ-6Zrmt`>2vx(qB%qU7Fe3Gw{>%R!2+kL6JtXkM{X1&EiP~pc5unS- z0*hkFH>Ie4>;)2IyhAXgfy6}$vDew%0Ypw=E|Z2wyt@LkofS`+DzRM6qVv_GHyREH zOcaLnPZej=tr}|njaY$famU_0P3?I0byu$$R%bagI&LQ!f;r7v`DV4QS!+kLW?5#E znAeIQm>}hvp|s2j3p>OUmMTCZQ_KBa1HG-Dk|>@tSS*M>eY{W3 zM11dU;?C-3GU31kIj_ZNQT2vwxj#BjQhO(}!L4dO+^G<^Uh<9BI08$)FG;L1AljnDLity3iM!-z$K3jKuYNsPz41cDQ~`k==VznbBhoLHxi?Y z0ZF)L;MG15u1lMrDD3mgw(RBMtz9|{>*iz1KUT?f9k;g_nRsIE-y`KF5fkRE;Q3^? 
z2@w%&^5{wLK`4AuRuG;q^|h4=n^ys$|8dsO6^u|7c_H3m)m^zCL{u`gjG|86l*uyJGZx$E~Dk`ZH-f7j17_HIrUZ`YW z$l18q<&7r82;uXRD+2db4G4p$hjiCDXF~CU5(7M32{HauZ>vvr1>)4iRP%?wWcLX~ z34b7&1e%XM02GaJyKQ?UejzJ06rri2T0f?o)+d{u_H=OVh%1aO z`7LH?#6wsH zm$x$>(j!8l(|1hhe*a?ue6H!xJlaavumk`EL!A_s@&ETzKTVORe=U&hM|~oz`nss| zGs^q=-F+GK0A8@Sh#w$TDD5f#HvH*y`r!+UNnoMW^vjGSCVJ?E`R|R@na5L*8xU{U zulAdLz_@W9Opyn*wDJyQr%~h;fWMwfU-|gDJ+qV<$>3@tr`rHO8l}r-S_iGJ+f+S~ zuZAfR)%*M$vzXa*e2K8p?<}Z95x>oi`!<#cU?oZY89waQ=2uq3dc{%MvaWm;TT!ciER30%vu!|Sko2R2YL6xz3jN+^&V^`Tn9vfToye7SD~u?)6!)O~gP334Ng3)6VKjiOllg()k+tB`@y94C;N5oDbd@wE83W|Xhk3^DI|U); zel)7jz1H%M4|$sYDS2XXQ9g4~Tolj)ry9o6y%O%-M57VZee{v{;jV&ho&z#= z;&Rk8-y8%A@nM)o6+?T-N{aImur88Z2z~{kT4|YSmPDj9KdL1lPt;=IyFD*SmpbdhPD!-RCVb}yY1$lkywzum)Gnh&;fjGZL~5h{SJ+o-B131 z)yfPB{$_tFKFiC?jDw`{={?Vc2Y?YFlu?^$d!TwwYj!xo#nX8}GmK}C;J|}%{`>^d zQMrD9KfqmlYn-zhv>$L{6`+v0HrUiw)vIYoCLsT?)WMYykW2S$Pe1~x#E@7!i#wdL zM1<%~O^343xV-yxo@Hms(h|0y26CSGR*XN-KBIEujK=HOcW;39(Xf9*YWKw`##bg^ zoF?w)+76~x#UEfCipkr64%7?WD*%KHLBeN-b`XMa%gaxx#RhD(xhJ=v6zEl1VfoP% zbNV+X(`w>sY$VR3D6%rlcC*LzP}l`&M2MAeA-mdAN?{{T@816KI7}Z~K!|C;*cL4% zSS|}}*;B92a&u9SB{uK@ZCFqFkD%vBE#JWCS-r_YYwK}1-mh#2tI!8f^z3ToAjtW5aDdh)!=@jY2-5#g?y~ z5t&nP@qn(;y0|QUSDy*Co2*)V)xOq(-uJ~*Vl9vpe?&8H9+KYVJXX$zU$|IbS^*Ju zktV?>Rxg2`J=wl}Px22RFlG_796etsR%6a1spXp^=e7D`sp2ux_0oAPcKDaSAS_U6 zI^IO^`6JKij1m8PHRRtfA=4u|P# zfyfrduSTKFn9WDh+gt1_c^(*)aOaP$CwxFN0^5`rwU>90Jm=aX)1J#?_R)X{eeM8e zjRo(DIFWM($sYO+o$r1>g^HyT4Lb~iFxROPby>1_P?YkXFY*dpBdSoP475vTaL~& zLyVO?j69KxYh9{sxr020@D!tgCb9Fps+RQlzn(kTrfkT1G>W?qo_aGH4PIet+>-}@ zQokMxTtKadr~j(&67C8=MKl2K^Yv)M42+2yHgUn4u`X2hKSlSk)5(XI{qa-hmLC>a z{Q`R~YhGSs0&1efq*toXu`91FtMs)DEo=b)v1rSekde`n5)9oc^E54U+Y%R$VPH0T zE(9`DOXncZ+^5$506;7tJX>MlKe469Rh}4x&a$Msn?${f|3bI*CsU8Gj!(m7YgrMm zL`bUG#l>2eHiCUcTbXSIimR3^4D%?G26-0Y9&I`-6hgosqv zs7+KVN*g^4dtttNG+VmDD%H2&*=5BL=>N(uYnR}Yq?C)GiJ{0xbYpi^UR_$4)khEX zV`KNpbfh@DjlC6Za*KeP`2QN4`#;xD-zPK`9EJx!48I>_J%XPHMo1u_Os;Id81Qo9 
zZUcfFM|`*^-A{i3`+yQO3(O|8#oC~$*-upQ6<&&G8V!8YipqD?8q4fJ8hQQq)OE4- zFBvqh(JV_64O2tFQT_m-C7n5YB^8khj({2;mJdGy) z2<8@RJ6zXRqs<`ItYX{@)L)VjLKUgRM`Qat??bV_4K-QBcd*Eld@s#QGV0M%u$_1> z^wA1+T-vVL*+HfG4S@s*IaGcNkJtr^5dy$ctF^D|b8pVKF+s#H5kPQkyllN zX)d*)3H=MrbOgl$m`^F%2tGh`S^xY;U?pVel9OfjVm~v{j1$p{``qFCnQ(i_3fb%C7flU*R`x<7z59{`a%8Ju zyj@g~P+7ar-Tbi^7PpLL_k`Y`#PlNfq7%Y_P=9>!=PcNVbbTBaxu04afqimgDA6d4 zDP+WUJgFUq>S-9L2nBrR7&HIn@Fu*OfBFNc$7x|M^3)W-hv8oVE?Xp;CP%vjF;cO7 z^kVL~Sn1MTyjZ8uz&x2yd##31vKhe;uw01>*eYxbYnmPF=`ra3B0u7D6Cbo}_>+Qa z1f0yDy@#oB#~0W$mmgGH$W+g#pQaXr<_4&txf{n7B%9yNs%msx|A2`$eHTea&_EcD z#Uh+XEBURLQ-Y!i%W<1HcF~lGDqrZZv#E0^tO%vpd>O8jX45R2tu>8hm+xCdX5KkF zS~E5J2kn9To7zORpNFi;a)JC>2UG?i5G%Kpc4NA;Z? zRlDy^uuIRK8fbdkh#ryv?L5rr_{=c^@xc7!=JVS)ccX=%rDz1xd*S@KC6hyP>T5H!JI{pyF>WhC2MG_>HIEkMTxuyl_x|(5?;fi zq%*4I7)w6o{4|Yge+JCq(nm!NsKRXzaaU`Kp^4Dc5TR(SuRRM8&rd3Uh3%{v?Ou>B zJdYg$(K8W=q+QQDw5b35_;UE>dM`seGjZHs4bf(}bCCY1k`AK%HF?j`bd@P3AD;LG zAN$a)$5rq!ty4?aG!|N37IaXevpo8^mJ=1U>t{&il24fr70e~yfk(^$$dAYFC@W1> zz4??({@gKrn=lxU+`IEXuWdf2LEZq3-S-L_Sk)*tQ)Iu5L6DHZq_s>gGAEXpBTwfD zCgpSA-j*}d-v>7gY8t7i?}5bK^$~=W?`>XjTm_sSf{;g^4pM&mjtBr)pi5C9?w;pp zTS+C*7{+H=d!C8s%k@%)3_Z-3lt|!sEZ^=bD~fMc!%Xv&7FF%c9+t~W2PXUCqvi7H zv=%=CIadw^Zpk_{1ybTxEE@hKSl9f!tn#_w?;D^eTfIM+s3dA`U}7$_W=CP-BFT`}nBeDy%wft~+Rc0b zu&uoePLmOeJ83>yL_M7VQNM