fix: examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu GridDim miscalculated (#2492)

* fix: examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu Launch dimGrid error

* feat: add cta tiler

* Update examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu

use cluster_layout_vmnk instead of cta_tiler

Co-authored-by: Junkai-Wu <junkaiw@nvidia.com>

* feat: remove cta_tiler

---------

Co-authored-by: qinghongzeng <qinghongzeng@deeproute.ai>
Co-authored-by: Junkai-Wu <junkaiw@nvidia.com>
This commit is contained in:
xiangjiaojun
2025-07-31 10:11:04 +08:00
committed by GitHub
parent e093b4f691
commit 84a27b3926

View File

@ -573,8 +573,8 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
dim3 dimBlock(128);
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), size<2>(cluster_shape));
dim3 dimGrid(round_up(size(ceil_div(Gemm_M, bM)), dimCluster.x),
round_up(size(ceil_div(Gemm_N, bN)), dimCluster.y));
dim3 dimGrid(size(ceil_div(Gemm_M, bM * size<1>(cluster_layout_vmnk))) * dimCluster.x,
size(ceil_div(Gemm_N, bN * size<2>(cluster_layout_vmnk))) * dimCluster.y);
int smemBytes = sizeof(SMEMStorage);
auto* kernel_ptr = &gemm_device<SMEMStorage,