v4.2 release. (#2587)

* Fix default cluster callback values to 1 to avoid profiler failure when these values are not set in command line.

* v4.2 release.
This commit is contained in:
Junkai-Wu
2025-08-23 06:11:24 +08:00
committed by GitHub
parent 11cad1f67b
commit a49a78ffef
351 changed files with 28182 additions and 2032 deletions

View File

@ -452,8 +452,8 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
dim3 dimBlock(128);
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), size<2>(cluster_shape));
dim3 dimGrid(round_up(size(ceil_div(Gemm_M, bM)), dimCluster.x),
round_up(size(ceil_div(Gemm_N, bN)), dimCluster.y));
dim3 dimGrid(size(ceil_div(Gemm_M, bM * size<1>(cluster_layout_vmnk))) * dimCluster.x,
size(ceil_div(Gemm_N, bN * size<2>(cluster_layout_vmnk))) * dimCluster.y);
int smemBytes = sizeof(SMEMStorage);
auto* kernel_ptr = &gemm_device<SMEMStorage,

View File

@ -528,8 +528,8 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
dim3 dimBlock(128);
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), size<2>(cluster_shape));
dim3 dimGrid(round_up(size(ceil_div(Gemm_M, bM)), dimCluster.x),
round_up(size(ceil_div(Gemm_N, bN)), dimCluster.y));
dim3 dimGrid(size(ceil_div(Gemm_M, bM * size<1>(cluster_layout_vmnk))) * dimCluster.x,
size(ceil_div(Gemm_N, bN * size<2>(cluster_layout_vmnk))) * dimCluster.y);
int smemBytes = sizeof(SMEMStorage);
auto* kernel_ptr = &gemm_device<SMEMStorage,

View File

@ -567,8 +567,8 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
dim3 dimBlock(128);
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), size<2>(cluster_shape));
dim3 dimGrid(round_up(size(ceil_div(Gemm_M, bM)), dimCluster.x),
round_up(size(ceil_div(Gemm_N, bN)), dimCluster.y));
dim3 dimGrid(size(ceil_div(Gemm_M, bM * size<1>(cluster_layout_vmnk))) * dimCluster.x,
size(ceil_div(Gemm_N, bN * size<2>(cluster_layout_vmnk))) * dimCluster.y);
int smemBytes = sizeof(SMEMStorage);
auto* kernel_ptr = &gemm_device<SMEMStorage,

View File

@ -575,6 +575,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), size<2>(cluster_shape));
dim3 dimGrid(size(ceil_div(Gemm_M, bM * size<1>(cluster_layout_vmnk))) * dimCluster.x,
size(ceil_div(Gemm_N, bN * size<2>(cluster_layout_vmnk))) * dimCluster.y);
int smemBytes = sizeof(SMEMStorage);
auto* kernel_ptr = &gemm_device<SMEMStorage,

View File

@ -681,8 +681,8 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
dim3 dimBlock(128);
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), size<2>(cluster_shape));
dim3 dimGrid(round_up(size(ceil_div(Gemm_M, bM)), dimCluster.x),
round_up(size(ceil_div(Gemm_N, bN)), dimCluster.y));
dim3 dimGrid(size(ceil_div(Gemm_M, bM * size<1>(cluster_layout_vmnk))) * dimCluster.x,
size(ceil_div(Gemm_N, bN * size<2>(cluster_layout_vmnk))) * dimCluster.y);
int smemBytes = sizeof(SMEMStorage);
auto* kernel_ptr = &gemm_device<SMEMStorage,