v4.1 release update v2. (#2481)

This commit is contained in:
Junkai-Wu
2025-07-22 10:03:55 +08:00
committed by GitHub
parent 9baa06dd57
commit fd6cfe1ed0
179 changed files with 7878 additions and 1286 deletions

View File

@ -606,10 +606,46 @@ void initialize_blockwise_gemm_reference_operations_given_C_and_D(Manifest &mani
float_e4m3_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*D*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 1, 128, 128);
// A = float_e4m3_t, B = float_e4m3_t; float scale factors (SFA/SFB), float compute and accumulate.
// One make_blockwise_gemm call per integer triple; only the first two integers vary, the third is always 128.
// NOTE(review): the three integers are presumably scale-factor vector sizes along M/N/K --
// confirm against make_blockwise_gemm's signature, which is not visible in this hunk.
make_blockwise_gemm<
float_e4m3_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 128, 1, 128);
make_blockwise_gemm<
float_e4m3_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 128, 128, 128);
make_blockwise_gemm<
float_e4m3_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 64, 1, 128);
make_blockwise_gemm<
float_e4m3_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 64, 128, 128);
make_blockwise_gemm<
float_e4m3_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 128, 32, 128);
make_blockwise_gemm<
float_e4m3_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 1, 32, 128);
make_blockwise_gemm<
float_e4m3_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 128, 64, 128);
make_blockwise_gemm<
float_e4m3_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 1, 64, 128);
make_blockwise_gemm<
float_e4m3_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 128, 256, 128);
make_blockwise_gemm<
float_e4m3_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 1, 256, 128);
make_blockwise_gemm<
@ -620,10 +656,46 @@ void initialize_blockwise_gemm_reference_operations_given_C_and_D(Manifest &mani
float_e4m3_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*D*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 1, 128, 128);
// A = float_e4m3_t, B = float_e5m2_t; float scale factors (SFA/SFB), float compute and accumulate.
// One make_blockwise_gemm call per integer triple; only the first two integers vary, the third is always 128.
// NOTE(review): the three integers are presumably scale-factor vector sizes along M/N/K --
// confirm against make_blockwise_gemm's signature, which is not visible in this hunk.
make_blockwise_gemm<
float_e4m3_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 128, 1, 128);
make_blockwise_gemm<
float_e4m3_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 128, 128, 128);
make_blockwise_gemm<
float_e4m3_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 64, 1 , 128);
make_blockwise_gemm<
float_e4m3_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 64, 128, 128);
make_blockwise_gemm<
float_e4m3_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 128, 32, 128);
make_blockwise_gemm<
float_e4m3_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 1, 32, 128);
make_blockwise_gemm<
float_e4m3_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 128, 64, 128);
make_blockwise_gemm<
float_e4m3_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 1, 64, 128);
make_blockwise_gemm<
float_e4m3_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 128, 256, 128);
make_blockwise_gemm<
float_e4m3_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 1, 256, 128);
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
@ -633,11 +705,46 @@ void initialize_blockwise_gemm_reference_operations_given_C_and_D(Manifest &mani
float_e5m2_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*D*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 1, 128, 128);
// A = float_e5m2_t, B = float_e4m3_t; float scale factors (SFA/SFB), float compute and accumulate.
// One make_blockwise_gemm call per integer triple; only the first two integers vary, the third is always 128.
// NOTE(review): the three integers are presumably scale-factor vector sizes along M/N/K --
// confirm against make_blockwise_gemm's signature, which is not visible in this hunk.
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 128, 1, 128);
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 128, 128, 128);
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 64, 1, 128);
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 64, 128, 128);
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 128, 32, 128);
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 1, 32, 128);
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 128, 64, 128);
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 1, 64, 128);
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 128, 256, 128);
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e4m3_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 1, 256, 128);
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
@ -647,10 +754,46 @@ void initialize_blockwise_gemm_reference_operations_given_C_and_D(Manifest &mani
float_e5m2_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*D*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 1, 128, 128);
// A = float_e5m2_t, B = float_e5m2_t; float scale factors (SFA/SFB), float compute and accumulate.
// One make_blockwise_gemm call per integer triple; only the first two integers vary, the third is always 128.
// NOTE(review): the three integers are presumably scale-factor vector sizes along M/N/K --
// confirm against make_blockwise_gemm's signature, which is not visible in this hunk.
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 128, 1, 128);
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 128, 128, 128);
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 64, 1 , 128);
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 64, 128, 128);
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 128, 32, 128);
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 1, 32, 128);
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 128, 64, 128);
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 1, 64, 128);
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 128, 256, 128);
make_blockwise_gemm<
float_e5m2_t /*A*/, float /*SFA*/, float_e5m2_t /*B*/, float /*SFB*/,
ElementC /*C*/, float /*Compute*/, float /*Accum*/, ElementD /*D*/
>(manifest, 1, 256, 128);
}