v4.1 release update v2. (#2481)

This commit is contained in:
Junkai-Wu
2025-07-22 10:03:55 +08:00
committed by GitHub
parent 9baa06dd57
commit fd6cfe1ed0
179 changed files with 7878 additions and 1286 deletions

View File

@@ -259,7 +259,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
// Step 2: The Mainloop.
- // Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
+ // Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
// Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
@@ -394,7 +394,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
// In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
// Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
// and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
- // The MMA's partitioning then yeilds the CTA-local work.
+ // The MMA's partitioning then yields the CTA-local work.
if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;

View File

@@ -295,7 +295,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
// Step 2: The Mainloop.
- // Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
+ // Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
// Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
@@ -433,7 +433,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
// In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
// Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
// and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
- // The MMA's partitioning then yeilds the CTA-local work.
+ // The MMA's partitioning then yields the CTA-local work.
if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;

View File

@@ -333,7 +333,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
// Step 2: The Mainloop.
- // Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
+ // Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
// Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
@@ -471,7 +471,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
// In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
// Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
// and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
- // The MMA's partitioning then yeilds the CTA-local work.
+ // The MMA's partitioning then yields the CTA-local work.
if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;

View File

@@ -328,7 +328,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
// Step 2: The Mainloop.
- // Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
+ // Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
// Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
@@ -473,7 +473,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
// In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
// Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
// and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
- // The MMA's partitioning then yeilds the CTA-local work.
+ // The MMA's partitioning then yields the CTA-local work.
if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;

View File

@@ -341,7 +341,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
// Step 2: The Mainloop.
- // Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
+ // Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
// Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
@@ -527,7 +527,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
// In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
// Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
// and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
- // The MMA's partitioning then yeilds the CTA-local work.
+ // The MMA's partitioning then yields the CTA-local work.
if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;

View File

@@ -200,7 +200,7 @@ int main(int argc, char** argv)
// Construct tiled copy, a tiling of copy atoms.
//
- // Note, this assumes the vector and thread layouts are aligned with contigous data
+ // Note, this assumes the vector and thread layouts are aligned with contiguous data
// in GMEM. Alternative thread layouts are possible but may result in uncoalesced
// reads. Alternative value layouts are also possible, though incompatible layouts
// will result in compile time errors.