v4.1 release update v2. (#2481)
This commit is contained in:
@ -259,7 +259,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
|
||||
|
||||
// Step 2: The Mainloop.
|
||||
|
||||
// Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
// Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
|
||||
|
||||
// Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
|
||||
@ -394,7 +394,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
|
||||
// In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
|
||||
// Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
|
||||
// and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
|
||||
// The MMA's partitioning then yeilds the CTA-local work.
|
||||
// The MMA's partitioning then yields the CTA-local work.
|
||||
|
||||
if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
|
||||
std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;
|
||||
|
||||
@ -295,7 +295,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
|
||||
|
||||
// Step 2: The Mainloop.
|
||||
|
||||
// Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
// Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
|
||||
|
||||
// Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
|
||||
@ -433,7 +433,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
|
||||
// In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
|
||||
// Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
|
||||
// and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
|
||||
// The MMA's partitioning then yeilds the CTA-local work.
|
||||
// The MMA's partitioning then yields the CTA-local work.
|
||||
|
||||
if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
|
||||
std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;
|
||||
|
||||
@ -333,7 +333,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
|
||||
|
||||
// Step 2: The Mainloop.
|
||||
|
||||
// Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
// Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
|
||||
|
||||
// Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
|
||||
@ -471,7 +471,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
|
||||
// In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
|
||||
// Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
|
||||
// and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
|
||||
// The MMA's partitioning then yeilds the CTA-local work.
|
||||
// The MMA's partitioning then yields the CTA-local work.
|
||||
|
||||
if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
|
||||
std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;
|
||||
|
||||
@ -328,7 +328,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
|
||||
|
||||
// Step 2: The Mainloop.
|
||||
|
||||
// Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
// Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
|
||||
|
||||
// Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
|
||||
@ -473,7 +473,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
|
||||
// In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
|
||||
// Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
|
||||
// and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
|
||||
// The MMA's partitioning then yeilds the CTA-local work.
|
||||
// The MMA's partitioning then yields the CTA-local work.
|
||||
|
||||
if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
|
||||
std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;
|
||||
|
||||
@ -341,7 +341,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
|
||||
|
||||
// Step 2: The Mainloop.
|
||||
|
||||
// Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
// Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
|
||||
|
||||
// Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
|
||||
@ -527,7 +527,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
|
||||
// In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
|
||||
// Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
|
||||
// and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
|
||||
// The MMA's partitioning then yeilds the CTA-local work.
|
||||
// The MMA's partitioning then yields the CTA-local work.
|
||||
|
||||
if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
|
||||
std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;
|
||||
|
||||
Reference in New Issue
Block a user