v4.1 release update v2. (#2481)
This commit is contained in:
@@ -259,7 +259,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
|
||||
|
||||
// Step 2: The Mainloop.
|
||||
|
||||
// Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
// Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
|
||||
|
||||
// Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
|
||||
@@ -394,7 +394,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
|
||||
// In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
|
||||
// Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
|
||||
// and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
|
||||
// The MMA's partitioning then yeilds the CTA-local work.
|
||||
// The MMA's partitioning then yields the CTA-local work.
|
||||
|
||||
if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
|
||||
std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;
|
||||
|
||||
@@ -295,7 +295,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
|
||||
|
||||
// Step 2: The Mainloop.
|
||||
|
||||
// Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
// Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
|
||||
|
||||
// Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
|
||||
@@ -433,7 +433,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
|
||||
// In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
|
||||
// Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
|
||||
// and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
|
||||
// The MMA's partitioning then yeilds the CTA-local work.
|
||||
// The MMA's partitioning then yields the CTA-local work.
|
||||
|
||||
if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
|
||||
std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;
|
||||
|
||||
@@ -333,7 +333,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
|
||||
|
||||
// Step 2: The Mainloop.
|
||||
|
||||
// Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
// Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
|
||||
|
||||
// Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
|
||||
@@ -471,7 +471,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
|
||||
// In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
|
||||
// Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
|
||||
// and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
|
||||
// The MMA's partitioning then yeilds the CTA-local work.
|
||||
// The MMA's partitioning then yields the CTA-local work.
|
||||
|
||||
if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
|
||||
std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;
|
||||
|
||||
@@ -328,7 +328,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
|
||||
|
||||
// Step 2: The Mainloop.
|
||||
|
||||
// Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
// Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
|
||||
|
||||
// Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
|
||||
@@ -473,7 +473,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
|
||||
// In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
|
||||
// Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
|
||||
// and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
|
||||
// The MMA's partitioning then yeilds the CTA-local work.
|
||||
// The MMA's partitioning then yields the CTA-local work.
|
||||
|
||||
if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
|
||||
std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;
|
||||
|
||||
@@ -341,7 +341,7 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K)
|
||||
|
||||
// Step 2: The Mainloop.
|
||||
|
||||
// Set mma accumlate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
// Set mma accumulate option to zero so that the first MMA instruction will clear the TMEM accumulator.
|
||||
tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
|
||||
|
||||
// Execute a MmaTile_M x MmaTile_N x GEMM_K GEMM
|
||||
@@ -527,7 +527,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
|
||||
// In SM100, the MMAs are Cluster-local and perform CTA-level partitioning.
|
||||
// Thus, SM90 uses a cta_tiler to extract portions of the Problem for the CTA
|
||||
// and SM100 uses a mma_tiler to extract portions of the Problem for the MMA.
|
||||
// The MMA's partitioning then yeilds the CTA-local work.
|
||||
// The MMA's partitioning then yields the CTA-local work.
|
||||
|
||||
if (not evenly_divides(shape(mma_tiler), tile_shape(tiled_mma))) {
|
||||
std::cerr << "The MMA Shape should evenly divide the MMA Tiler." << std::endl;
|
||||
|
||||
@@ -200,7 +200,7 @@ int main(int argc, char** argv)
|
||||
|
||||
// Construct tiled copy, a tiling of copy atoms.
|
||||
//
|
||||
// Note, this assumes the vector and thread layouts are aligned with contigous data
|
||||
// Note, this assumes the vector and thread layouts are aligned with contiguous data
|
||||
// in GMEM. Alternative thread layouts are possible but may result in uncoalesced
|
||||
// reads. Alternative value layouts are also possible, though incompatible layouts
|
||||
// will result in compile time errors.
|
||||
|
||||
Reference in New Issue
Block a user