CUTLASS 2.3 adds GEMMs targeting Sparse Tensor Cores on the NVIDIA Ampere Architecture, fast SGEMM, and small matrix classes, bug fixes, and performance enhancements.
938 lines
23 KiB
C++
938 lines
23 KiB
C++
/***************************************************************************************************
|
|
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
|
* provided that the following conditions are met:
|
|
* * Redistributions of source code must retain the above copyright notice, this list of
|
|
* conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
|
* conditions and the following disclaimer in the documentation and/or other materials
|
|
* provided with the distribution.
|
|
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
|
* to endorse or promote products derived from this software without specific prior written
|
|
* permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
|
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
|
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
|
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
|
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
**************************************************************************************************/
|
|
/*! \file
|
|
\brief Defines layout functions used by TensorRef and derived classes.
|
|
|
|
Layout functions map logical coordinates to linear memory. They often require additional
|
|
data to describe strides between elements.
|
|
|
|
Layout functions must implement all members in the public interface of IdentityTensorLayout<>
|
|
defined in cutlass/tensor_ref.h.
|
|
*/
|
|
#pragma once
|
|
|
|
#include "cutlass/cutlass.h"
|
|
#include "cutlass/matrix_coord.h"
|
|
|
|
namespace cutlass {
|
|
namespace layout {
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
//
|
|
// Defines data layouts of various matrix formats usable by TensorRef and other classes.
|
|
//
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Mapping function for row-major matrices.
|
|
class RowMajor {
|
|
public:
|
|
/// Logical rank of tensor
|
|
static int const kRank = 2;
|
|
|
|
/// Rank of stride vector
|
|
static int const kStrideRank = 1;
|
|
|
|
/// Index type used for coordinates
|
|
using Index = int32_t;
|
|
|
|
/// Long index type used for offsets
|
|
using LongIndex = int64_t;
|
|
|
|
/// Logical coordinate
|
|
using TensorCoord = MatrixCoord;
|
|
|
|
/// Stride vector
|
|
using Stride = Coord<kStrideRank, Index>;
|
|
|
|
private:
|
|
//
|
|
// Data members
|
|
//
|
|
|
|
/// Stride data member
|
|
Stride stride_;
|
|
|
|
public:
|
|
//
|
|
// Methods
|
|
//
|
|
|
|
/// Constructor
|
|
CUTLASS_HOST_DEVICE
|
|
RowMajor(Index ldm = 0): stride_(ldm) { }
|
|
|
|
/// Ctor
|
|
CUTLASS_HOST_DEVICE
|
|
RowMajor(Stride stride): stride_(stride) { }
|
|
|
|
/// Helper returns a layout to a tightly packed tensor
|
|
CUTLASS_HOST_DEVICE
|
|
static RowMajor packed(MatrixCoord const &extent) {
|
|
return RowMajor(extent.column());
|
|
}
|
|
|
|
/// Returns the offset of a coordinate in linear memory.
|
|
/// Assumes coordinate has convention (row, column)
|
|
CUTLASS_HOST_DEVICE
|
|
LongIndex operator()(MatrixCoord const &coord) const {
|
|
return LongIndex(coord.row()) * LongIndex(stride_[0]) + coord.column();
|
|
}
|
|
|
|
/// Inverse of layout function, mapping linear offset to logical coordinate
|
|
CUTLASS_HOST_DEVICE
|
|
MatrixCoord inverse(LongIndex offset) const {
|
|
return MatrixCoord(Index(offset / stride_[0]), Index(offset % stride_[0]));
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Stride stride() const {
|
|
return stride_;
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Stride & stride() {
|
|
return stride_;
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Index stride(int idx) const {
|
|
return stride_[idx];
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Index & stride(int idx) {
|
|
return stride_[idx];
|
|
}
|
|
|
|
/// Compute the number of contiguous elements needed to store a tensor with the given size
|
|
CUTLASS_HOST_DEVICE
|
|
LongIndex capacity(MatrixCoord const &extent) const {
|
|
return LongIndex(extent.row()) * LongIndex(stride_[0]);
|
|
}
|
|
};
|
|
|
|
/// Mapping function for column-major matrices.
|
|
class ColumnMajor {
|
|
public:
|
|
/// Logical rank of tensor
|
|
static int const kRank = 2;
|
|
|
|
/// Rank of stride vector
|
|
static int const kStrideRank = 1;
|
|
|
|
/// Index type used for coordinates
|
|
using Index = int32_t;
|
|
|
|
/// Long index type used for offsets
|
|
using LongIndex = int64_t;
|
|
|
|
/// Logical coordinate
|
|
using TensorCoord = MatrixCoord;
|
|
|
|
/// Stride vector
|
|
using Stride = Coord<kStrideRank, Index>;
|
|
|
|
private:
|
|
//
|
|
// Data members
|
|
//
|
|
|
|
/// Stride data member
|
|
Stride stride_;
|
|
|
|
public:
|
|
//
|
|
// Methods
|
|
//
|
|
|
|
/// Ctor
|
|
CUTLASS_HOST_DEVICE
|
|
ColumnMajor(Index ldm = 0): stride_(ldm) { }
|
|
|
|
/// Ctor
|
|
CUTLASS_HOST_DEVICE
|
|
ColumnMajor(Stride stride): stride_(stride) { }
|
|
|
|
|
|
/// Helper returns a layout to a tightly packed tensor
|
|
CUTLASS_HOST_DEVICE
|
|
static ColumnMajor packed(MatrixCoord const &extent) {
|
|
return ColumnMajor(extent.row());
|
|
}
|
|
|
|
/// Returns the offset of a coordinate in linear memory.
|
|
/// Assumes coordinate has convention (row, column)
|
|
CUTLASS_HOST_DEVICE
|
|
LongIndex operator()(MatrixCoord const &coord) const {
|
|
return LongIndex(coord.column()) * LongIndex(stride_[0]) + coord.row();
|
|
}
|
|
|
|
/// Inverse of layout function, mapping linear offset to logical coordinate
|
|
CUTLASS_HOST_DEVICE
|
|
MatrixCoord inverse(LongIndex offset) const {
|
|
return MatrixCoord(Index(offset % stride_[0]), Index(offset / stride_[0]));
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Stride stride() const {
|
|
return stride_;
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Stride & stride() {
|
|
return stride_;
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Index stride(int idx) const {
|
|
return stride_[idx];
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Index & stride(int idx) {
|
|
return stride_[idx];
|
|
}
|
|
|
|
/// Compute the number of contiguous elements needed to store a tensor with the given size
|
|
CUTLASS_HOST_DEVICE
|
|
LongIndex capacity(MatrixCoord const &extent) const {
|
|
return LongIndex(extent.column()) * LongIndex(stride_[0]);
|
|
}
|
|
};
|
|
|
|
/// Mapping function for interleaved matrices. Matrix is structured
|
|
/// as row-major arrangement of fixed-size columns.
|
|
template <int Interleave>
|
|
struct RowMajorInterleaved {
|
|
|
|
/// Logical rank of tensor
|
|
static int const kRank = 2;
|
|
|
|
/// Rank of stride vector
|
|
static int const kStrideRank = 1;
|
|
|
|
/// Index type used for coordinates
|
|
using Index = int32_t;
|
|
|
|
/// Long index type used for offsets
|
|
using LongIndex = int64_t;
|
|
|
|
/// Logical coordinate
|
|
using TensorCoord = MatrixCoord;
|
|
|
|
/// Stride vector
|
|
using Stride = Coord<kStrideRank, Index>;
|
|
|
|
/// Size of interleaved columns
|
|
static int const kInterleave = Interleave;
|
|
|
|
private:
|
|
//
|
|
// Data members
|
|
//
|
|
|
|
/// Stride data member
|
|
Stride stride_;
|
|
|
|
public:
|
|
//
|
|
// Methods
|
|
//
|
|
|
|
/// Ctor
|
|
CUTLASS_HOST_DEVICE
|
|
RowMajorInterleaved(Index ldm = 0): stride_(ldm) { }
|
|
|
|
/// Ctor
|
|
CUTLASS_HOST_DEVICE
|
|
RowMajorInterleaved(Stride stride): stride_(stride) { }
|
|
|
|
/// Helper returns a layout to a tightly packed tensor
|
|
CUTLASS_HOST_DEVICE
|
|
static RowMajorInterleaved packed(MatrixCoord const &extent) {
|
|
return RowMajorInterleaved(extent.column() * kInterleave);
|
|
}
|
|
|
|
/// Returns the offset of a coordinate in linear memory.
|
|
/// Assumes coordinate has convention (row, column)
|
|
CUTLASS_HOST_DEVICE
|
|
LongIndex operator()(MatrixCoord const &coord) const {
|
|
Index row_major = coord.row() / kInterleave;
|
|
Index row_minor = coord.row() % kInterleave;
|
|
return LongIndex(row_major) * LongIndex(stride_[0]) + LongIndex(coord.column()) * kInterleave + row_minor;
|
|
}
|
|
|
|
/// Inverse of layout function, mapping linear offset to logical coordinate
|
|
CUTLASS_HOST_DEVICE
|
|
MatrixCoord inverse(LongIndex offset) const {
|
|
|
|
Index row_major = Index(offset / stride_[0]);
|
|
Index residual = Index(offset % stride_[0]);
|
|
|
|
Index column = residual / kInterleave;
|
|
Index row_minor = residual % kInterleave;
|
|
|
|
return MatrixCoord(row_major * kInterleave + row_minor, column);
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Stride stride() const {
|
|
return stride_;
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Stride & stride() {
|
|
return stride_;
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Index stride(int idx) const {
|
|
return stride_[idx];
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Index & stride(int idx) {
|
|
return stride_[idx];
|
|
}
|
|
|
|
/// Compute the number of contiguous elements needed to store a tensor with the given size
|
|
CUTLASS_HOST_DEVICE
|
|
LongIndex capacity(MatrixCoord const &extent) const {
|
|
return (extent.row() + kInterleave - 1) / kInterleave * stride_[0];
|
|
}
|
|
};
|
|
|
|
/// Mapping function for interleaved matrices. Matrix is structured
|
|
/// as column-major arrangement of fixed-size rows.
|
|
template <int Interleave>
|
|
struct ColumnMajorInterleaved {
|
|
|
|
/// Logical rank of tensor
|
|
static int const kRank = 2;
|
|
|
|
/// Rank of stride vector
|
|
static int const kStrideRank = 1;
|
|
|
|
/// Index type used for coordinates
|
|
using Index = int32_t;
|
|
|
|
/// Long index type used for offsets
|
|
using LongIndex = int64_t;
|
|
|
|
/// Logical coordinate
|
|
using TensorCoord = MatrixCoord;
|
|
|
|
/// Stride vector
|
|
using Stride = Coord<kStrideRank, Index>;
|
|
|
|
/// Size of interleaved columns
|
|
static int const kInterleave = Interleave;
|
|
|
|
private:
|
|
//
|
|
// Data members
|
|
//
|
|
|
|
/// Stride data member
|
|
Stride stride_;
|
|
|
|
public:
|
|
//
|
|
// Methods
|
|
//
|
|
|
|
/// Ctor
|
|
CUTLASS_HOST_DEVICE
|
|
ColumnMajorInterleaved(Index ldm = 0): stride_(ldm) { }
|
|
|
|
/// Ctor
|
|
CUTLASS_HOST_DEVICE
|
|
ColumnMajorInterleaved(Stride stride): stride_(stride) { }
|
|
|
|
|
|
/// Helper returns a layout to a tightly packed tensor
|
|
CUTLASS_HOST_DEVICE
|
|
static ColumnMajorInterleaved packed(MatrixCoord const &extent) {
|
|
return ColumnMajorInterleaved(extent.row() * kInterleave);
|
|
}
|
|
|
|
/// Returns the offset of a coordinate in linear memory.
|
|
/// Assumes coordinate has convention (row, column)
|
|
CUTLASS_HOST_DEVICE
|
|
LongIndex operator()(MatrixCoord const &coord) const {
|
|
Index column_major = coord.column() / kInterleave;
|
|
Index column_minor = coord.column() % kInterleave;
|
|
return LongIndex(column_major) * LongIndex(stride_[0]) + LongIndex(coord.row()) * kInterleave + column_minor;
|
|
}
|
|
|
|
/// Inverse of layout function, mapping linear offset to logical coordinate
|
|
CUTLASS_HOST_DEVICE
|
|
MatrixCoord inverse(LongIndex offset) const {
|
|
|
|
Index column_major = Index(offset / stride_[0]);
|
|
Index residual = Index(offset % stride_[0]);
|
|
|
|
Index row = residual / kInterleave;
|
|
Index column_minor = residual % kInterleave;
|
|
|
|
return MatrixCoord(row, column_major * kInterleave + column_minor);
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Stride stride() const {
|
|
return stride_;
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Stride & stride() {
|
|
return stride_;
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Index stride(int idx) const {
|
|
return stride_[idx];
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Index & stride(int idx) {
|
|
return stride_[idx];
|
|
}
|
|
|
|
/// Compute the number of contiguous elements needed to store a tensor with the given size
|
|
CUTLASS_HOST_DEVICE
|
|
LongIndex capacity(MatrixCoord const &extent) const {
|
|
return (extent.column() + kInterleave - 1) / kInterleave * stride_[0];
|
|
}
|
|
};
|
|
|
|
/// Enumerated type for canonical pitch-linear matrix layouts
|
|
enum class Matrix {
|
|
kColumnMajor, ///< leading dimension refers to stride between columns; stride along rows is 1
|
|
kRowMajor ///< leading dimension refers to stride between rows; stride along columns is 1
|
|
};
|
|
|
|
/// Mapping function for scenario in which layout is row-major or column-major but this information
|
|
/// is only available at runtime.
|
|
struct ContiguousMatrix {
|
|
|
|
/// Logical rank of tensor
|
|
static int const kRank = 2;
|
|
|
|
/// Rank of stride vector
|
|
static int const kStrideRank = 1;
|
|
|
|
/// Index type used for coordinates
|
|
using Index = int32_t;
|
|
|
|
/// Long index type used for offsets
|
|
using LongIndex = int64_t;
|
|
|
|
/// Logical coordinate
|
|
using TensorCoord = MatrixCoord;
|
|
|
|
/// Stride vector
|
|
using Stride = Coord<kStrideRank, Index>;
|
|
|
|
private:
|
|
//
|
|
// Data members
|
|
//
|
|
|
|
/// Stride data member
|
|
Stride stride_;
|
|
|
|
/// Enumerated type indicating canonical matrix layout
|
|
Matrix layout_;
|
|
|
|
public:
|
|
//
|
|
// Methods
|
|
//
|
|
|
|
/// Ctor
|
|
CUTLASS_HOST_DEVICE
|
|
ContiguousMatrix(
|
|
Index ldm = 0,
|
|
Matrix layout = Matrix::kColumnMajor
|
|
):
|
|
stride_(ldm), layout_(layout) { }
|
|
|
|
/// Helper returns a layout to a tightly packed tensor
|
|
CUTLASS_HOST_DEVICE
|
|
static ContiguousMatrix packed(
|
|
MatrixCoord const &extent,
|
|
Matrix layout = Matrix::kColumnMajor) {
|
|
|
|
Index ldm = 0;
|
|
if (layout == Matrix::kColumnMajor) {
|
|
ldm = extent.row();
|
|
}
|
|
else if (layout == Matrix::kRowMajor) {
|
|
ldm = extent.column();
|
|
}
|
|
return ContiguousMatrix(ldm, layout);
|
|
}
|
|
|
|
/// Returns the offset of a coordinate in linear memory.
|
|
/// Assumes coordinate has convention (row, column)
|
|
CUTLASS_HOST_DEVICE
|
|
LongIndex operator()(MatrixCoord const &coord) const {
|
|
if (layout_ == Matrix::kColumnMajor) {
|
|
return coord.row() + coord.column() * stride_[0];
|
|
}
|
|
else if (layout_ == Matrix::kRowMajor) {
|
|
return coord.row() * stride_[0] + coord.column();
|
|
}
|
|
else {
|
|
// degenerate case
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
/// Inverse of layout function, mapping linear offset to logical coordinate
|
|
CUTLASS_HOST_DEVICE
|
|
MatrixCoord inverse(LongIndex offset) const {
|
|
// TODO
|
|
return MatrixCoord(0, 0);
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Stride stride() const {
|
|
return stride_;
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Stride & stride() {
|
|
return stride_;
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Index stride(int idx) const {
|
|
return stride_[idx];
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Index & stride(int idx) {
|
|
return stride_[idx];
|
|
}
|
|
|
|
/// Compute the number of contiguous elements needed to store a tensor with the given size
|
|
CUTLASS_HOST_DEVICE
|
|
LongIndex capacity(MatrixCoord const &extent) const {
|
|
if (layout_ == Matrix::kColumnMajor) {
|
|
return stride_[0] * extent.column();
|
|
}
|
|
else if (layout_ == Matrix::kRowMajor) {
|
|
return stride_[0] * extent.row();
|
|
}
|
|
else {
|
|
// degenerate case
|
|
return 0;
|
|
}
|
|
}
|
|
};
|
|
|
|
/// Mapping function for block-linear matrices. Matrix is structured
|
|
/// as column-major arrangement of 2D tiles (that are column-major).
|
|
template <int BlockRows, int BlockColumns>
|
|
struct ColumnMajorBlockLinear {
|
|
/// Logical rank of tensor
|
|
static int const kRank = 2;
|
|
|
|
/// Rank of stride vector
|
|
static int const kStrideRank = 1;
|
|
|
|
/// Index type used for coordinates
|
|
using Index = int32_t;
|
|
|
|
/// Long index type used for offsets
|
|
using LongIndex = int64_t;
|
|
|
|
/// Logical coordinate
|
|
using TensorCoord = MatrixCoord;
|
|
|
|
/// Stride vector
|
|
using Stride = Coord<kStrideRank, Index>;
|
|
|
|
/// Size of a block in rows
|
|
static int const kBlockRows = BlockRows;
|
|
|
|
/// Size of a block in columns
|
|
static int const kBlockColumns = BlockColumns;
|
|
|
|
private:
|
|
//
|
|
// Data members
|
|
//
|
|
|
|
/// Stride data member
|
|
Stride stride_;
|
|
|
|
public:
|
|
//
|
|
// Methods
|
|
//
|
|
|
|
/// Ctor
|
|
CUTLASS_HOST_DEVICE
|
|
ColumnMajorBlockLinear(Index ldm = 0): stride_(ldm) { }
|
|
|
|
/// Helper returns a layout to a tightly packed tensor
|
|
CUTLASS_HOST_DEVICE
|
|
static ColumnMajorBlockLinear packed(MatrixCoord const &extent) {
|
|
return ColumnMajorBlockLinear(extent.row() * kBlockRows * kBlockColumns);
|
|
}
|
|
|
|
/// Returns the offset of a coordinate in linear memory.
|
|
/// Assumes coordinate has convention (row, column)
|
|
CUTLASS_HOST_DEVICE
|
|
LongIndex operator()(MatrixCoord const &coord) const {
|
|
return
|
|
(coord.row() % kBlockRows) +
|
|
(coord.column() % kBlockColumns) * kBlockRows +
|
|
(coord.row() / kBlockRows) * kBlockRows * kBlockColumns +
|
|
(coord.column() / kBlockColumns) * stride_[0];
|
|
}
|
|
|
|
/// Inverse of layout function, mapping linear offset to logical coordinate
|
|
CUTLASS_HOST_DEVICE
|
|
MatrixCoord inverse(LongIndex offset) const {
|
|
|
|
// TODO
|
|
return MatrixCoord(0, 0);
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Stride stride() const {
|
|
return stride_;
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Stride & stride() {
|
|
return stride_;
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Index stride(int idx) const {
|
|
return stride_[idx];
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Index & stride(int idx) {
|
|
return stride_[idx];
|
|
}
|
|
|
|
/// Compute the number of contiguous elements needed to store a tensor with the given size
|
|
CUTLASS_HOST_DEVICE
|
|
LongIndex capacity(MatrixCoord const &extent) const {
|
|
return (extent.column() + kBlockColumns - 1) / kBlockColumns * stride_[0];
|
|
}
|
|
};
|
|
|
|
/// Mapping function for block-linear matrices. Matrix is structured
|
|
/// as row-major arrangement of 2D tiles (that are row-major)
|
|
template <int BlockRows, int BlockColumns>
|
|
struct RowMajorBlockLinear {
|
|
/// Logical rank of tensor
|
|
static int const kRank = 2;
|
|
|
|
/// Rank of stride vector
|
|
static int const kStrideRank = 1;
|
|
|
|
/// Index type used for coordinates
|
|
using Index = int32_t;
|
|
|
|
/// Long index type used for offsets
|
|
using LongIndex = int64_t;
|
|
|
|
/// Logical coordinate
|
|
using TensorCoord = MatrixCoord;
|
|
|
|
/// Stride vector
|
|
using Stride = Coord<kStrideRank, Index>;
|
|
|
|
/// Size of a block in rows
|
|
static int const kBlockRows = BlockRows;
|
|
|
|
/// Size of a block in columns
|
|
static int const kBlockColumns = BlockColumns;
|
|
|
|
private:
|
|
//
|
|
// Data members
|
|
//
|
|
|
|
/// Stride data member
|
|
Stride stride_;
|
|
|
|
public:
|
|
//
|
|
// Methods
|
|
//
|
|
|
|
/// Ctor
|
|
CUTLASS_HOST_DEVICE
|
|
RowMajorBlockLinear(Index ldm = 0): stride_(ldm) { }
|
|
|
|
/// Helper returns a layout to a tightly packed tensor
|
|
CUTLASS_HOST_DEVICE
|
|
static RowMajorBlockLinear packed(MatrixCoord const &extent) {
|
|
return RowMajorBlockLinear(extent.column() * kBlockRows * kBlockColumns);
|
|
}
|
|
|
|
/// Returns the offset of a coordinate in linear memory.
|
|
/// Assumes coordinate has convention (row, column)
|
|
CUTLASS_HOST_DEVICE
|
|
LongIndex operator()(MatrixCoord const &coord) const {
|
|
return
|
|
(coord.column() % kBlockColumns) +
|
|
(coord.row() % kBlockRows) * kBlockColumns +
|
|
(coord.column() / kBlockColumns) * kBlockRows * kBlockColumns +
|
|
(coord.row() / kBlockRows) * stride_[0];
|
|
}
|
|
|
|
/// Inverse of layout function, mapping linear offset to logical coordinate
|
|
CUTLASS_HOST_DEVICE
|
|
MatrixCoord inverse(LongIndex offset) const {
|
|
// TODO
|
|
return MatrixCoord(0, 0);
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Stride stride() const {
|
|
return stride_;
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Stride & stride() {
|
|
return stride_;
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Index stride(int idx) const {
|
|
return stride_[idx];
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Index & stride(int idx) {
|
|
return stride_[idx];
|
|
}
|
|
|
|
/// Compute the number of contiguous elements needed to store a tensor with the given size
|
|
CUTLASS_HOST_DEVICE
|
|
LongIndex capacity(MatrixCoord const &extent) const {
|
|
return (extent.row() + kBlockRows - 1) / kBlockRows * stride_[0];
|
|
}
|
|
};
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
struct GeneralMatrix {
|
|
|
|
/// Logical rank of tensor
|
|
static int const kRank = 2;
|
|
|
|
/// Rank of stride vector
|
|
static int const kStrideRank = 2;
|
|
|
|
/// Index type used for coordinates
|
|
using Index = int32_t;
|
|
|
|
/// Long index type used for offsets
|
|
using LongIndex = int64_t;
|
|
|
|
/// Logical coordinate
|
|
using TensorCoord = MatrixCoord;
|
|
|
|
/// Stride vector
|
|
using Stride = Coord<kStrideRank, Index>;
|
|
|
|
private:
|
|
//
|
|
// Data members
|
|
//
|
|
|
|
Matrix layout_id_;
|
|
|
|
/// Stride data member
|
|
Stride stride_;
|
|
|
|
public:
|
|
//
|
|
// Methods
|
|
//
|
|
|
|
/// Ctor
|
|
CUTLASS_HOST_DEVICE
|
|
GeneralMatrix(): layout_id_(Matrix::kColumnMajor), stride_(make_Coord(0, 1)) { }
|
|
|
|
/// Ctor
|
|
CUTLASS_HOST_DEVICE
|
|
GeneralMatrix(
|
|
Matrix layout_id,
|
|
Index ldm,
|
|
Index interleave): layout_id_(layout_id), stride_(make_Coord(ldm, interleave)) { }
|
|
|
|
/// Helper returns a layout to a tightly packed tensor
|
|
CUTLASS_HOST_DEVICE
|
|
static GeneralMatrix packed(
|
|
MatrixCoord const &extent,
|
|
Matrix layout_id = Matrix::kColumnMajor,
|
|
Index interleave = 1) {
|
|
|
|
Index c;
|
|
if (layout_id == Matrix::kRowMajor) {
|
|
c = extent.column();
|
|
}
|
|
else {
|
|
c = extent.row();
|
|
}
|
|
|
|
Index ldm = c * interleave;
|
|
|
|
return GeneralMatrix(layout_id, ldm, interleave);
|
|
}
|
|
|
|
/// Returns the offset of a coordinate in linear memory.
|
|
/// Assumes coordinate has convention (row, column)
|
|
CUTLASS_HOST_DEVICE
|
|
LongIndex operator()(MatrixCoord const &coord) const {
|
|
Index c, s;
|
|
if (layout_id_ == Matrix::kRowMajor) {
|
|
c = coord.column();
|
|
s = coord.row();
|
|
}
|
|
else {
|
|
s = coord.column();
|
|
c = coord.row();
|
|
}
|
|
|
|
Index v = s / stride_[1];
|
|
Index residual = (s % stride_[1]);
|
|
|
|
return LongIndex(c) * LongIndex(stride_[1]) + LongIndex(v) * LongIndex(stride_[0]) + residual;
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Stride stride() const {
|
|
return stride_;
|
|
}
|
|
|
|
CUTLASS_HOST_DEVICE
|
|
Matrix layout_id() const {
|
|
return layout_id_;
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Stride & stride() {
|
|
return stride_;
|
|
}
|
|
|
|
CUTLASS_HOST_DEVICE
|
|
Matrix & layout_id() {
|
|
return layout_id_;
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Index stride(int idx) const {
|
|
return stride_[idx];
|
|
}
|
|
|
|
/// Returns the stride of the layout
|
|
CUTLASS_HOST_DEVICE
|
|
Index & stride(int idx) {
|
|
return stride_[idx];
|
|
}
|
|
|
|
/// Compute the number of contiguous elements needed to store a tensor with the given size
|
|
CUTLASS_HOST_DEVICE
|
|
LongIndex capacity(MatrixCoord const &extent) const {
|
|
Index s;
|
|
if (layout_id_ == Matrix::kRowMajor) {
|
|
s = extent.row();
|
|
}
|
|
else {
|
|
s = extent.column();
|
|
}
|
|
|
|
Index v = Index((s + stride_[1] - 1) / stride_[1]);
|
|
return LongIndex(v) * LongIndex(stride_[0]);
|
|
}
|
|
};
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Defines transposes of matrix layouts
|
|
template <typename Layout>
|
|
struct LayoutTranspose;
|
|
|
|
/// Transpose of row-major is column-major
|
|
template <>
|
|
struct LayoutTranspose<layout::RowMajor> {
|
|
using type = layout::ColumnMajor;
|
|
};
|
|
|
|
/// Transpose of column-major is row-major
|
|
template <>
|
|
struct LayoutTranspose<layout::ColumnMajor> {
|
|
using type = layout::RowMajor;
|
|
};
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
} // namespace layout
|
|
} // namespace cutlass
|