CUTLASS 2.6 (#298)

CUTLASS 2.6
This commit is contained in:
Manish Gupta
2021-07-22 21:40:53 -07:00
committed by GitHub
parent 6c29fe20ba
commit e5d51840e8
308 changed files with 32408 additions and 4722 deletions

View File

@ -274,9 +274,9 @@ public:
library::LayoutTypeID const &layout_a,
library::LayoutTypeID const &layout_b,
library::LayoutTypeID const &layout_c) {
std::vector<int> stride_activations;
std::vector<int> stride_filters;
std::vector<int> stride_output;
std::vector<int64_t> stride_activations;
std::vector<int64_t> stride_filters;
std::vector<int64_t> stride_output;
// Strides for interleaved fprop
if (conv_kind == library::ConvKind::kFprop &&

View File

@ -268,7 +268,7 @@ public:
A(nullptr), B(nullptr), C(nullptr), Computed(nullptr), Reference(nullptr) { }
// Returns stride vector for tensor A
std::vector<int> stride_a(library::ConvKind const &conv_kind) {
std::vector<int64_t> stride_a(library::ConvKind const &conv_kind) {
return {
configuration.layout_a(conv_kind).stride()[0],
configuration.layout_a(conv_kind).stride()[1],
@ -278,7 +278,7 @@ public:
}
// Returns stride vector for tensor B
std::vector<int> stride_b(library::ConvKind const &conv_kind) {
std::vector<int64_t> stride_b(library::ConvKind const &conv_kind) {
return {
configuration.layout_b(conv_kind).stride()[0],
@ -289,7 +289,7 @@ public:
}
// Returns stride vector for tensor C
std::vector<int> stride_c(library::ConvKind const &conv_kind) {
std::vector<int64_t> stride_c(library::ConvKind const &conv_kind) {
return {
configuration.layout_c(conv_kind).stride()[0],

View File

@ -67,6 +67,15 @@ CutlassProfiler::~CutlassProfiler() {
/// Execute the program
int CutlassProfiler::operator()() {
if (options_.cmdline.num_naked_args() > 0) {
std::cerr << "Unknown args: \n";
options_.cmdline.print_naked_args(std::cerr);
std::cerr << "\n\n\n";
print_usage_(std::cout);
return 1;
}
if (options_.about.help) {
if (options_.operation_kind == library::OperationKind::kInvalid) {
print_usage_(std::cout);

View File

@ -54,7 +54,7 @@ size_t DeviceAllocation::bytes(library::NumericTypeID type, size_t capacity) {
/////////////////////////////////////////////////////////////////////////////////////////////////
template <typename Layout>
static std::vector<int> get_packed_layout_stride(std::vector<int> const &extent) {
static std::vector<int64_t> get_packed_layout_stride(std::vector<int> const &extent) {
typename Layout::TensorCoord extent_coord;
typename Layout::Stride stride_coord;
@ -67,25 +67,25 @@ static std::vector<int> get_packed_layout_stride(std::vector<int> const &extent)
extent_coord[i] = extent.at(i);
}
std::vector<int> stride;
std::vector<int64_t> stride;
stride.resize(Layout::kStrideRank, 0);
Layout layout = Layout::packed(extent_coord);
stride_coord = layout.stride();
for (int i = 0; i < Layout::kStrideRank; ++i) {
stride.at(i) = stride_coord[i];
stride.at(i) = (int64_t)stride_coord[i];
}
return stride;
}
/// Returns the stride of a packed layout
std::vector<int> DeviceAllocation::get_packed_layout(
std::vector<int64_t> DeviceAllocation::get_packed_layout(
library::LayoutTypeID layout_id,
std::vector<int> const &extent) {
std::vector<int> stride;
std::vector<int64_t> stride;
switch (layout_id) {
case library::LayoutTypeID::kColumnMajor:
@ -159,7 +159,7 @@ static size_t construct_layout_(
void *bytes,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> &stride) {
std::vector<int64_t> &stride) {
if (extent.size() != Layout::kRank) {
throw std::runtime_error(
@ -183,7 +183,7 @@ static size_t construct_layout_(
typename Layout::Stride stride_coord;
for (int i = 0; i < Layout::kStrideRank; ++i) {
stride_coord[i] = stride.at(i);
stride_coord[i] = (int)stride.at(i);
}
typename Layout::TensorCoord extent_coord;
@ -210,7 +210,7 @@ size_t DeviceAllocation::construct_layout(
void *bytes,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> &stride) {
std::vector<int64_t> &stride) {
switch (layout_id) {
case library::LayoutTypeID::kColumnMajor:
@ -309,7 +309,7 @@ DeviceAllocation::DeviceAllocation(
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> const &stride,
std::vector<int64_t> const &stride,
int batch_count
):
type_(type), batch_stride_(size_t(0)), capacity_(size_t(0)), pointer_(nullptr), batch_count_(1) {
@ -370,12 +370,12 @@ DeviceAllocation &DeviceAllocation::reset(
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> const &stride,
std::vector<int64_t> const &stride,
int batch_count) {
reset();
tensor_ref_buffer_.resize(sizeof(pointer_) + (sizeof(int) * library::get_layout_stride_rank(layout_id)), 0);
tensor_ref_buffer_.resize(sizeof(pointer_) + (sizeof(int64_t) * library::get_layout_stride_rank(layout_id)), 0);
type_ = type;
@ -422,7 +422,7 @@ library::LayoutTypeID DeviceAllocation::layout() const {
return layout_;
}
std::vector<int> const & DeviceAllocation::stride() const {
std::vector<int64_t> const & DeviceAllocation::stride() const {
return stride_;
}
@ -1277,6 +1277,15 @@ struct vector_to_coord {
vector_to_coord<TensorCoord, Rank - 1>(coord, vec);
}
}
vector_to_coord(TensorCoord &coord, std::vector<int64_t> const &vec) {
coord[Rank - 1] = (int)vec.at(Rank - 1);
if (Rank > 1) {
vector_to_coord<TensorCoord, Rank - 1>(coord, vec);
}
}
};
/// Permits copying dynamic vectors into static-length vectors
@ -1287,6 +1296,11 @@ struct vector_to_coord<TensorCoord, 1> {
coord[0] = vec.at(0);
}
vector_to_coord(TensorCoord &coord, std::vector<int64_t> const &vec) {
coord[0] = (int)vec.at(0);
}
};
/// Permits copying dynamic vectors into static-length vectors
@ -1306,7 +1320,7 @@ static void write_tensor_csv_static_tensor_view(
DeviceAllocation &allocation) {
Coord<Layout::kRank> extent;
Coord<Layout::kStrideRank> stride;
Coord<Layout::kStrideRank, typename Layout::Stride::Index> stride;
if (allocation.extent().size() != Layout::kRank) {
throw std::runtime_error("Allocation extent has invalid rank");
@ -1317,7 +1331,8 @@ static void write_tensor_csv_static_tensor_view(
}
vector_to_coord<Coord<Layout::kRank>, Layout::kRank>(extent, allocation.extent());
vector_to_coord<Coord<Layout::kStrideRank>, Layout::kStrideRank>(stride, allocation.stride());
vector_to_coord<Coord<Layout::kStrideRank, typename Layout::Stride::Index>,
Layout::kStrideRank>(stride, allocation.stride());
Layout layout(stride);
HostTensor<Element, Layout> host_tensor(extent, layout, false);
@ -1498,6 +1513,162 @@ void DeviceAllocation::write_tensor_csv(
}
}
/// Builds a TensorView<Element, Layout> over `allocation` and fills every element
/// with `val` using the device-side reference TensorFill kernel.
///
/// Throws std::runtime_error if the allocation's extent or stride vector does not
/// match the static rank of `Layout`.
template <typename Element, typename Layout>
static void tensor_fill_tensor_view(DeviceAllocation &allocation, Element val = Element()) {

  if (allocation.extent().size() != Layout::kRank) {
    throw std::runtime_error("Allocation extent has invalid rank");
  }

  if (allocation.stride().size() != Layout::kStrideRank) {
    throw std::runtime_error("Allocation stride has invalid rank");
  }

  // Copy the dynamic extent/stride vectors into fixed-rank coordinates.
  Coord<Layout::kRank> extent_coord;
  Coord<Layout::kStrideRank, typename Layout::LongIndex> stride_coord;

  vector_to_coord<Coord<Layout::kRank>, Layout::kRank>(extent_coord, allocation.extent());
  vector_to_coord<Coord<Layout::kStrideRank, typename Layout::LongIndex>,
                  Layout::kStrideRank>(stride_coord, allocation.stride());

  TensorView<Element, Layout> view(
    static_cast<Element *>(allocation.data()),
    Layout(stride_coord),
    extent_coord
  );

  cutlass::reference::device::TensorFill<Element, Layout>(view, val);
}
/// Dispatches on the allocation's layout and fills it uniformly with `val`.
///
/// Throws std::runtime_error for layouts with no fill support.
template <typename Element>
static void tensor_fill(DeviceAllocation &allocation, Element val = Element()) {

  switch (allocation.layout()) {

  // Canonical 2-D layouts
  case library::LayoutTypeID::kRowMajor:
    tensor_fill_tensor_view<Element, layout::RowMajor>(allocation, val);
    return;
  case library::LayoutTypeID::kColumnMajor:
    tensor_fill_tensor_view<Element, layout::ColumnMajor>(allocation, val);
    return;

  // Convolution activation/output layouts
  case library::LayoutTypeID::kTensorNHWC:
    tensor_fill_tensor_view<Element, layout::TensorNHWC>(allocation, val);
    return;
  case library::LayoutTypeID::kTensorNDHWC:
    tensor_fill_tensor_view<Element, layout::TensorNDHWC>(allocation, val);
    return;

  // Interleaved convolution layouts
  case library::LayoutTypeID::kTensorNC32HW32:
    tensor_fill_tensor_view<Element, layout::TensorNCxHWx<32>>(allocation, val);
    return;
  case library::LayoutTypeID::kTensorNC64HW64:
    tensor_fill_tensor_view<Element, layout::TensorNCxHWx<64>>(allocation, val);
    return;
  case library::LayoutTypeID::kTensorC32RSK32:
    tensor_fill_tensor_view<Element, layout::TensorCxRSKx<32>>(allocation, val);
    return;
  case library::LayoutTypeID::kTensorC64RSK64:
    tensor_fill_tensor_view<Element, layout::TensorCxRSKx<64>>(allocation, val);
    return;

  default:
    throw std::runtime_error("Unsupported layout");
  }
}
/// Fills a tensor uniformly with a value (most frequently used to clear the tensor)
/// Fills a tensor uniformly with a value (most frequently used to clear the tensor)
///
/// NOTE(review): the default argument previously written on this out-of-class
/// definition (`double val = 0.0`) is visible only within this translation unit;
/// for callers in other TUs the default must appear on the in-class declaration
/// (and specifying it in both places is ill-formed). The definition therefore
/// takes no default here.
///
/// Throws std::runtime_error for numeric types with no fill support.
void DeviceAllocation::fill(double val) {

  // Dispatch on the element type; `val` is converted to the element type
  // (or used as the real part for complex element types).
  switch (this->type()) {
  case library::NumericTypeID::kF16:
    tensor_fill<half_t>(*this, static_cast<half_t>(val));
    break;

  case library::NumericTypeID::kBF16:
    tensor_fill<bfloat16_t>(*this, static_cast<bfloat16_t>(val));
    break;

  case library::NumericTypeID::kTF32:
    tensor_fill<tfloat32_t>(*this, static_cast<tfloat32_t>(val));
    break;

  case library::NumericTypeID::kF32:
    tensor_fill<float>(*this, static_cast<float>(val));
    break;

  case library::NumericTypeID::kF64:
    tensor_fill<double>(*this, static_cast<double>(val));
    break;

  case library::NumericTypeID::kS2:
    tensor_fill<int2b_t>(*this, static_cast<int2b_t>(val));
    break;

  case library::NumericTypeID::kS4:
    tensor_fill<int4b_t>(*this, static_cast<int4b_t>(val));
    break;

  case library::NumericTypeID::kS8:
    tensor_fill<int8_t>(*this, static_cast<int8_t>(val));
    break;

  case library::NumericTypeID::kS16:
    tensor_fill<int16_t>(*this, static_cast<int16_t>(val));
    break;

  case library::NumericTypeID::kS32:
    tensor_fill<int32_t>(*this, static_cast<int32_t>(val));
    break;

  case library::NumericTypeID::kS64:
    tensor_fill<int64_t>(*this, static_cast<int64_t>(val));
    break;

  case library::NumericTypeID::kB1:
    tensor_fill<uint1b_t>(*this, static_cast<uint1b_t>(val));
    break;

  case library::NumericTypeID::kU2:
    tensor_fill<uint2b_t>(*this, static_cast<uint2b_t>(val));
    break;

  case library::NumericTypeID::kU4:
    tensor_fill<uint4b_t>(*this, static_cast<uint4b_t>(val));
    break;

  case library::NumericTypeID::kU8:
    tensor_fill<uint8_t>(*this, static_cast<uint8_t>(val));
    break;

  case library::NumericTypeID::kU16:
    tensor_fill<uint16_t>(*this, static_cast<uint16_t>(val));
    break;

  case library::NumericTypeID::kU32:
    tensor_fill<uint32_t>(*this, static_cast<uint32_t>(val));
    break;

  case library::NumericTypeID::kU64:
    tensor_fill<uint64_t>(*this, static_cast<uint64_t>(val));
    break;

  case library::NumericTypeID::kCF16:
    tensor_fill<cutlass::complex<half_t> >(*this, from_real<half_t>(val));
    break;

  case library::NumericTypeID::kCF32:
    tensor_fill<cutlass::complex<float> >(*this, from_real<float>(val));
    break;

  case library::NumericTypeID::kCF64:
    tensor_fill<cutlass::complex<double> >(*this, from_real<double>(val));
    break;

  default:
    throw std::runtime_error("Unsupported numeric type");
  }
}
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace profiler

View File

@ -64,7 +64,7 @@ private:
library::LayoutTypeID layout_;
/// Stride vector
std::vector<int> stride_;
std::vector<int64_t> stride_;
/// Extent vector
std::vector<int> extent_;
@ -84,7 +84,7 @@ public:
static size_t bytes(library::NumericTypeID type, size_t capacity);
/// Returns the stride of a packed layout
static std::vector<int> get_packed_layout(
static std::vector<int64_t> get_packed_layout(
library::LayoutTypeID layout_id,
std::vector<int> const &extent);
@ -93,7 +93,7 @@ public:
void *bytes,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> &stride);
std::vector<int64_t> &stride);
/// Returns true if two blocks have exactly the same value
static bool block_compare_equal(
@ -124,7 +124,7 @@ public:
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> const &stride = std::vector<int>(),
std::vector<int64_t> const &stride = std::vector<int64_t>(),
int batch_count = 1);
~DeviceAllocation();
@ -139,7 +139,7 @@ public:
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> const &stride = std::vector<int>(),
std::vector<int64_t> const &stride = std::vector<int64_t>(),
int batch_count = 1);
/// Returns a buffer owning the tensor reference
@ -162,7 +162,7 @@ public:
library::LayoutTypeID layout() const;
/// Gets the stride vector
std::vector<int> const & stride() const;
std::vector<int64_t> const & stride() const;
/// Gets the extent vector
std::vector<int> const & extent() const;
@ -193,6 +193,9 @@ public:
/// Initializes a host allocation to a random distribution using std::cout
void initialize_random_sparsemeta_host(int seed, int MetaSizeInBits);
/// Uniformly fills a tensor with a value when provided o.w. zero
void fill(double value);
/// Copies from an equivalent-sized tensor in device memory
void copy_from_device(void const *ptr);

View File

@ -52,7 +52,7 @@ DeviceAllocation *DeviceContext::allocate_tensor(
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> const &stride,
std::vector<int64_t> const &stride,
int batch_count) {
device_memory_.emplace_back(type, layout_id, extent, stride, batch_count);
@ -69,7 +69,7 @@ DeviceAllocation *DeviceContext::allocate_tensor(
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> const &stride,
std::vector<int64_t> const &stride,
int batch_count) {
DeviceAllocation *allocation =
@ -133,7 +133,7 @@ DeviceAllocation *DeviceContext::allocate_sparsemeta_tensor(
library::LayoutTypeID layout_id,
library::NumericTypeID type_a,
std::vector<int> const &extent,
std::vector<int> const &stride,
std::vector<int64_t> const &stride,
int batch_count) {
DeviceAllocation *allocation =

View File

@ -77,7 +77,7 @@ public:
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> const &stride = std::vector<int>(),
std::vector<int64_t> const &stride = std::vector<int64_t>(),
int batch_count = 1);
/// Allocates memory of a given type, capacity (elements), and name
@ -87,7 +87,7 @@ public:
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int> const &stride = std::vector<int>(),
std::vector<int64_t> const &stride = std::vector<int64_t>(),
int batch_count = 1);
/// Allocates memory for sparse meta data
@ -98,7 +98,7 @@ public:
library::LayoutTypeID layout_id,
library::NumericTypeID type_a,
std::vector<int> const &extent,
std::vector<int> const &stride = std::vector<int>(),
std::vector<int64_t> const &stride = std::vector<int64_t>(),
int batch_count = 1);
/// Clears named allocations (but does not necessarily free memory)