@ -274,9 +274,9 @@ public:
|
||||
library::LayoutTypeID const &layout_a,
|
||||
library::LayoutTypeID const &layout_b,
|
||||
library::LayoutTypeID const &layout_c) {
|
||||
std::vector<int> stride_activations;
|
||||
std::vector<int> stride_filters;
|
||||
std::vector<int> stride_output;
|
||||
std::vector<int64_t> stride_activations;
|
||||
std::vector<int64_t> stride_filters;
|
||||
std::vector<int64_t> stride_output;
|
||||
|
||||
// Strides for interleaved fprop
|
||||
if (conv_kind == library::ConvKind::kFprop &&
|
||||
|
||||
@ -268,7 +268,7 @@ public:
|
||||
A(nullptr), B(nullptr), C(nullptr), Computed(nullptr), Reference(nullptr) { }
|
||||
|
||||
// Returns stride vector for tensor A
|
||||
std::vector<int> stride_a(library::ConvKind const &conv_kind) {
|
||||
std::vector<int64_t> stride_a(library::ConvKind const &conv_kind) {
|
||||
return {
|
||||
configuration.layout_a(conv_kind).stride()[0],
|
||||
configuration.layout_a(conv_kind).stride()[1],
|
||||
@ -278,7 +278,7 @@ public:
|
||||
}
|
||||
|
||||
// Returns stride vector for tensor B
|
||||
std::vector<int> stride_b(library::ConvKind const &conv_kind) {
|
||||
std::vector<int64_t> stride_b(library::ConvKind const &conv_kind) {
|
||||
|
||||
return {
|
||||
configuration.layout_b(conv_kind).stride()[0],
|
||||
@ -289,7 +289,7 @@ public:
|
||||
}
|
||||
|
||||
// Returns stride vector for tensor C
|
||||
std::vector<int> stride_c(library::ConvKind const &conv_kind) {
|
||||
std::vector<int64_t> stride_c(library::ConvKind const &conv_kind) {
|
||||
|
||||
return {
|
||||
configuration.layout_c(conv_kind).stride()[0],
|
||||
|
||||
@ -67,6 +67,15 @@ CutlassProfiler::~CutlassProfiler() {
|
||||
/// Execute the program
|
||||
int CutlassProfiler::operator()() {
|
||||
|
||||
if (options_.cmdline.num_naked_args() > 0) {
|
||||
std::cerr << "Unknown args: \n";
|
||||
options_.cmdline.print_naked_args(std::cerr);
|
||||
std::cerr << "\n\n\n";
|
||||
|
||||
print_usage_(std::cout);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (options_.about.help) {
|
||||
if (options_.operation_kind == library::OperationKind::kInvalid) {
|
||||
print_usage_(std::cout);
|
||||
|
||||
@ -54,7 +54,7 @@ size_t DeviceAllocation::bytes(library::NumericTypeID type, size_t capacity) {
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename Layout>
|
||||
static std::vector<int> get_packed_layout_stride(std::vector<int> const &extent) {
|
||||
static std::vector<int64_t> get_packed_layout_stride(std::vector<int> const &extent) {
|
||||
|
||||
typename Layout::TensorCoord extent_coord;
|
||||
typename Layout::Stride stride_coord;
|
||||
@ -67,25 +67,25 @@ static std::vector<int> get_packed_layout_stride(std::vector<int> const &extent)
|
||||
extent_coord[i] = extent.at(i);
|
||||
}
|
||||
|
||||
std::vector<int> stride;
|
||||
std::vector<int64_t> stride;
|
||||
stride.resize(Layout::kStrideRank, 0);
|
||||
|
||||
Layout layout = Layout::packed(extent_coord);
|
||||
stride_coord = layout.stride();
|
||||
|
||||
for (int i = 0; i < Layout::kStrideRank; ++i) {
|
||||
stride.at(i) = stride_coord[i];
|
||||
stride.at(i) = (int64_t)stride_coord[i];
|
||||
}
|
||||
|
||||
return stride;
|
||||
}
|
||||
|
||||
/// Returns the stride of a packed layout
|
||||
std::vector<int> DeviceAllocation::get_packed_layout(
|
||||
std::vector<int64_t> DeviceAllocation::get_packed_layout(
|
||||
library::LayoutTypeID layout_id,
|
||||
std::vector<int> const &extent) {
|
||||
|
||||
std::vector<int> stride;
|
||||
std::vector<int64_t> stride;
|
||||
|
||||
switch (layout_id) {
|
||||
case library::LayoutTypeID::kColumnMajor:
|
||||
@ -159,7 +159,7 @@ static size_t construct_layout_(
|
||||
void *bytes,
|
||||
library::LayoutTypeID layout_id,
|
||||
std::vector<int> const &extent,
|
||||
std::vector<int> &stride) {
|
||||
std::vector<int64_t> &stride) {
|
||||
|
||||
if (extent.size() != Layout::kRank) {
|
||||
throw std::runtime_error(
|
||||
@ -183,7 +183,7 @@ static size_t construct_layout_(
|
||||
|
||||
typename Layout::Stride stride_coord;
|
||||
for (int i = 0; i < Layout::kStrideRank; ++i) {
|
||||
stride_coord[i] = stride.at(i);
|
||||
stride_coord[i] = (int)stride.at(i);
|
||||
}
|
||||
|
||||
typename Layout::TensorCoord extent_coord;
|
||||
@ -210,7 +210,7 @@ size_t DeviceAllocation::construct_layout(
|
||||
void *bytes,
|
||||
library::LayoutTypeID layout_id,
|
||||
std::vector<int> const &extent,
|
||||
std::vector<int> &stride) {
|
||||
std::vector<int64_t> &stride) {
|
||||
|
||||
switch (layout_id) {
|
||||
case library::LayoutTypeID::kColumnMajor:
|
||||
@ -309,7 +309,7 @@ DeviceAllocation::DeviceAllocation(
|
||||
library::NumericTypeID type,
|
||||
library::LayoutTypeID layout_id,
|
||||
std::vector<int> const &extent,
|
||||
std::vector<int> const &stride,
|
||||
std::vector<int64_t> const &stride,
|
||||
int batch_count
|
||||
):
|
||||
type_(type), batch_stride_(size_t(0)), capacity_(size_t(0)), pointer_(nullptr), batch_count_(1) {
|
||||
@ -370,12 +370,12 @@ DeviceAllocation &DeviceAllocation::reset(
|
||||
library::NumericTypeID type,
|
||||
library::LayoutTypeID layout_id,
|
||||
std::vector<int> const &extent,
|
||||
std::vector<int> const &stride,
|
||||
std::vector<int64_t> const &stride,
|
||||
int batch_count) {
|
||||
|
||||
reset();
|
||||
|
||||
tensor_ref_buffer_.resize(sizeof(pointer_) + (sizeof(int) * library::get_layout_stride_rank(layout_id)), 0);
|
||||
tensor_ref_buffer_.resize(sizeof(pointer_) + (sizeof(int64_t) * library::get_layout_stride_rank(layout_id)), 0);
|
||||
|
||||
type_ = type;
|
||||
|
||||
@ -422,7 +422,7 @@ library::LayoutTypeID DeviceAllocation::layout() const {
|
||||
return layout_;
|
||||
}
|
||||
|
||||
std::vector<int> const & DeviceAllocation::stride() const {
|
||||
std::vector<int64_t> const & DeviceAllocation::stride() const {
|
||||
return stride_;
|
||||
}
|
||||
|
||||
@ -1277,6 +1277,15 @@ struct vector_to_coord {
|
||||
vector_to_coord<TensorCoord, Rank - 1>(coord, vec);
|
||||
}
|
||||
}
|
||||
|
||||
/// Copies the first `Rank` entries of a 64-bit dynamic vector into a static-length
/// coordinate, filling element `Rank - 1` here and delegating the lower elements to
/// the `Rank - 1` instantiation.
///
/// `vec.at()` throws std::out_of_range if `vec` holds fewer than `Rank` entries.
vector_to_coord(TensorCoord &coord, std::vector<int64_t> const &vec) {

  // Convert to the coordinate's own index type instead of `int`: callers in this file
  // instantiate this with 64-bit coordinates (e.g. Coord<kStrideRank, Layout::LongIndex>),
  // and a hard-coded `(int)` cast would silently truncate strides beyond 32 bits.
  // NOTE(review): assumes TensorCoord exposes an `Index` member type, as cutlass::Coord
  // does -- confirm for any other coordinate type used with this helper.
  coord[Rank - 1] = static_cast<typename TensorCoord::Index>(vec.at(Rank - 1));

  if (Rank > 1) {
    vector_to_coord<TensorCoord, Rank - 1>(coord, vec);
  }
}
|
||||
};
|
||||
|
||||
/// Permits copying dynamic vectors into static-length vectors
|
||||
@ -1287,6 +1296,11 @@ struct vector_to_coord<TensorCoord, 1> {
|
||||
|
||||
coord[0] = vec.at(0);
|
||||
}
|
||||
|
||||
/// Rank-1 case: copies element 0 of a 64-bit dynamic vector into the coordinate.
///
/// `vec.at(0)` throws std::out_of_range if `vec` is empty.
vector_to_coord(TensorCoord &coord, std::vector<int64_t> const &vec) {

  // Convert to the coordinate's own index type instead of `int` so that 64-bit
  // coordinates keep the full value rather than being truncated to 32 bits.
  // NOTE(review): assumes TensorCoord exposes an `Index` member type, as cutlass::Coord
  // does -- confirm for any other coordinate type used with this helper.
  coord[0] = static_cast<typename TensorCoord::Index>(vec.at(0));
}
|
||||
};
|
||||
|
||||
/// Permits copying dynamic vectors into static-length vectors
|
||||
@ -1306,7 +1320,7 @@ static void write_tensor_csv_static_tensor_view(
|
||||
DeviceAllocation &allocation) {
|
||||
|
||||
Coord<Layout::kRank> extent;
|
||||
Coord<Layout::kStrideRank> stride;
|
||||
Coord<Layout::kStrideRank, typename Layout::Stride::Index> stride;
|
||||
|
||||
if (allocation.extent().size() != Layout::kRank) {
|
||||
throw std::runtime_error("Allocation extent has invalid rank");
|
||||
@ -1317,7 +1331,8 @@ static void write_tensor_csv_static_tensor_view(
|
||||
}
|
||||
|
||||
vector_to_coord<Coord<Layout::kRank>, Layout::kRank>(extent, allocation.extent());
|
||||
vector_to_coord<Coord<Layout::kStrideRank>, Layout::kStrideRank>(stride, allocation.stride());
|
||||
vector_to_coord<Coord<Layout::kStrideRank, typename Layout::Stride::Index>,
|
||||
Layout::kStrideRank>(stride, allocation.stride());
|
||||
|
||||
Layout layout(stride);
|
||||
HostTensor<Element, Layout> host_tensor(extent, layout, false);
|
||||
@ -1498,6 +1513,162 @@ void DeviceAllocation::write_tensor_csv(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Element, typename Layout>
|
||||
static void tensor_fill_tensor_view(DeviceAllocation &allocation, Element val = Element()) {
|
||||
Coord<Layout::kRank> extent;
|
||||
Coord<Layout::kStrideRank, typename Layout::LongIndex> stride;
|
||||
|
||||
if (allocation.extent().size() != Layout::kRank) {
|
||||
throw std::runtime_error("Allocation extent has invalid rank");
|
||||
}
|
||||
|
||||
if (allocation.stride().size() != Layout::kStrideRank) {
|
||||
throw std::runtime_error("Allocation stride has invalid rank");
|
||||
}
|
||||
|
||||
vector_to_coord<Coord<Layout::kRank>, Layout::kRank>(extent, allocation.extent());
|
||||
vector_to_coord<Coord<Layout::kStrideRank, typename Layout::LongIndex>,
|
||||
Layout::kStrideRank>(stride, allocation.stride());
|
||||
|
||||
TensorView<Element, Layout> view(
|
||||
static_cast<Element *>(allocation.data()),
|
||||
Layout(stride),
|
||||
extent
|
||||
);
|
||||
|
||||
|
||||
cutlass::reference::device::TensorFill<Element, Layout>(
|
||||
view,
|
||||
val
|
||||
);
|
||||
}
|
||||
|
||||
template <typename Element>
|
||||
static void tensor_fill(DeviceAllocation &allocation, Element val = Element()) {
|
||||
switch (allocation.layout()) {
|
||||
case library::LayoutTypeID::kRowMajor:
|
||||
tensor_fill_tensor_view<Element, layout::RowMajor>(allocation, val);
|
||||
break;
|
||||
case library::LayoutTypeID::kColumnMajor:
|
||||
tensor_fill_tensor_view<Element, layout::ColumnMajor>(allocation, val);
|
||||
break;
|
||||
case library::LayoutTypeID::kTensorNHWC:
|
||||
tensor_fill_tensor_view<Element, layout::TensorNHWC>(allocation, val);
|
||||
break;
|
||||
case library::LayoutTypeID::kTensorNDHWC:
|
||||
tensor_fill_tensor_view<Element, layout::TensorNDHWC>(allocation, val);
|
||||
break;
|
||||
case library::LayoutTypeID::kTensorNC32HW32:
|
||||
tensor_fill_tensor_view<Element, layout::TensorNCxHWx<32>>(allocation, val);
|
||||
break;
|
||||
case library::LayoutTypeID::kTensorNC64HW64:
|
||||
tensor_fill_tensor_view<Element, layout::TensorNCxHWx<64>>(allocation, val);
|
||||
break;
|
||||
case library::LayoutTypeID::kTensorC32RSK32:
|
||||
tensor_fill_tensor_view<Element, layout::TensorCxRSKx<32>>(allocation, val);
|
||||
break;
|
||||
case library::LayoutTypeID::kTensorC64RSK64:
|
||||
tensor_fill_tensor_view<Element, layout::TensorCxRSKx<64>>(allocation, val);
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("Unsupported layout");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/// Fills a tensor uniformly with a value (most frequently used to clear the tensor)
|
||||
void DeviceAllocation::fill(double val = 0.0) {
|
||||
|
||||
switch (this->type()) {
|
||||
case library::NumericTypeID::kF16:
|
||||
tensor_fill<half_t>(*this, static_cast<half_t>(val));
|
||||
break;
|
||||
|
||||
case library::NumericTypeID::kBF16:
|
||||
tensor_fill<bfloat16_t>(*this, static_cast<bfloat16_t>(val));
|
||||
break;
|
||||
|
||||
case library::NumericTypeID::kTF32:
|
||||
tensor_fill<tfloat32_t>(*this, static_cast<tfloat32_t>(val));
|
||||
break;
|
||||
|
||||
case library::NumericTypeID::kF32:
|
||||
tensor_fill<float>(*this, static_cast<float>(val));
|
||||
break;
|
||||
|
||||
case library::NumericTypeID::kF64:
|
||||
tensor_fill<double>(*this, static_cast<double>(val));
|
||||
break;
|
||||
|
||||
case library::NumericTypeID::kS2:
|
||||
tensor_fill<int2b_t>(*this, static_cast<int2b_t>(val));
|
||||
break;
|
||||
|
||||
case library::NumericTypeID::kS4:
|
||||
tensor_fill<int4b_t>(*this, static_cast<int4b_t>(val));
|
||||
break;
|
||||
|
||||
case library::NumericTypeID::kS8:
|
||||
tensor_fill<int8_t>(*this, static_cast<int8_t>(val));
|
||||
break;
|
||||
|
||||
case library::NumericTypeID::kS16:
|
||||
tensor_fill<int16_t>(*this, static_cast<int16_t>(val));
|
||||
break;
|
||||
|
||||
case library::NumericTypeID::kS32:
|
||||
tensor_fill<int32_t>(*this, static_cast<int32_t>(val));
|
||||
break;
|
||||
|
||||
case library::NumericTypeID::kS64:
|
||||
tensor_fill<int64_t>(*this, static_cast<int64_t>(val));
|
||||
break;
|
||||
|
||||
case library::NumericTypeID::kB1:
|
||||
tensor_fill<uint1b_t>(*this, static_cast<uint1b_t>(val));
|
||||
break;
|
||||
|
||||
case library::NumericTypeID::kU2:
|
||||
tensor_fill<uint2b_t>(*this, static_cast<uint2b_t>(val));
|
||||
break;
|
||||
|
||||
case library::NumericTypeID::kU4:
|
||||
tensor_fill<uint4b_t>(*this, static_cast<uint4b_t>(val));
|
||||
break;
|
||||
|
||||
case library::NumericTypeID::kU8:
|
||||
tensor_fill<uint8_t>(*this, static_cast<uint8_t>(val));
|
||||
break;
|
||||
|
||||
case library::NumericTypeID::kU16:
|
||||
tensor_fill<uint16_t>(*this, static_cast<uint16_t>(val));
|
||||
break;
|
||||
|
||||
case library::NumericTypeID::kU32:
|
||||
tensor_fill<uint32_t>(*this, static_cast<uint32_t>(val));
|
||||
break;
|
||||
|
||||
case library::NumericTypeID::kU64:
|
||||
tensor_fill<uint64_t>(*this, static_cast<uint64_t>(val));
|
||||
break;
|
||||
|
||||
case library::NumericTypeID::kCF16:
|
||||
tensor_fill<cutlass::complex<half_t> >(*this, from_real<half_t>(val));
|
||||
break;
|
||||
|
||||
case library::NumericTypeID::kCF32:
|
||||
tensor_fill<cutlass::complex<float> >(*this, from_real<float>(val));
|
||||
break;
|
||||
|
||||
case library::NumericTypeID::kCF64:
|
||||
tensor_fill<cutlass::complex<double> >(*this, from_real<double>(val));
|
||||
break;
|
||||
|
||||
default:
|
||||
throw std::runtime_error("Unsupported numeric type");
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace profiler
|
||||
|
||||
@ -64,7 +64,7 @@ private:
|
||||
library::LayoutTypeID layout_;
|
||||
|
||||
/// Stride vector
|
||||
std::vector<int> stride_;
|
||||
std::vector<int64_t> stride_;
|
||||
|
||||
/// Extent vector
|
||||
std::vector<int> extent_;
|
||||
@ -84,7 +84,7 @@ public:
|
||||
static size_t bytes(library::NumericTypeID type, size_t capacity);
|
||||
|
||||
/// Returns the stride of a packed layout
|
||||
static std::vector<int> get_packed_layout(
|
||||
static std::vector<int64_t> get_packed_layout(
|
||||
library::LayoutTypeID layout_id,
|
||||
std::vector<int> const &extent);
|
||||
|
||||
@ -93,7 +93,7 @@ public:
|
||||
void *bytes,
|
||||
library::LayoutTypeID layout_id,
|
||||
std::vector<int> const &extent,
|
||||
std::vector<int> &stride);
|
||||
std::vector<int64_t> &stride);
|
||||
|
||||
/// Returns true if two blocks have exactly the same value
|
||||
static bool block_compare_equal(
|
||||
@ -124,7 +124,7 @@ public:
|
||||
library::NumericTypeID type,
|
||||
library::LayoutTypeID layout_id,
|
||||
std::vector<int> const &extent,
|
||||
std::vector<int> const &stride = std::vector<int>(),
|
||||
std::vector<int64_t> const &stride = std::vector<int64_t>(),
|
||||
int batch_count = 1);
|
||||
|
||||
~DeviceAllocation();
|
||||
@ -139,7 +139,7 @@ public:
|
||||
library::NumericTypeID type,
|
||||
library::LayoutTypeID layout_id,
|
||||
std::vector<int> const &extent,
|
||||
std::vector<int> const &stride = std::vector<int>(),
|
||||
std::vector<int64_t> const &stride = std::vector<int64_t>(),
|
||||
int batch_count = 1);
|
||||
|
||||
/// Returns a buffer owning the tensor reference
|
||||
@ -162,7 +162,7 @@ public:
|
||||
library::LayoutTypeID layout() const;
|
||||
|
||||
/// Gets the stride vector
|
||||
std::vector<int> const & stride() const;
|
||||
std::vector<int64_t> const & stride() const;
|
||||
|
||||
/// Gets the extent vector
|
||||
std::vector<int> const & extent() const;
|
||||
@ -193,6 +193,9 @@ public:
|
||||
|
||||
/// Initializes a host allocation to a random distribution using std::cout
|
||||
void initialize_random_sparsemeta_host(int seed, int MetaSizeInBits);
|
||||
|
||||
/// Uniformly fills a tensor with the given value, or with zero when no value is provided
|
||||
void fill(double value);
|
||||
|
||||
/// Copies from an equivalent-sized tensor in device memory
|
||||
void copy_from_device(void const *ptr);
|
||||
|
||||
@ -52,7 +52,7 @@ DeviceAllocation *DeviceContext::allocate_tensor(
|
||||
library::NumericTypeID type,
|
||||
library::LayoutTypeID layout_id,
|
||||
std::vector<int> const &extent,
|
||||
std::vector<int> const &stride,
|
||||
std::vector<int64_t> const &stride,
|
||||
int batch_count) {
|
||||
|
||||
device_memory_.emplace_back(type, layout_id, extent, stride, batch_count);
|
||||
@ -69,7 +69,7 @@ DeviceAllocation *DeviceContext::allocate_tensor(
|
||||
library::NumericTypeID type,
|
||||
library::LayoutTypeID layout_id,
|
||||
std::vector<int> const &extent,
|
||||
std::vector<int> const &stride,
|
||||
std::vector<int64_t> const &stride,
|
||||
int batch_count) {
|
||||
|
||||
DeviceAllocation *allocation =
|
||||
@ -133,7 +133,7 @@ DeviceAllocation *DeviceContext::allocate_sparsemeta_tensor(
|
||||
library::LayoutTypeID layout_id,
|
||||
library::NumericTypeID type_a,
|
||||
std::vector<int> const &extent,
|
||||
std::vector<int> const &stride,
|
||||
std::vector<int64_t> const &stride,
|
||||
int batch_count) {
|
||||
|
||||
DeviceAllocation *allocation =
|
||||
|
||||
@ -77,7 +77,7 @@ public:
|
||||
library::NumericTypeID type,
|
||||
library::LayoutTypeID layout_id,
|
||||
std::vector<int> const &extent,
|
||||
std::vector<int> const &stride = std::vector<int>(),
|
||||
std::vector<int64_t> const &stride = std::vector<int64_t>(),
|
||||
int batch_count = 1);
|
||||
|
||||
/// Allocates memory of a given type, capacity (elements), and name
|
||||
@ -87,7 +87,7 @@ public:
|
||||
library::NumericTypeID type,
|
||||
library::LayoutTypeID layout_id,
|
||||
std::vector<int> const &extent,
|
||||
std::vector<int> const &stride = std::vector<int>(),
|
||||
std::vector<int64_t> const &stride = std::vector<int64_t>(),
|
||||
int batch_count = 1);
|
||||
|
||||
/// Allocates memory for sparse meta data
|
||||
@ -98,7 +98,7 @@ public:
|
||||
library::LayoutTypeID layout_id,
|
||||
library::NumericTypeID type_a,
|
||||
std::vector<int> const &extent,
|
||||
std::vector<int> const &stride = std::vector<int>(),
|
||||
std::vector<int64_t> const &stride = std::vector<int64_t>(),
|
||||
int batch_count = 1);
|
||||
|
||||
/// Clears named allocations (but does not necessarily free memory)
|
||||
|
||||
Reference in New Issue
Block a user