Updated mma_sm80.h to avoid perf penalty due to reinterpret_cast<>. (#100)

- Updated mma_sm80.h to avoid perf penalty due to reinterpret_cast<>.
- Enhanced the CUTLASS Utility Library's HostTensorPlanarComplex template to support copy-in and copy-out between caller-supplied host and device buffers (see the usage sketch below, ahead of the diff)
- Added test_examples target to build and test all CUTLASS examples
- Minor edits to documentation to point to the GTC 2020 webinar
Author: Andrew Kerr
Date: 2020-06-15 10:47:01 -07:00
Committed by: GitHub
Parent: 86931fef85
Commit: 1ab1027954
11 changed files with 213 additions and 33 deletions
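A minimal usage sketch of the new copy-in path follows; it assumes a float planar-complex tensor with a column-major layout and caller-owned device buffers. The extent, element type, and buffer names are illustrative only and are not part of this commit.

#include "cutlass/layout/matrix.h"
#include "cutlass/util/host_tensor_planar_complex.h"

// Load caller-supplied device planes (real and imaginary) into the tensor's host allocation.
void load_from_device(float const *device_real, float const *device_imag) {
  cutlass::HostTensorPlanarComplex<float, cutlass::layout::ColumnMajor> tensor({128, 96});

  // The default count of -1 transfers capacity() elements per plane;
  // a non-negative count is clamped to capacity().
  tensor.copy_in_device_to_host(device_real, device_imag);
}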

@@ -276,6 +276,9 @@ public:
  /// Gets pointer to device data with a pointer offset
  Element const * device_data_ptr_offset(LongIndex ptr_element_offset) const { return device_.get() + ptr_element_offset; }

  /// Gets a pointer to the device data imaginary part
  Element * device_data_imag() { return device_.get() + imaginary_stride(); }

  /// Accesses the tensor reference pointing to data
  TensorRef host_ref(LongIndex ptr_element_offset=0) {
    return TensorRef(host_data_ptr_offset(ptr_element_offset), layout_, imaginary_stride());
@@ -416,6 +419,166 @@ public:
        device_data(), host_data(), imaginary_stride() * 2);
    }
  }

  /// Copy data from a caller-supplied device pointer into host memory.
  void copy_in_device_to_host(
    Element const* ptr_device_real,     ///< source device memory
    Element const* ptr_device_imag,     ///< source device memory
    LongIndex count = -1) {             ///< number of elements to transfer; if negative, entire tensor is overwritten.

    if (count < 0) {
      count = capacity();
    }
    else {
      count = __NV_STD_MIN(capacity(), count);
    }

    device_memory::copy_to_host(
      host_data(), ptr_device_real, count);

    device_memory::copy_to_host(
      host_data_imag(), ptr_device_imag, count);
  }

  /// Copy data from a caller-supplied device pointer into device memory.
  void copy_in_device_to_device(
    Element const* ptr_device_real,     ///< source device memory
    Element const* ptr_device_imag,     ///< source device memory
    LongIndex count = -1) {             ///< number of elements to transfer; if negative, entire tensor is overwritten.

    if (count < 0) {
      count = capacity();
    }
    else {
      count = __NV_STD_MIN(capacity(), count);
    }

    device_memory::copy_device_to_device(
      device_data(), ptr_device_real, count);

    device_memory::copy_device_to_device(
      device_data_imag(), ptr_device_imag, count);
  }

  /// Copy data from a caller-supplied host pointer into device memory.
  void copy_in_host_to_device(
    Element const* ptr_host_real,       ///< source host memory
    Element const* ptr_host_imag,       ///< source host memory
    LongIndex count = -1) {             ///< number of elements to transfer; if negative, entire tensor is overwritten.

    if (count < 0) {
      count = capacity();
    }
    else {
      count = __NV_STD_MIN(capacity(), count);
    }

    device_memory::copy_to_device(
      device_data(), ptr_host_real, count);

    device_memory::copy_to_device(
      device_data_imag(), ptr_host_imag, count);
  }

  /// Copy data from a caller-supplied host pointer into host memory.
  void copy_in_host_to_host(
    Element const* ptr_host_real,       ///< source host memory
    Element const* ptr_host_imag,       ///< source host memory
    LongIndex count = -1) {             ///< number of elements to transfer; if negative, entire tensor is overwritten.

    if (count < 0) {
      count = capacity();
    }
    else {
      count = __NV_STD_MIN(capacity(), count);
    }

    device_memory::copy_host_to_host(
      host_data(), ptr_host_real, count);

    device_memory::copy_host_to_host(
      host_data_imag(), ptr_host_imag, count);
  }

  /// Copy data from device memory to a caller-supplied host pointer.
  void copy_out_device_to_host(
    Element * ptr_host_real,            ///< destination host memory
    Element * ptr_host_imag,            ///< destination host memory
    LongIndex count = -1) const {       ///< number of elements to transfer; if negative, entire tensor is copied.

    if (count < 0) {
      count = capacity();
    }
    else {
      count = __NV_STD_MIN(capacity(), count);
    }

    device_memory::copy_to_host(
      ptr_host_real, device_data(), count);

    device_memory::copy_to_host(
      ptr_host_imag, device_data_imag(), count);
  }

  /// Copy data from device memory to a caller-supplied device pointer.
  void copy_out_device_to_device(
    Element * ptr_device_real,          ///< destination device memory
    Element * ptr_device_imag,          ///< destination device memory
    LongIndex count = -1) const {       ///< number of elements to transfer; if negative, entire tensor is copied.

    if (count < 0) {
      count = capacity();
    }
    else {
      count = __NV_STD_MIN(capacity(), count);
    }

    device_memory::copy_device_to_device(
      ptr_device_real, device_data(), count);

    device_memory::copy_device_to_device(
      ptr_device_imag, device_data_imag(), count);
  }

  /// Copy data from host memory to a caller-supplied device pointer.
  void copy_out_host_to_device(
    Element * ptr_device_real,          ///< destination device memory
    Element * ptr_device_imag,          ///< destination device memory
    LongIndex count = -1) const {       ///< number of elements to transfer; if negative, entire tensor is copied.

    if (count < 0) {
      count = capacity();
    }
    else {
      count = __NV_STD_MIN(capacity(), count);
    }

    device_memory::copy_to_device(
      ptr_device_real, host_data(), count);

    device_memory::copy_to_device(
      ptr_device_imag, host_data_imag(), count);
  }

  /// Copy data from host memory to a caller-supplied host pointer.
  void copy_out_host_to_host(
    Element * ptr_host_real,            ///< destination host memory
    Element * ptr_host_imag,            ///< destination host memory
    LongIndex count = -1) const {       ///< number of elements to transfer; if negative, entire tensor is copied.

    if (count < 0) {
      count = capacity();
    }
    else {
      count = __NV_STD_MIN(capacity(), count);
    }

    device_memory::copy_host_to_host(
      ptr_host_real, host_data(), count);

    device_memory::copy_host_to_host(
      ptr_host_imag, host_data_imag(), count);
  }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
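The copy-out direction mirrors the sketch above. Under the same assumptions, the device-side planes can be pulled into caller-owned host buffers; the helper name and buffer sizes below are illustrative, with capacity() taken to be the per-plane element count as in the diff.

#include <vector>

#include "cutlass/layout/matrix.h"
#include "cutlass/util/host_tensor_planar_complex.h"

// Retrieve both device planes into host vectors sized to the per-plane capacity.
void store_to_host(cutlass::HostTensorPlanarComplex<float, cutlass::layout::ColumnMajor> const &tensor) {
  std::vector<float> real(tensor.capacity());
  std::vector<float> imag(tensor.capacity());

  tensor.copy_out_device_to_host(real.data(), imag.data());
}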