Compare commits

...

2 Commits

Author SHA1 Message Date
44c704eae8 Doc updates for 3.2.2 2023-10-26 11:07:30 -07:00
6581237a48 fix issue/1138 2023-10-24 12:02:15 -07:00
4 changed files with 8 additions and 4 deletions

View File

@ -1,5 +1,8 @@
# NVIDIA CUTLASS Changelog
## [3.2.2](https://github.com/NVIDIA/cutlass/releases/tag/v3.2.2) (2023-10-25)
* Fixes an illegal memory access issue ([#1138](https://github.com/NVIDIA/cutlass/issues/1138)) hit by FlashAttention tests in PyTorch.
## [3.2.1](https://github.com/NVIDIA/cutlass/releases/tag/v3.2.1) (2023-09-22)
* Python support for SM90 Epilogue Visitor Tree (EVT) on top of the C++ support released in 3.2.0.
* SM80 EVT support in C++ and Python.

View File

@ -40,7 +40,7 @@ endif()
message(STATUS "CMake Version: ${CMAKE_VERSION}")
set(IMPLICIT_CMAKE_CXX_STANDARD OFF CACHE BOOL "Do not explicitly specify -std=c++11 if set")
project(CUTLASS VERSION 3.2.1 LANGUAGES CXX)
project(CUTLASS VERSION 3.2.2 LANGUAGES CXX)
include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake)
if (CUDA_VERSION VERSION_LESS 11.3)

View File

@ -62,6 +62,9 @@ CUTLASS 3.2.1 is an update to CUTLASS adding:
- SM90 rasterization direction support in the CUTLASS profiler.
- Improvement for CUTLASS profiler build times.
CUTLASS 3.2.2 is a minor update to CUTLASS adding:
- Bug fix for an illegal memory access issue hit by FlashAttention tests in PyTorch. See issue [#1138](https://github.com/NVIDIA/cutlass/issues/1138) for details.
Minimum requirements:
- Architecture: Volta

View File

@ -93,8 +93,6 @@ struct device_ptr
{
using value_type = T;
static const uint32_t ElementsPerStoredItem = sizeof(T) * 8 / sizeof_bits_v<T>;
CUTE_HOST_DEVICE constexpr
device_ptr(T* ptr) : ptr_(ptr) {}
@ -113,7 +111,7 @@ struct device_ptr
template <class Index>
CUTE_HOST_DEVICE constexpr
DerivedType operator+(Index const& i) const { return {ptr_ + i / ElementsPerStoredItem}; }
DerivedType operator+(Index const& i) const { return {ptr_ + i}; }
CUTE_HOST_DEVICE constexpr friend
ptrdiff_t operator-(device_ptr<T,DerivedType> const& a,