CUTLASS 3.7 (#2045)
* CUTLASS 3.7 * clean up changelog --------- Co-authored-by: yuzhai <yuzhai@nvidia.com> Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
This commit is contained in:
@ -238,7 +238,7 @@ of tests run may vary over time as more are added.
|
||||
|
||||
# Copyright
|
||||
|
||||
Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
```
|
||||
|
||||
@ -442,7 +442,7 @@ as the mappings are not always bijective.
|
||||
|
||||
# Copyright
|
||||
|
||||
Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
```
|
||||
|
||||
@ -38,7 +38,7 @@ has a variety of examples.
|
||||
|
||||
# Copyright
|
||||
|
||||
Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
```
|
||||
|
||||
@ -257,7 +257,7 @@ targeting NVIDIA GPUs.
|
||||
|
||||
# Copyright
|
||||
|
||||
Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
```
|
||||
|
||||
@ -281,7 +281,7 @@ CUDA exposes warp-level matrix operations in the CUDA C++ WMMA API. The CUDA C++
|
||||
|
||||
# Copyright
|
||||
|
||||
Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
```
|
||||
|
||||
@ -345,7 +345,7 @@ support on current and future NVIDIA GPUs.
|
||||
|
||||
# Copyright
|
||||
|
||||
Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
```
|
||||
|
||||
@ -542,7 +542,7 @@ to inline PTX.
|
||||
|
||||
# Copyright
|
||||
|
||||
Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
```
|
||||
|
||||
@ -128,6 +128,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
|
||||
|
||||
// Step 2: Specify the collective layer epilogue type
|
||||
using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue<
|
||||
ElementC,
|
||||
cutlass::gemm::TagToStrideC_t<LayoutC>,
|
||||
cutlass::gemm::TagToStrideC_t<LayoutC>,
|
||||
cutlass::epilogue::thread::LinearCombination<ElementC, 1, ElementAccumulator, ElementAccumulator>>;
|
||||
@ -673,7 +674,7 @@ please refer to CuTe's tutorial, e.g., the sections on
|
||||
|
||||
# Copyright
|
||||
|
||||
Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
```
|
||||
|
||||
@ -761,7 +761,7 @@ Convolution can also be run by the CUTLASS Profiler.
|
||||
|
||||
# Copyright
|
||||
|
||||
Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
```
|
||||
|
||||
@ -272,7 +272,7 @@ Permuted Shared Memory Layouts:
|
||||
|
||||
# Copyright
|
||||
|
||||
Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
```
|
||||
|
||||
@ -179,7 +179,7 @@ for more details.
|
||||
|
||||
# Copyright
|
||||
|
||||
Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
```
|
||||
|
||||
@ -159,9 +159,17 @@ Profiling:
|
||||
capacity of the last-level cache.
|
||||
|
||||
--profiling-iterations=<iterations> Number of iterations to profile each kernel. If zero, kernels
|
||||
are launched up to the profiling duration.
|
||||
are launched up to the profiling duration. If non-zero, this
|
||||
overrides `profiling-duration` and `min-iterations`.
|
||||
|
||||
--warmup-iterations=<iterations> Number of iterations to execute each kernel prior to profiling.
|
||||
--profiling-duration=<duration> Time to spend profiling each kernel (ms). Overridden by
|
||||
`profiling-iterations` when `profiling-iterations` != 0.
|
||||
Note that `min-iterations` must also be satisfied.
|
||||
|
||||
--min-iterations=<iterations> Minimum number of iterations to spend profiling each kernel, even if
|
||||
`profiling-duration` has been met.
|
||||
|
||||
--warmup-iterations=<iterations> Number of iterations to execute each kernel prior to profiling (default: 10).
|
||||
|
||||
--sleep-duration=<duration> Time to sleep between profiling periods (ms).
|
||||
|
||||
@ -624,7 +632,7 @@ reference_device: Passed
|
||||
|
||||
# Copyright
|
||||
|
||||
Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
```
|
||||
|
||||
@ -1161,7 +1161,7 @@ However, A is certainly M major if interpreted as a matrix.
|
||||
|
||||
# Copyright
|
||||
|
||||
Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
```
|
||||
|
||||
@ -655,7 +655,7 @@ $ cmake .. -DCUTLASS_NVCC_ARCHS='70;75;80' -DCUTLASS_LIBRARY_KERNELS=tensorop*s*
|
||||
|
||||
# Copyright
|
||||
|
||||
Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
```
|
||||
|
||||
@ -82,7 +82,7 @@ replaced by [MMA and Copy atoms from CuTe](/media/docs/cute/0t_mma_atom.md).
|
||||
|
||||
# Copyright
|
||||
|
||||
Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
```
|
||||
|
||||
@ -473,7 +473,7 @@ struct WriteableReadableRandomAccessContiguousTileIteratorConcept {
|
||||
|
||||
# Copyright
|
||||
|
||||
Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
```
|
||||
|
||||
@ -434,7 +434,7 @@ Please note that `synclog` is an experimental feature, and its functionality is
|
||||
|
||||
# Copyright
|
||||
|
||||
Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
```
|
||||
|
||||
Reference in New Issue
Block a user