diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c13cf1f..5125c1d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,21 @@ # CUTLASS 4.x +# CUTLASS 4.2.1 +## [4.2.1](https://github.com/NVIDIA/cutlass/releases/tag/v4.2.1) (2025-09-22) + +### CuTe DSL +* Bug fixes and improvements + - Fixed an issue when running DSL codes with cuda-python 13.0 + - Fixed an issue when running inductor with DSL codes + - Fixed an issue with unexpected logging when running DSL codes in FlashInfer + - Fixed the issue reported in https://github.com/NVIDIA/cutlass/issues/2647 + - Fixed an issue with conditional definition of variables outside of dynamic control flow + +### CUTLASS C++ +* Bypass EVT for nosmem blockwise kernels on Blackwell. +* Rename cutlass/python/cutlass directory to cutlass/python/cutlass_cppgen. + ## [4.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v4.2.0) (2025-09-15) ### CuTe DSL diff --git a/README.md b/README.md index 335189dd..8ce2151a 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ ![ALT](./media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition") # Overview -# CUTLASS 4.2.0 +# CUTLASS 4.2.1 -_CUTLASS 4.2.0 - Sept 2025_ +_CUTLASS 4.2.1 - Sept 2025_ CUTLASS is a collection of abstractions for implementing high-performance matrix-matrix multiplication (GEMM) and related computations at all levels and scales within CUDA. 
It incorporates strategies for @@ -224,7 +224,10 @@ CUTLASS runs successfully on the following NVIDIA GPUs, and it is expected to be |NVIDIA H100 Tensor Core GPU |9.0|11.8| |NVIDIA H200 Tensor Core GPU |9.0|11.8| |NVIDIA B200 Tensor Core GPU |10.0|12.8| +|NVIDIA B300 Tensor Core GPU |10.3|13.0| +|NVIDIA DRIVE Thor |11.0|13.0| |NVIDIA GeForce RTX 50x0 series |12.0|12.8| +|NVIDIA DGX Spark |12.1|13.0| ## Target Architecture diff --git a/include/cutlass/version.h b/include/cutlass/version.h index ce93ed7c..57a73a5f 100644 --- a/include/cutlass/version.h +++ b/include/cutlass/version.h @@ -36,7 +36,7 @@ #define CUTLASS_MAJOR 4 #define CUTLASS_MINOR 2 -#define CUTLASS_PATCH 0 +#define CUTLASS_PATCH 1 #ifdef CUTLASS_VERSIONS_GENERATED #include "cutlass/version_extended.h" diff --git a/python/cutlass_cppgen/__init__.py b/python/cutlass_cppgen/__init__.py index 35507d2a..9bdd259c 100644 --- a/python/cutlass_cppgen/__init__.py +++ b/python/cutlass_cppgen/__init__.py @@ -133,7 +133,7 @@ def get_option_registry(): this._option_registry = OptionRegistry(device_cc()) return this._option_registry -this.__version__ = '4.2.0' +this.__version__ = '4.2.1' from cutlass_cppgen.backend import create_memory_pool from cutlass_cppgen.emit.pytorch import pytorch diff --git a/python/setup_library.py b/python/setup_library.py index 75ae8ec0..c56d6b55 100644 --- a/python/setup_library.py +++ b/python/setup_library.py @@ -36,7 +36,7 @@ from setuptools import setup def perform_setup(): setup( name='cutlass_library', - version='4.2.0', + version='4.2.1', description='CUTLASS library generation scripts', packages=['cutlass_library'] ) diff --git a/python/setup_pycute.py b/python/setup_pycute.py index 79acef3d..0bad050f 100644 --- a/python/setup_pycute.py +++ b/python/setup_pycute.py @@ -36,7 +36,7 @@ from setuptools import setup def perform_setup(): setup( name='pycute', - version='4.2.0', + version='4.2.1', description='Python implementation of CuTe', packages=['pycute'], )