diff --git a/CHANGELOG.md b/CHANGELOG.md index 7dbea286..9ca90d8e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # NVIDIA CUTLASS Changelog +## [3.9.2](https://github.com/NVIDIA/cutlass/releases/tag/v3.9.2) (2025-05-03) + +* Fixed [Blockwise](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) and [Groupwise](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) GEMM hang issue when problem size K is 128. +* Optimal code generation with CUDA toolkit version 12.9. + + ## [3.9.1](https://github.com/NVIDIA/cutlass/releases/tag/v3.9.1) (2025-04-30) * Fixed Group Gemm hang issue in CUTLASS 3.x diff --git a/README.md b/README.md index 63bd4d41..26ec3abd 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ ![ALT](./media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition") -# CUTLASS 3.9.1 +# CUTLASS 3.9.2 -_CUTLASS 3.9.1 - April 2025_ +_CUTLASS 3.9.2 - May 2025_ CUTLASS is a collection of CUDA C++ template abstractions for implementing high-performance matrix-matrix multiplication (GEMM) and related computations at all levels diff --git a/include/cutlass/version.h b/include/cutlass/version.h index 304f9abf..a2880049 100644 --- a/include/cutlass/version.h +++ b/include/cutlass/version.h @@ -36,7 +36,7 @@ #define CUTLASS_MAJOR 3 #define CUTLASS_MINOR 9 -#define CUTLASS_PATCH 1 +#define CUTLASS_PATCH 2 #ifdef CUTLASS_VERSIONS_GENERATED #include "cutlass/version_extended.h" diff --git a/python/cutlass/__init__.py b/python/cutlass/__init__.py index 762b06c3..6cbc9eef 100644 --- a/python/cutlass/__init__.py +++ b/python/cutlass/__init__.py @@ -133,7 +133,7 @@ def get_option_registry(): this._option_registry = OptionRegistry(device_cc()) return this._option_registry -this.__version__ = '3.9.1' +this.__version__ = '3.9.2' from cutlass.backend import create_memory_pool 
from cutlass.emit.pytorch import pytorch diff --git a/python/setup_library.py b/python/setup_library.py index a4583805..8262e5a7 100644 --- a/python/setup_library.py +++ b/python/setup_library.py @@ -36,7 +36,7 @@ from setuptools import setup def perform_setup(): setup( name='cutlass_library', - version='3.9.1', + version='3.9.2', description='CUTLASS library generation scripts', packages=['cutlass_library'] ) diff --git a/python/setup_pycute.py b/python/setup_pycute.py index 0be2f108..cb945049 100644 --- a/python/setup_pycute.py +++ b/python/setup_pycute.py @@ -36,7 +36,7 @@ from setuptools import setup def perform_setup(): setup( name='pycute', - version='3.9.1', + version='3.9.2', description='Python implementation of CuTe', packages=['pycute'], )