diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8c13cf1f..5125c1d3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,21 @@
 
 # CUTLASS 4.x
 
+# CUTLASS 4.2.1
+## [4.2.1](https://github.com/NVIDIA/cutlass/releases/tag/v4.2.1) (2025-09-22)
+
+### CuTe DSL
+* Bug fixes and improvements
+    - Fixed an issue when running DSL code with cuda-python 13.0
+    - Fixed an issue when running inductor with DSL code
+    - Fixed an issue with unexpected logging when running DSL code in FlashInfer
+    - Fixed the issue reported in https://github.com/NVIDIA/cutlass/issues/2647
+    - Fixed an issue with conditionally defining variables outside of dynamic control flow
+
+### CUTLASS C++
+* Bypass EVT for nosmem blockwise kernels on Blackwell.
+* Rename cutlass/python/cutlass directory to cutlass/python/cutlass_cppgen.
+
 ## [4.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v4.2.0) (2025-09-15)
 
 ### CuTe DSL
diff --git a/README.md b/README.md
index 335189dd..8ce2151a 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,9 @@
 ![ALT](./media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")
 # Overview
 
-# CUTLASS 4.2.0
+# CUTLASS 4.2.1
 
-_CUTLASS 4.2.0 - Sept 2025_
+_CUTLASS 4.2.1 - Sept 2025_
 
 CUTLASS is a collection of abstractions for implementing high-performance matrix-matrix multiplication (GEMM)
 and related computations at all levels and scales within CUDA. It incorporates strategies for
@@ -224,7 +224,10 @@ CUTLASS runs successfully on the following NVIDIA GPUs, and it is expected to be
 |NVIDIA H100 Tensor Core GPU            |9.0|11.8|
 |NVIDIA H200 Tensor Core GPU            |9.0|11.8|
 |NVIDIA B200 Tensor Core GPU            |10.0|12.8|
+|NVIDIA B300 Tensor Core GPU            |10.3|13.0|
+|NVIDIA DRIVE Thor                      |11.0|13.0|
 |NVIDIA GeForce RTX 50x0 series         |12.0|12.8|
+|NVIDIA DGX Spark                       |12.1|13.0|
 
 ## Target Architecture
 
diff --git a/include/cutlass/version.h b/include/cutlass/version.h
index ce93ed7c..57a73a5f 100644
--- a/include/cutlass/version.h
+++ b/include/cutlass/version.h
@@ -36,7 +36,7 @@
 
 #define CUTLASS_MAJOR 4
 #define CUTLASS_MINOR 2
-#define CUTLASS_PATCH 0 
+#define CUTLASS_PATCH 1
 
 #ifdef CUTLASS_VERSIONS_GENERATED
 #include "cutlass/version_extended.h"
diff --git a/python/cutlass_cppgen/__init__.py b/python/cutlass_cppgen/__init__.py
index 35507d2a..9bdd259c 100644
--- a/python/cutlass_cppgen/__init__.py
+++ b/python/cutlass_cppgen/__init__.py
@@ -133,7 +133,7 @@ def get_option_registry():
         this._option_registry = OptionRegistry(device_cc())
     return this._option_registry
 
-this.__version__ = '4.2.0'
+this.__version__ = '4.2.1'
 
 from cutlass_cppgen.backend import create_memory_pool
 from cutlass_cppgen.emit.pytorch import pytorch
diff --git a/python/setup_library.py b/python/setup_library.py
index 75ae8ec0..c56d6b55 100644
--- a/python/setup_library.py
+++ b/python/setup_library.py
@@ -36,7 +36,7 @@ from setuptools import setup
 def perform_setup():
     setup(
         name='cutlass_library',
-        version='4.2.0',
+        version='4.2.1',
         description='CUTLASS library generation scripts',
         packages=['cutlass_library']
     )
diff --git a/python/setup_pycute.py b/python/setup_pycute.py
index 79acef3d..0bad050f 100644
--- a/python/setup_pycute.py
+++ b/python/setup_pycute.py
@@ -36,7 +36,7 @@ from setuptools import setup
 def perform_setup():
     setup(
         name='pycute',
-        version='4.2.0',
+        version='4.2.1',
         description='Python implementation of CuTe',
         packages=['pycute'],
     )