v3.9 update (#2203)

* v3.9 update * voidD --------- Co-authored-by: yuzhai <yuzhai@nvidia.com>
2025-04-02 12:11:18 -07:00
parent 62750a2b75
commit 6f4921858b
129 changed files with 7719 additions and 2036 deletions
--- a/python/cutlass/__init__.py
+++ b/python/cutlass/__init__.py
@@ -134,7 +134,7 @@ def get_option_registry():
        this._option_registry = OptionRegistry(device_cc())
    return this._option_registry

-this.__version__ = '3.8.0'
+this.__version__ = '3.9.0'

 from cutlass.backend import create_memory_pool
 from cutlass.emit.pytorch import pytorch
--- a/python/cutlass_library/emit_kernel_listing.py
+++ b/python/cutlass_library/emit_kernel_listing.py
@@ -282,6 +282,8 @@ def _computeFlopsPerByte(operation, m, n, k, batch_count=1, beta=0.0):
 def emit_gemm_kernel_testlist(manifest, curr_build_dir, arch, mode
                              ):
  profiler_reference_computing = "--verification-providers=device --providers=cutlass"
+  
+
  # beta values for L0 and L1
  # TODO: randomize beta values for wider coverage
  beta_values = [0.5]
--- a/python/cutlass_library/generator.py
+++ b/python/cutlass_library/generator.py
@@ -10025,7 +10025,8 @@ def GenerateSM120_TensorOp_fp4_UMMA_gemm_with_block_scaled(manifest, cuda_versio

  tile_sizes_cooperative = [
    [128, 128, 128],
-    [128, 128, 256]
+    [128, 128, 256],
+    [256, 128, 128]
  ]

  tile_sizes_pingpong = [
--- a/python/setup_library.py
+++ b/python/setup_library.py
@@ -36,7 +36,7 @@ from setuptools import setup
 def perform_setup():
    setup(
        name='cutlass_library',
-        version='3.8.0',
+        version='3.9.0',
        description='CUTLASS library generation scripts',
        packages=['cutlass_library']
    )
--- a/python/setup_pycute.py
+++ b/python/setup_pycute.py
@@ -36,7 +36,7 @@ from setuptools import setup
 def perform_setup():
    setup(
        name='pycute',
-        version='3.8.0',
+        version='3.9.0',
        description='Python implementation of CuTe',
        packages=['pycute'],
    )