v4.2 tag release. (#2638)

This commit is contained in:
Junkai-Wu
2025-09-16 00:21:53 +08:00
committed by GitHub
parent 56f0718a97
commit 6a35b4d22f
161 changed files with 14056 additions and 3793 deletions

View File

@ -153,7 +153,7 @@ class GemmUniversalLauncher:
else:
data_cutlass = data_ref.transpose(-1, -2).contiguous()
data_cutlass = data_cutlass_cppgen.to("cuda")
data_cutlass = data_cutlass.to("cuda")
# As of this writing, few operations in PyTorch are supported with FP8 data.
# Thus, we perform computation in FP32 for FP8 reference checks.

View File

@ -240,8 +240,8 @@ class GemmErrorTests(unittest.TestCase):
"""
cc = device_cc()
        # F64 Tensor Core operations are only available on devices with CC >= 80
supports_tensorop_f64 = cc >= 80
        # F64 Tensor Core operations are only available on certain devices
supports_tensorop_f64 = cc in [80, 89, 90]
plan = cutlass_cppgen.op.Gemm(cc=cc, element=cutlass_cppgen.DataType.f64, layout=cutlass_cppgen.LayoutType.RowMajor)
error_msg = f'Incorrectly raised an exception for availability of TensorOp with F64 operands on SM{cc}'
@ -288,7 +288,7 @@ class GemmErrorTests(unittest.TestCase):
with ExpectException(cc < 80, f'Requested more than 2 stages on SM{cc}'):
td.stages = 3
plan.construct(td)
else:
elif cc == 90:
original_kschedule = td.kernel_schedule
original_eschedule = td.epilogue_schedule
with ExpectException(False, f'Incorrectly flagged an error for insufficient shared memory'):
@ -296,10 +296,13 @@ class GemmErrorTests(unittest.TestCase):
td.epilogue_schedule = cutlass_cppgen.EpilogueScheduleType.NoSmemWarpSpecialized
td.stages = 3
plan.construct(td)
# Reset schedules
td.kernel_schedule = original_kschedule
td.epilogue_schedule = original_eschedule
elif cc in [100, 101, 103]:
with ExpectException(False, f'Incorrectly flagged an error for insufficient shared memory'):
td.stages = 3
plan.construct(td)
with ExpectException(True, f'Requested too many stages'):
td.stages = 100
@ -321,12 +324,12 @@ class GemmErrorTests(unittest.TestCase):
td.epilogue_schedule = cutlass_cppgen.EpilogueScheduleType.TmaWarpSpecialized
plan.construct(td)
with ExpectException(True, f'Requested a non-auto kernel schedule with an auto epilogue schedule'):
with ExpectException(cc == 90, f'Requested a non-auto kernel schedule with an auto epilogue schedule'):
td.kernel_schedule = cutlass_cppgen.KernelScheduleType.TmaWarpSpecializedPingpong
td.epilogue_schedule = cutlass_cppgen.EpilogueScheduleType.ScheduleAuto
plan.construct(td)
with ExpectException(True, f'Requested an auto kernel schedule with a non-auto epilogue schedule'):
with ExpectException(cc == 90, f'Requested an auto kernel schedule with a non-auto epilogue schedule'):
td.kernel_schedule = cutlass_cppgen.KernelScheduleType.ScheduleAuto
td.epilogue_schedule = cutlass_cppgen.EpilogueScheduleType.TmaWarpSpecialized
plan.construct(td)