v4.3 update. (#2709)

* v4.3 update. * Update the cute_dsl_api changelog's doc link * Update version to 4.3.0 * Update the example link * Update doc to encourage user to install DSL from requirements.txt --------- Co-authored-by: Larry Wu <larwu@nvidia.com>
2025-10-22 02:26:30 +08:00
parent e6e2cc29f5
commit b1d6e2c9b3
244 changed files with 59272 additions and 10455 deletions
--- a/examples/python/CuTeDSL/cute/ffi/CMakeLists.txt
+++ b/examples/python/CuTeDSL/cute/ffi/CMakeLists.txt
@ -30,11 +30,12 @@ cmake_minimum_required(VERSION 3.15)
 project(tensor)

 # Find Python
+find_package(Python COMPONENTS Interpreter Development REQUIRED)
 find_package(Python3 COMPONENTS Interpreter Development REQUIRED)

 # Get Python site-packages directory using Python
 execute_process(
-    COMMAND ${Python_EXECUTABLE} -c "import site; print(site.getsitepackages()[0])"
+    COMMAND ${Python3_EXECUTABLE} -c "import site; print(site.getsitepackages()[0])"
    OUTPUT_VARIABLE Python_SITE_PACKAGES
    OUTPUT_STRIP_TRAILING_WHITESPACE
 )
@ -45,7 +46,13 @@ message(STATUS "Python site-packages directory: ${Python_SITE_PACKAGES}")
 list(APPEND CMAKE_PREFIX_PATH ${Python_SITE_PACKAGES}/nanobind/cmake)

 # Find nanobind
-find_package(nanobind REQUIRED)
+find_package(nanobind)
+if(NOT nanobind_FOUND)
+    message(FATAL_ERROR 
+        "nanobind not found!\n"
+        "Please install nanobind with: pip install nanobind\n"
+    )
+endif()

 # Add the module
 nanobind_add_module(tensor tensor.cpp)
--- a/examples/python/CuTeDSL/cute/ffi/jit_argument.py
+++ b/examples/python/CuTeDSL/cute/ffi/jit_argument.py
@ -54,7 +54,6 @@ import cutlass.cute as cute

 from cutlass._mlir import ir
 from cutlass._mlir.dialects import llvm
-import cutlass._mlir.extras.types as T


 class ExampleTensorValue(ir.Value):
@ -244,7 +243,7 @@ import tempfile
 import torch


-def run_test(tmpdir=None):
+def run_test(tmpdir=None, cmake_args=""):
    # Skip cleanup if user provides tmpdir
    cleanup = tmpdir is None
    # Initialize temporary build directory
@ -253,7 +252,8 @@ def run_test(tmpdir=None):
    try:
        current_dir = os.path.dirname(os.path.abspath(__file__))

-        subprocess.run(["cmake", "-B", tmpdir, current_dir], check=True)
+        cmake_args = cmake_args.split()
+        subprocess.run(["cmake", "-B", tmpdir, current_dir] + cmake_args, check=True)
        subprocess.run(["cmake", "--build", tmpdir], check=True)

        sys.path.append(tmpdir)
@ -284,7 +284,10 @@ def run_test(tmpdir=None):
        # Execute compiled function
        compiled_func(tensor)
    except Exception as e:
-        print(e)
+        import traceback
+
+        traceback.print_exception(type(e), e, e.__traceback__)
+        raise e
    finally:
        if cleanup:
            # Clean up the temporary directory
@ -298,8 +301,17 @@ if __name__ == "__main__":
        description="Set temporary directory for building C modules"
    )
    parser.add_argument(
-        "--tmp-dir", type=str, help="Temporary directory path for building C modules"
+        "--tmp-dir",
+        type=str,
+        default=None,
+        help="Temporary directory path for building C modules",
+    )
+    parser.add_argument(
+        "--cmake-args",
+        type=str,
+        default="",
+        help="Extra CMake arguments for building C modules",
    )
    args = parser.parse_args()

-    run_test(args.tmp_dir)
+    run_test(tmpdir=args.tmp_dir, cmake_args=args.cmake_args)
--- a/examples/python/CuTeDSL/cute/torch_fake_tensor.py
+++ b/examples/python/CuTeDSL/cute/torch_fake_tensor.py
@ -0,0 +1,77 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import torch
+
+import cutlass.cute as cute
+from cutlass.cute.runtime import from_dlpack
+
+
+"""Example demonstrating how to use CuTe with PyTorch's FakeTensor mode.
+
+This example shows how to:
+1. Use PyTorch's FakeTensor mode to compile a CuTe function without real data
+2. Execute the compiled function on real data later
+
+FakeTensor mode allows compiling code without allocating real memory, which is useful
+for ahead-of-time compilation scenarios. The compiled function can then be executed
+on real tensors that match the expected shapes and dtypes.
+
+The primary goal of this example is to demonstrate how to use PyTorch's FakeTensor mode with CuTe
+to enable ahead-of-time compilation without allocating real data.
+
+The example:
+1. Creates a fake tensor in PyTorch using FakeTensor mode
+2. Compiles a CuTe function using the fake tensor without allocating real memory
+3. Creates a real tensor with matching shape and dtype
+4. Executes the compiled function on the real tensor
+
+To run this example:
+
+.. code-block:: bash
+
+    python examples/cute/torch_fake_tensor.py
+"""
+
+
+# Function decorated with cute.jit that forwards the given CuTe tensor to
+# cute.print_tensor. It is the function handed to cute.compile in the script
+# entry point of this file.
+@cute.jit
+def print_tensor(t: cute.Tensor):
+    cute.print_tensor(t)
+
+
+# Script entry point: compile print_tensor against a fake tensor, then call the
+# compiled function with a real tensor of the same shape and dtype.
+if __name__ == "__main__":
+    # Only imported when the file is executed as a script.
+    from torch._subclasses.fake_tensor import FakeTensorMode
+
+    shape = (3, 4)
+    # The tensor created inside FakeTensorMode is used solely as the example
+    # argument passed to cute.compile.
+    with FakeTensorMode():
+        fake_tensor = torch.zeros(shape, dtype=torch.float32)
+        compiled_fn = cute.compile(print_tensor, from_dlpack(fake_tensor))
+
+    # Outside the FakeTensorMode context: build a real tensor with the same
+    # shape and dtype and invoke the previously compiled function on it.
+    real_tensor = torch.randn(shape, dtype=torch.float32)
+    compiled_fn(from_dlpack(real_tensor))