v4.3 update. (#2709)

* v4.3 update.

* Update the cute_dsl_api changelog's doc link

* Update version to 4.3.0

* Update the example link

* Update doc to encourage user to install DSL from requirements.txt

---------

Co-authored-by: Larry Wu <larwu@nvidia.com>
This commit is contained in:
Junkai-Wu
2025-10-22 02:26:30 +08:00
committed by GitHub
parent e6e2cc29f5
commit b1d6e2c9b3
244 changed files with 59272 additions and 10455 deletions

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -43,7 +43,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -69,24 +69,9 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor(raw_ptr(0x000000000736b0c0: f32, generic, align<4>) o (8,5):(5,1), data=\n",
" [[ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ],\n",
" [ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ],\n",
" [ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ],\n",
" ...\n",
" [ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ],\n",
" [ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ],\n",
" [ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ]])\n"
]
}
],
"outputs": [],
"source": [
"import torch\n",
"\n",
@@ -115,12 +100,13 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from cutlass.cute.runtime import from_dlpack\n",
"\n",
"\n",
"@cute.jit\n",
"def print_tensor_dlpack(src: cute.Tensor):\n",
" print(src)\n",
@@ -129,25 +115,9 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor<ptr<f32, generic> o (8,5):(5,1)>\n",
"tensor(raw_ptr(0x0000000007559340: f32, generic, align<4>) o (8,5):(5,1), data=\n",
" [[-1.151769, 1.019397, -0.371175, -0.717776, 0.502176, ],\n",
" [ 0.114282, 0.900084, 0.320770, 1.564574, -0.632329, ],\n",
" [-0.570140, 0.178112, -0.423079, 1.936198, 0.003355, ],\n",
" ...\n",
" [-2.425393, -0.275528, 1.267157, -0.811101, -0.985456, ],\n",
" [ 0.777889, -2.114074, 0.357184, -0.321312, -0.938138, ],\n",
" [ 1.959564, 1.797602, 0.116901, 0.306198, -1.837295, ]])\n"
]
}
],
"outputs": [],
"source": [
"a = torch.randn(8, 5, dtype=torch_dtype(cutlass.Float32))\n",
"\n",
@@ -156,25 +126,9 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor<ptr<f32, generic> o (8,8):(8,1)>\n",
"tensor(raw_ptr(0x0000000007979da0: f32, generic, align<4>) o (8,8):(8,1), data=\n",
" [[ 0.122739, -0.605744, -1.442022, ..., -0.356501, -0.993329, -0.091110, ],\n",
" [ 0.278448, 0.318482, -0.276867, ..., 1.542181, -1.701539, -0.309454, ],\n",
" [ 0.563565, -0.753936, 0.131214, ..., 0.437912, -0.482277, -0.051540, ],\n",
" ...\n",
" [-1.974096, -0.177881, 0.426807, ..., -1.579115, -0.304974, 0.451164, ],\n",
" [ 0.149851, -0.704689, -0.295063, ..., -0.653001, 0.008871, 0.903916, ],\n",
" [ 1.188619, 1.519662, 1.270734, ..., 0.404082, 0.173200, 0.093476, ]])\n"
]
}
],
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
@@ -211,39 +165,23 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"a[2] = 10.000000 (equivalent to a[(2,0)])\n",
"a[9] = 6.000000 (equivalent to a[(1,1)])\n",
"a[2,0] = 10.000000\n",
"a[2,4] = 14.000000\n",
"a[(2,4)] = 14.000000\n",
"a[2,3] = 100.000000\n",
"a[(2,4)] = 101.000000\n",
"tensor([[ 0., 1., 2., 3., 4.],\n",
" [ 5., 6., 7., 8., 9.],\n",
" [ 10., 11., 12., 100., 101.],\n",
" [ 15., 16., 17., 18., 19.],\n",
" [ 20., 21., 22., 23., 24.],\n",
" [ 25., 26., 27., 28., 29.],\n",
" [ 30., 31., 32., 33., 34.],\n",
" [ 35., 36., 37., 38., 39.]])\n"
]
}
],
"outputs": [],
"source": [
"@cute.jit\n",
"def tensor_access_item(a: cute.Tensor):\n",
" # access data using linear index\n",
" cute.printf(\"a[2] = {} (equivalent to a[{}])\", a[2],\n",
" cute.make_identity_tensor(a.layout.shape)[2])\n",
" cute.printf(\"a[9] = {} (equivalent to a[{}])\", a[9],\n",
" cute.make_identity_tensor(a.layout.shape)[9])\n",
" cute.printf(\n",
" \"a[2] = {} (equivalent to a[{}])\",\n",
" a[2],\n",
" cute.make_identity_tensor(a.layout.shape)[2],\n",
" )\n",
" cute.printf(\n",
" \"a[9] = {} (equivalent to a[{}])\",\n",
" a[9],\n",
" cute.make_identity_tensor(a.layout.shape)[9],\n",
" )\n",
"\n",
" # access data using n-d coordinates, following two are equivalent\n",
" cute.printf(\"a[2,0] = {}\", a[2, 0])\n",
@@ -251,14 +189,14 @@
" cute.printf(\"a[(2,4)] = {}\", a[2, 4])\n",
"\n",
" # assign value to tensor@(2,4)\n",
" a[2,3] = 100.0\n",
" a[2,4] = 101.0\n",
" cute.printf(\"a[2,3] = {}\", a[2,3])\n",
" cute.printf(\"a[(2,4)] = {}\", a[(2,4)])\n",
" a[2, 3] = 100.0\n",
" a[2, 4] = 101.0\n",
" cute.printf(\"a[2,3] = {}\", a[2, 3])\n",
" cute.printf(\"a[(2,4)] = {}\", a[(2, 4)])\n",
"\n",
"\n",
"# Create a tensor with sequential data using torch\n",
"data = torch.arange(0, 8*5, dtype=torch.float32).reshape(8, 5)\n",
"data = torch.arange(0, 8 * 5, dtype=torch.float32).reshape(8, 5)\n",
"tensor_access_item(from_dlpack(data))\n",
"\n",
"print(data)"
@@ -287,14 +225,17 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### Coordinate Tensor\n",
"## Coordinate Tensors\n",
"\n",
"A coordinate tensor is a special type of tensor that maps coordinates to coordinates rather than to values. \n",
"The key distinction is that while regular tensors map coordinates to some value type (like numbers), \n",
"coordinate tensors map coordinates to other coordinates.\n",
"### Definition and Properties\n",
"\n",
"For example, given a shape (4,4), a coordinate tensor using row-major layout would appear as:\n",
"A coordinate tensor $T: Z^n → Z^m$ is a mathematical structure that establishes a mapping between coordinate spaces. Unlike standard tensors that map coordinates to scalar values, coordinate tensors map coordinates to other coordinates, forming a fundamental building block for tensor operations and transformations.\n",
"\n",
"### Examples\n",
"\n",
"Consider a `(4,4)` coordinate tensor:\n",
"\n",
"**Row-Major Layout (C-style):**\n",
"\\begin{bmatrix} \n",
"(0,0) & (0,1) & (0,2) & (0,3) \\\\\n",
"(1,0) & (1,1) & (1,2) & (1,3) \\\\\n",
@@ -302,8 +243,7 @@
"(3,0) & (3,1) & (3,2) & (3,3)\n",
"\\end{bmatrix}\n",
"\n",
"The same shape with a column-major layout would appear as:\n",
"\n",
"**Column-Major Layout (Fortran-style):**\n",
"\\begin{bmatrix}\n",
"(0,0) & (1,0) & (2,0) & (3,0) \\\\\n",
"(0,1) & (1,1) & (2,1) & (3,1) \\\\\n",
@@ -311,40 +251,50 @@
"(0,3) & (1,3) & (2,3) & (3,3)\n",
"\\end{bmatrix}\n",
"\n",
"The key points about coordinate tensors are:\n",
"- Each element in the tensor is itself a coordinate tuple (i,j) rather than a scalar value\n",
"- The coordinates map to themselves - so position (1,2) contains the coordinate (1,2)\n",
"- The layout (row-major vs column-major) determines how these coordinate tuples are arranged in memory\n",
"### Identity Tensor\n",
"\n",
"For example, coordinate tensors can be created using the `make_identity_tensor` utility:\n",
"An identity tensor $I$ is a special case of a coordinate tensor that implements the identity mapping function:\n",
"\n",
"**Definition:**\n",
"For a given shape $S = (s_1, s_2, ..., s_n)$, the identity tensor $I$ satisfies: $I(c) = c, \\forall c \\in \\prod_{i=1}^n [0, s_i)$\n",
"\n",
"**Properties:**\n",
"1. **Bijective Mapping**: The identity tensor establishes a one-to-one correspondence between coordinates.\n",
"2. **Layout Invariance**: The logical structure remains constant regardless of the underlying memory layout.\n",
"3. **Coordinate Preservation**: For any coordinate c, I(c) = c.\n",
"\n",
"\n",
"CuTe establishes an isomorphism between 1-D indices and N-D coordinates through lexicographical ordering. For a coordinate c = (c₁, c₂, ..., cₙ) in an identity tensor with shape S = (s₁, s₂, ..., sₙ):\n",
"\n",
"**Linear Index Formula:**\n",
"$\\text{idx} = c_1 + \\sum_{i=2}^{n} \\left(c_i \\prod_{j=1}^{i-1} s_j\\right)$\n",
"\n",
"**Example:**\n",
"```python\n",
"# Create an identity tensor from a given shape\n",
"coord_tensor = make_identity_tensor(layout.shape())\n",
"\n",
"# Access coordinate using linear index\n",
"coord = coord_tensor[linear_idx] # Returns the N-D coordinate\n",
"```\n",
"\n",
"This creates a tensor that maps each coordinate to itself, providing a reference point for understanding how other layouts transform these coordinates."
"This bidirectional mapping enables efficient conversion from linear indices to N-dimensional coordinates, facilitating tensor operations and memory access patterns."
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor<(0,0) o (8,4):(1@0,1@1)>\n"
]
}
],
"outputs": [],
"source": [
"@cute.jit\n",
"def print_tensor_coord(a: cute.Tensor):\n",
" coord_tensor = cute.make_identity_tensor(a.layout.shape)\n",
" print(coord_tensor)\n",
" cute.print_tensor(coord_tensor)\n",
"\n",
"a = torch.randn(8,4, dtype=torch_dtype(cutlass.Float32))\n",
"\n",
"a = torch.randn(8, 4, dtype=torch_dtype(cutlass.Float32))\n",
"print_tensor_coord(from_dlpack(a))"
]
}