v4.3 update. (#2709)

* v4.3 update.

* Update the cute_dsl_api changelog's doc link

* Update version to 4.3.0

* Update the example link

* Update doc to encourage user to install DSL from requirements.txt

---------

Co-authored-by: Larry Wu <larwu@nvidia.com>
This commit is contained in:
Junkai-Wu
2025-10-22 02:26:30 +08:00
committed by GitHub
parent e6e2cc29f5
commit b1d6e2c9b3
244 changed files with 59272 additions and 10455 deletions

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -43,7 +43,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -69,24 +69,9 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor(raw_ptr(0x000000000736b0c0: f32, generic, align<4>) o (8,5):(5,1), data=\n",
" [[ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ],\n",
" [ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ],\n",
" [ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ],\n",
" ...\n",
" [ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ],\n",
" [ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ],\n",
" [ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ]])\n"
]
}
],
"outputs": [],
"source": [
"import torch\n",
"\n",
@@ -115,12 +100,13 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from cutlass.cute.runtime import from_dlpack\n",
"\n",
"\n",
"@cute.jit\n",
"def print_tensor_dlpack(src: cute.Tensor):\n",
" print(src)\n",
@@ -129,25 +115,9 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor<ptr<f32, generic> o (8,5):(5,1)>\n",
"tensor(raw_ptr(0x0000000007559340: f32, generic, align<4>) o (8,5):(5,1), data=\n",
" [[-1.151769, 1.019397, -0.371175, -0.717776, 0.502176, ],\n",
" [ 0.114282, 0.900084, 0.320770, 1.564574, -0.632329, ],\n",
" [-0.570140, 0.178112, -0.423079, 1.936198, 0.003355, ],\n",
" ...\n",
" [-2.425393, -0.275528, 1.267157, -0.811101, -0.985456, ],\n",
" [ 0.777889, -2.114074, 0.357184, -0.321312, -0.938138, ],\n",
" [ 1.959564, 1.797602, 0.116901, 0.306198, -1.837295, ]])\n"
]
}
],
"outputs": [],
"source": [
"a = torch.randn(8, 5, dtype=torch_dtype(cutlass.Float32))\n",
"\n",
@@ -156,25 +126,9 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor<ptr<f32, generic> o (8,8):(8,1)>\n",
"tensor(raw_ptr(0x0000000007979da0: f32, generic, align<4>) o (8,8):(8,1), data=\n",
" [[ 0.122739, -0.605744, -1.442022, ..., -0.356501, -0.993329, -0.091110, ],\n",
" [ 0.278448, 0.318482, -0.276867, ..., 1.542181, -1.701539, -0.309454, ],\n",
" [ 0.563565, -0.753936, 0.131214, ..., 0.437912, -0.482277, -0.051540, ],\n",
" ...\n",
" [-1.974096, -0.177881, 0.426807, ..., -1.579115, -0.304974, 0.451164, ],\n",
" [ 0.149851, -0.704689, -0.295063, ..., -0.653001, 0.008871, 0.903916, ],\n",
" [ 1.188619, 1.519662, 1.270734, ..., 0.404082, 0.173200, 0.093476, ]])\n"
]
}
],
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
@@ -211,39 +165,23 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"a[2] = 10.000000 (equivalent to a[(2,0)])\n",
"a[9] = 6.000000 (equivalent to a[(1,1)])\n",
"a[2,0] = 10.000000\n",
"a[2,4] = 14.000000\n",
"a[(2,4)] = 14.000000\n",
"a[2,3] = 100.000000\n",
"a[(2,4)] = 101.000000\n",
"tensor([[ 0., 1., 2., 3., 4.],\n",
" [ 5., 6., 7., 8., 9.],\n",
" [ 10., 11., 12., 100., 101.],\n",
" [ 15., 16., 17., 18., 19.],\n",
" [ 20., 21., 22., 23., 24.],\n",
" [ 25., 26., 27., 28., 29.],\n",
" [ 30., 31., 32., 33., 34.],\n",
" [ 35., 36., 37., 38., 39.]])\n"
]
}
],
"outputs": [],
"source": [
"@cute.jit\n",
"def tensor_access_item(a: cute.Tensor):\n",
" # access data using linear index\n",
" cute.printf(\"a[2] = {} (equivalent to a[{}])\", a[2],\n",
" cute.make_identity_tensor(a.layout.shape)[2])\n",
" cute.printf(\"a[9] = {} (equivalent to a[{}])\", a[9],\n",
" cute.make_identity_tensor(a.layout.shape)[9])\n",
" cute.printf(\n",
" \"a[2] = {} (equivalent to a[{}])\",\n",
" a[2],\n",
" cute.make_identity_tensor(a.layout.shape)[2],\n",
" )\n",
" cute.printf(\n",
" \"a[9] = {} (equivalent to a[{}])\",\n",
" a[9],\n",
" cute.make_identity_tensor(a.layout.shape)[9],\n",
" )\n",
"\n",
" # access data using n-d coordinates, following two are equivalent\n",
" cute.printf(\"a[2,0] = {}\", a[2, 0])\n",
@@ -251,14 +189,14 @@
" cute.printf(\"a[(2,4)] = {}\", a[2, 4])\n",
"\n",
" # assign value to tensor@(2,4)\n",
" a[2,3] = 100.0\n",
" a[2,4] = 101.0\n",
" cute.printf(\"a[2,3] = {}\", a[2,3])\n",
" cute.printf(\"a[(2,4)] = {}\", a[(2,4)])\n",
" a[2, 3] = 100.0\n",
" a[2, 4] = 101.0\n",
" cute.printf(\"a[2,3] = {}\", a[2, 3])\n",
" cute.printf(\"a[(2,4)] = {}\", a[(2, 4)])\n",
"\n",
"\n",
"# Create a tensor with sequential data using torch\n",
"data = torch.arange(0, 8*5, dtype=torch.float32).reshape(8, 5)\n",
"data = torch.arange(0, 8 * 5, dtype=torch.float32).reshape(8, 5)\n",
"tensor_access_item(from_dlpack(data))\n",
"\n",
"print(data)"
@@ -287,14 +225,17 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### Coordinate Tensor\n",
"## Coordinate Tensors\n",
"\n",
"A coordinate tensor is a special type of tensor that maps coordinates to coordinates rather than to values. \n",
"The key distinction is that while regular tensors map coordinates to some value type (like numbers), \n",
"coordinate tensors map coordinates to other coordinates.\n",
"### Definition and Properties\n",
"\n",
"For example, given a shape (4,4), a coordinate tensor using row-major layout would appear as:\n",
"A coordinate tensor $T: Z^n → Z^m$ is a mathematical structure that establishes a mapping between coordinate spaces. Unlike standard tensors that map coordinates to scalar values, coordinate tensors map coordinates to other coordinates, forming a fundamental building block for tensor operations and transformations.\n",
"\n",
"### Examples\n",
"\n",
"Consider a `(4,4)` coordinate tensor:\n",
"\n",
"**Row-Major Layout (C-style):**\n",
"\\begin{bmatrix} \n",
"(0,0) & (0,1) & (0,2) & (0,3) \\\\\n",
"(1,0) & (1,1) & (1,2) & (1,3) \\\\\n",
@@ -302,8 +243,7 @@
"(3,0) & (3,1) & (3,2) & (3,3)\n",
"\\end{bmatrix}\n",
"\n",
"The same shape with a column-major layout would appear as:\n",
"\n",
"**Column-Major Layout (Fortran-style):**\n",
"\\begin{bmatrix}\n",
"(0,0) & (1,0) & (2,0) & (3,0) \\\\\n",
"(0,1) & (1,1) & (2,1) & (3,1) \\\\\n",
@@ -311,40 +251,50 @@
"(0,3) & (1,3) & (2,3) & (3,3)\n",
"\\end{bmatrix}\n",
"\n",
"The key points about coordinate tensors are:\n",
"- Each element in the tensor is itself a coordinate tuple (i,j) rather than a scalar value\n",
"- The coordinates map to themselves - so position (1,2) contains the coordinate (1,2)\n",
"- The layout (row-major vs column-major) determines how these coordinate tuples are arranged in memory\n",
"### Identity Tensor\n",
"\n",
"For example, coordinate tensors can be created using the `make_identity_tensor` utility:\n",
"An identity tensor $I$ is a special case of a coordinate tensor that implements the identity mapping function:\n",
"\n",
"**Definition:**\n",
"For a given shape $S = (s_1, s_2, ..., s_n)$, the identity tensor $I$ satisfies: $I(c) = c, \\forall c \\in \\prod_{i=1}^n [0, s_i)$\n",
"\n",
"**Properties:**\n",
"1. **Bijective Mapping**: The identity tensor establishes a one-to-one correspondence between coordinates.\n",
"2. **Layout Invariance**: The logical structure remains constant regardless of the underlying memory layout.\n",
"3. **Coordinate Preservation**: For any coordinate c, I(c) = c.\n",
"\n",
"\n",
"CuTe establishes an isomorphism between 1-D indices and N-D coordinates through lexicographical ordering. For a coordinate c = (c₁, c₂, ..., cₙ) in an identity tensor with shape S = (s₁, s₂, ..., sₙ):\n",
"\n",
"**Linear Index Formula:**\n",
"$\\text{idx} = c_1 + \\sum_{i=2}^{n} \\left(c_i \\prod_{j=1}^{i-1} s_j\\right)$\n",
"\n",
"**Example:**\n",
"```python\n",
"# Create an identity tensor from a given shape\n",
"coord_tensor = make_identity_tensor(layout.shape())\n",
"\n",
"# Access coordinate using linear index\n",
"coord = coord_tensor[linear_idx] # Returns the N-D coordinate\n",
"```\n",
"\n",
"This creates a tensor that maps each coordinate to itself, providing a reference point for understanding how other layouts transform these coordinates."
"This bidirectional mapping enables efficient conversion from linear indices to N-dimensional coordinates, facilitating tensor operations and memory access patterns."
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor<(0,0) o (8,4):(1@0,1@1)>\n"
]
}
],
"outputs": [],
"source": [
"@cute.jit\n",
"def print_tensor_coord(a: cute.Tensor):\n",
" coord_tensor = cute.make_identity_tensor(a.layout.shape)\n",
" print(coord_tensor)\n",
" cute.print_tensor(coord_tensor)\n",
"\n",
"a = torch.randn(8,4, dtype=torch_dtype(cutlass.Float32))\n",
"\n",
"a = torch.randn(8, 4, dtype=torch_dtype(cutlass.Float32))\n",
"print_tensor_coord(from_dlpack(a))"
]
}