CUTLASS 2.10 updates (#622)
Co-authored-by: Aniket Shivam <ashivam@nvidia.com>
@ -50,7 +50,7 @@
# -- Project information -----------------------------------------------------

project = 'PyCutlass'

copyright = '2022, Andrew Kerr; Zhaodong Chen; Haicheng Wu; Szymon Migacz; Graham Markall'
copyright = '2022, Zhaodong Chen; Andrew Kerr; Haicheng Wu; Szymon Migacz; Graham Markall'

author = 'Zhaodong Chen; Andrew Kerr; Haicheng Wu; Szymon Migacz; Graham Markall'
@ -65,9 +65,12 @@ extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.intersphinx',
    'enum_tools.autoenum',
    'sphinx.ext.autosummary'
    'sphinx.ext.autosummary',
    'm2r2'
]

source_suffix = [".rst", ".md"]

autosummary_generate = True
autosummary_imported_members = True
@ -85,7 +88,7 @@ exclude_patterns = []
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'classic'
html_theme = 'bizstyle'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
@ -1,2 +1,100 @@
cutlass
=======

.. rubric:: Operator Classification

.. autoclass:: cutlass.OpClass
   :members:

.. rubric:: GEMM Layout

.. autoclass:: cutlass.RowMajor
   :members:

.. autoclass:: cutlass.ColumnMajor
   :members:

.. autoclass:: cutlass.RowMajorInterleaved32
   :members:

.. autoclass:: cutlass.ColumnMajorInterleaved32
   :members:

.. rubric:: Conv Layout

.. autoclass:: cutlass.TensorNHWC
   :members:

.. autoclass:: cutlass.TensorNC32HW32
   :members:

.. autoclass:: cutlass.TensorC32RSK32
   :members:

.. rubric:: Threadblock Swizzle

.. autoclass:: cutlass.dim3
   :special-members:
   :members:

.. autoclass:: cutlass.IdentitySwizzle1
   :special-members:
   :members:

.. autoclass:: cutlass.IdentitySwizzle2
   :special-members:
   :members:

.. autoclass:: cutlass.IdentitySwizzle4
   :special-members:
   :members:

.. autoclass:: cutlass.IdentitySwizzle8
   :special-members:
   :members:

.. autoclass:: cutlass.HorizontalSwizzle
   :special-members:
   :members:

.. autoclass:: cutlass.BatchedIdentitySwizzle
   :special-members:
   :members:

.. autoclass:: cutlass.StridedDgradIdentitySwizzle1
   :special-members:
   :members:

.. autoclass:: cutlass.StridedDgradIdentitySwizzle4
   :special-members:
   :members:

.. autoclass:: cutlass.StridedDgradHorizontalSwizzle
   :special-members:
   :members:

.. rubric:: Coordinates

.. autoclass:: cutlass.Tensor4DCoord
   :special-members:
   :members:

.. autoclass:: cutlass.Tensor3DCoord
   :special-members:
   :members:

.. autoclass:: cutlass.MatrixCoord
   :special-members:
   :members:

.. rubric:: Convolution

.. autoclass:: cutlass.conv.Operator
   :members:

.. autoclass:: cutlass.conv.IteratorAlgorithm
   :members:

.. autoclass:: cutlass.conv.StrideSupport
   :members:
@ -1,6 +0,0 @@
Descriptions
==============

.. autoclass:: pycutlass.TileDescription
   :special-members:
   :members:
@ -1,5 +0,0 @@
Frontend
==============

.. autoclass:: pycutlass.NumpyFrontend
   :members:
@ -3,27 +3,29 @@
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Welcome to PyCutlass's documentation!
CUTLASS Python Project Documentation
=====================================

.. mdinclude:: ../../README.md

.. toctree::
   :maxdepth: 2
   :caption: Contents:

Indices and tables
.. Indices and tables
.. ==================

.. * :ref:`genindex`
.. * :ref:`modindex`
.. * :ref:`search`

Indices
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

.. toctree::

   types
   cutlass
   descriptor
   frontend
   user_guide
   visitor_tree
   gemm_op
   conv2d_op
   cutlass
@ -0,0 +1,225 @@
# Epilogue Visitor Tree
The Epilogue Visitor Tree is an experimental feature that generates epilogues directly from user-provided Python functions.

## Usage

The Epilogue Visitor Tree supports many different operations.
### Unary functions
The Epilogue Visitor Tree supports unary functions like activation functions. For example,
```python
class UnaryEpilogue_(EpilogueVisitTree):
    def __call__(
            self, accum: 'tensor', c: 'tensor',
            alpha: 'scalar', beta: 'scalar'):
        #
        T = leaky_relu.numpy(accum, 0.2)
        Z = alpha * T + beta * c
        return Z
epilogue_functor = UnaryEpilogue_(
    epilogue_functor, tile_description, math_inst.element_accumulator,
    C.alignment, element_epilogue, C.element)
```
### Broadcast Operation
The Epilogue Visitor Tree supports broadcasting row and column vectors to the whole output matrix. To use broadcast, you just need to specify whether the source vector is a `row` vector or a `column` vector. Here is an example.
```python
class ColumnBroadcast_(EpilogueVisitTree):
    def __call__(
            self, accum: 'tensor', c: 'tensor',
            vector: 'column', alpha: 'scalar', beta: 'scalar'):
        #
        T = accum + vector
        scale_T = leaky_relu.numpy(alpha * T, 0.2)
        Z = scale_T + beta * c
        return Z, T
epilogue_functor = ColumnBroadcast_(
    epilogue_functor, tile_description, math_inst.element_accumulator,
    C.alignment, element_epilogue, C.element)
```
### Reduction Operation

The Epilogue Visitor Tree also supports row- and column-wise reduction within each threadblock tile. The syntax for reduction is
```python
{reduction_output} = reduction_op({input_tensor}, {row|column}, {Add}, {threadblock_shape.n|threadblock_shape.m})
```
The `{row|column}` indicates whether the `row` vectors or the `column` vectors are reduced. The `{Add}` specifies the reduction operation. The `{threadblock_shape.n|threadblock_shape.m}` is the reduction length.

**Constraints**
* The `{input_tensor}` can only be the name of a source or an intermediate result. `reduction_op(A + B, ...)` will not work; please use `C = A + B` followed by `reduction_op(C, ...)` instead.
* The `{reduction_output}` cannot be used elsewhere in the epilogue. It is written directly to global memory after the reduction is done.
```python
class RowReduction_(EpilogueVisitTree):
    def __call__(
            self, accum: 'tensor', c: 'tensor',
            alpha: 'scalar', beta: 'scalar'):
        #
        D = alpha * accum + tanh.numpy(beta * c)
        reduction = reduction_op(D, "row", "Add", args.threadblock_shape[1])
        return D, reduction
epilogue_functor = RowReduction_(
    epilogue_functor, tile_description, math_inst.element_accumulator,
    C.alignment, element_epilogue, C.element)
epilogue_functor.initialize()
```
## Get output_op

As shown in the user guide, an `output_op` is required by the argument wrapper. We will take `RowReduction_` as an example to show how to get the `output_op`.
```python
class RowReduction_(EpilogueVisitTree):
    def __call__(
            self, accum: 'tensor', c: 'tensor',
            alpha: 'scalar', beta: 'scalar'):
        #
        D = alpha * accum + tanh.numpy(beta * c)
        reduction = reduction_op(D, "row", "Add", args.threadblock_shape[1])
        return D, reduction
epilogue_functor = RowReduction_(
    epilogue_functor, tile_description, math_inst.element_accumulator,
    C.alignment, element_epilogue, C.element)
epilogue_functor.initialize()

cta_n = args.threadblock_shape[1]
num_cta_n = (problem_size.n() + cta_n - 1) // cta_n
reduction = np.zeros(shape=(args.batch * problem_size.m() * num_cta_n,), dtype=getattr(np, element_c))
# get output op
output_op = operation.epilogue_type(
    D=tensor_D, alpha=args.alpha, beta=args.beta, c=tensor_C,
    reduction=reduction, problem_size=[problem_size.m(), problem_size.n()]
)
```
Like other epilogue functors such as `LinearCombination`, the output op for the EpilogueVisitorTree is also created with `operation.epilogue_type(*)`. However, there are two differences:
* The arguments need to be passed as keyword arguments, where the keywords are the argument names in `def __call__`.
* An additional `problem_size=[problem_size.m(), problem_size.n()]` is required.
## Add new Unary Operation (e.g. Activation Function)
To add an additional unary operation to the epilogue visitor tree, a new unary op
should be created for `VisitorOpUnary`. We will take `tanh` as an example.

### Step 1: define TanhVisitor

The visitor defines the parameters and computation required by the unary operation.
The unary operations are registered in [pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/unary_ops.h](tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/unary_ops.h), but you can define it in any header file and include that header in [pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_unary.h](tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_unary.h).

* Two template arguments are required:
    * `T`: the data type used to compute the unary operation
    * `N`: the compute fragment length
* We also need to provide the `Arguments` and `Params` structures. The `Arguments` will be assembled by [ctypes](https://docs.python.org/3/library/ctypes.html), and the `Params` will be generated from `Arguments` automatically. If the unary function takes no argument, an integer like `int tmp` can be provided to ensure the correctness of ctypes.
* The constructor can only take `params` as its single argument.
* The operation is defined in `Array<T, N> operator()(Array<T, N> const &frag) const`. One common way to do this is to first define a scalar computation and then apply it to the fragment with an unrolled for-loop.
* A guard function is required. If it returns `false`, it disables all the child nodes of the unary node and returns zeros to the parent node. This is very helpful for multiplication by a scalar when the scalar is `0`. For general cases, you can just return `true`.
```c++
// T: data type used to compute the unary operation
// N: compute fragment length
template <typename T, int N>
struct TanhVisitor {
    /// Argument
    struct Arguments {
        // a placeholder argument to ensure correctness of ctypes
        int tmp;

        CUTLASS_HOST_DEVICE
        Arguments(): tmp(0) { };

        CUTLASS_HOST_DEVICE
        Arguments(int tmp): tmp(tmp) { };
    };

    /// Param
    struct Params {
        CUTLASS_HOST_DEVICE
        Params(){ };
        Params(Arguments const &args) { }
    };

    /// Constructor
    CUTLASS_HOST_DEVICE
    TanhVisitor(Params const &params) { }

    // scalar operator
    CUTLASS_HOST_DEVICE
    T tanh_op(T const &scalar) const {
        return fast_tanh(scalar);
    }

    /// vector operator
    CUTLASS_HOST_DEVICE
    Array<T, N> operator()(Array<T, N> const &frag) const {
        Array<T, N> y;

        CUTLASS_PRAGMA_UNROLL
        for (int i = 0; i < N; ++i) {
            y[i] = tanh_op(frag[i]);
        }

        return y;
    }

    // Guard
    CUTLASS_HOST_DEVICE
    bool guard() {
        return true;
    }
};
```
### Step 2: register Tanh function
After defining the visitor in C++, we need to register it in Python. The class below gives an example.
* The init function takes the data type `element_compute`, which will be the `T` in the C++ template. In the init function, we also generate the `_Arguments` class as a `ctypes.Structure`; it includes all the data members of `TanhVisitor::Arguments`.
* The `_Arguments` class needs to be registered as the `self.argument_type` of the `tanh` class.
* An `emit` function is required to emit the namespace and typename of `TanhVisitor`.
* A static method `numpy` is required as the reference implementation used when the Python code is parsed.

The built-in functions are defined in [pycutlass/src/pycutlass/epilogue.py](tools/library/scripts/pycutlass/src/pycutlass/epilogue.py). You can define yours in any file, as long as it can be found by [pycutlass/src/pycutlass/parser.py](tools/library/scripts/pycutlass/src/pycutlass/parser.py).
```python
class tanh(ActivationFunctor):
    def __init__(self, element_compute) -> None:
        super().__init__()
        class _Arguments(ctypes.Structure):
            _fields_ = [
                ("tmp", ctypes.c_int)
            ]
            def __init__(self, *args) -> None:
                self.tmp = 0
        self.argument_type = _Arguments

    def emit(self):
        return "cutlass::TanhVisitor"

    @staticmethod
    def numpy(x: np.ndarray):
        return np.tanh(x)
```
### Step 3: Run the function
Now the new unary op is ready to use. An epilogue visitor tree can be built with
```python
class RowReduction_(EpilogueVisitTree):
    def __call__(
            self, accum: NDArray['tensor', 'float32'], c: NDArray['tensor', 'float32'],
            alpha: 'float32', beta: 'float32'):
        #
        D = alpha * accum + tanh.numpy(beta * c)
        reduction = reduction_op(D, "row", "Add", args.threadblock_shape[1])
        return D, reduction
epilogue_functor = RowReduction_(
    epilogue_functor, tile_description, math_inst.element_accumulator,
    C.alignment, element_epilogue, C.element)
epilogue_functor.initialize()
```
## Limitations and Future work

Although the Epilogue Visitor Tree brings great flexibility to epilogue construction, because the epilogue is formulated as a single tree there are several limitations.
* [Future Work] Serial and parallel split-K GEMM are not supported yet.
    * To support serial split-K, an additional tree-transformation pass is required to inject a `binaryOpNode(Add)` + `TensorInputNode` before each `TensorOutputNode` to fetch the partial sums back. The `semaphore` also needs to be passed into the epilogue.
    * To support parallel split-K, a Reduction kernel with visitor is required.
* [Future Work] Convolution and GEMM Grouped are not supported yet.
    * To support Conv2d and GEMM Grouped, corresponding *_with_visitor kernels are required.
* [Limitation] If the same node is used by two operations (except when one of them is a reduction), the node and all of its descendants will be executed twice.
* [Limitation] The result of a reduction can only be used as a return value.
tools/library/scripts/pycutlass/docs/source/md/basic_idea.md (new file, 283 lines)
@ -0,0 +1,283 @@
# Basics of PyCUTLASS

PyCUTLASS handles the following steps when launching CUTLASS kernels:
* Memory management
* Operation description
* Code emission and compilation
* Argument preprocessing
* Kernel launching
* Result synchronization

## Memory management

PyCUTLASS uses [RMM](https://github.com/rapidsai/rmm) to manage device memory. At the beginning of the program, call
```python
pycutlass.get_memory_pool({init_pool_size_in_bytes}, {max_pool_size_in_bytes})
```
We also provide functions to query the allocated size.
```python
bytes = get_allocated_size()
```
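For example, a minimal sketch (the pool sizes here are illustrative assumptions, not recommendations):
```python
import pycutlass
from pycutlass import get_allocated_size  # assumed to be exported at package level

# reserve a 1 GiB initial pool that may grow up to 4 GiB (illustrative sizes)
pycutlass.get_memory_pool(2**30, 2**32)

print("allocated bytes:", get_allocated_size())
```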

## Operation Description
PyCUTLASS provides operation descriptions for GEMM, GEMM Grouped, and Conv2d operations. These descriptions are assembled from four fundamental concepts:
* Math Instruction: the math instruction executed on the GPU cores
* Tile Description: tiling sizes and pipeline stages
* Operand Description: data type, layout, and memory alignment
* Epilogue Functor: the epilogue function

### Math Instruction

The math instruction is defined as follows:
```python
math_inst = MathInstruction(
    {instruction_shape}, {element_a}, {element_b},
    {element_acc}, {opclass}, {math_operation}
)
```
The `{instruction_shape}` and `{opclass}` define the instruction size and type. The table below lists valid combinations. `{element_a}` and `{element_b}` define the source operand data types for each instruction, and `{element_acc}` defines the accumulator type. The `{math_operation}` defines the math operation applied.

| Opclass | element_a/element_b | element_acc | instruction_shape | math_operation |
| -- | -- | -- | -- | -- |
| cutlass.OpClass.TensorOp | cutlass.float64 | cutlass.float64 | [8, 8, 4] | MathOperation.multiply_add |
| | cutlass.float32, cutlass.tfloat32, cutlass.float16, cutlass.bfloat16 | cutlass.float32 | [16, 8, 8] | MathOperation.multiply_add, MathOperation.multiply_add_fast_f32, MathOperation.multiply_add_fast_f16, MathOperation.multiply_add_fast_bf16 |
| | cutlass.float16 | cutlass.float16/cutlass.float32 | [16, 8, 16] | MathOperation.multiply_add |
| | cutlass.bfloat16 | cutlass.float32 | [16, 8, 16] | MathOperation.multiply_add |
| | cutlass.int8 | cutlass.int32 | [16, 8, 32] | MathOperation.multiply_add_saturate |
| cutlass.OpClass.Simt | cutlass.float64 | cutlass.float64 | [1, 1, 1] | MathOperation.multiply_add |
| | cutlass.float32 | cutlass.float32 | [1, 1, 1] | MathOperation.multiply_add |

`cutlass.OpClass.TensorOp` indicates that the Tensor Cores are used, while `cutlass.OpClass.Simt` uses the SIMT cores.

The `multiply_add_fast_f32` operation emulates a fast, accurate SGEMM kernel accelerated by Ampere Tensor Cores. More details can be found in [examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm](examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm).
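For instance, a minimal sketch of one valid row of the table above: fp16 inputs with an fp32 accumulator on Tensor Cores.
```python
# [16, 8, 16] Tensor Core instruction, fp16 x fp16 -> fp32
math_inst = MathInstruction(
    [16, 8, 16], cutlass.float16, cutlass.float16,
    cutlass.float32, cutlass.OpClass.TensorOp,
    MathOperation.multiply_add
)
```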

### Tile Description
The tile description describes the threadblock and warp tiling sizes, as well as the number of pipeline stages.
```python
tile_description = TileDescription(
    {threadblock_shape}, {stages}, {warp_count},
    math_inst
)
```
The `{threadblock_shape}` is a list of 3 integers `[Tile_M, Tile_N, Tile_K]` that defines the threadblock tiling size. `{stages}` defines the number of software pipeline stages ([detail](https://developer.nvidia.com/blog/controlling-data-movement-to-boost-performance-on-ampere-architecture/)). `{warp_count}` defines the number of warps along the `M`, `N`, and `K` dimensions; i.e., with `{threadblock_shape}=[Tile_M, Tile_N, Tile_K]` and `{warp_count}=[W_M, W_N, W_K]`, the warp tile size is `[Tile_M / W_M, Tile_N / W_N, Tile_K / W_K]`.
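As a concrete sketch (these tile sizes are common illustrative choices, not the only valid ones):
```python
tile_description = TileDescription(
    [128, 128, 32],  # threadblock tile [Tile_M, Tile_N, Tile_K]
    4,               # number of software pipeline stages
    [2, 2, 1],       # warps along M, N, K -> warp tile [64, 64, 32]
    math_inst
)
```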

### Operand Description
The operand description defines the data type, layout, and memory alignment of the input tensors A, B, and C. The output tensor D shares the same attributes as C. The description is as follows:
```python
A = TensorDescription(
    {element_a}, {layout_a}, {alignment_a}
)

B = TensorDescription(
    {element_b}, {layout_b}, {alignment_b}
)

C = TensorDescription(
    {element_c}, {layout_c}, {alignment_c}
)
```
The table below lists the supported layouts and data types for each operation.

| Operation | data type | layout |
| -- | -- | -- |
| GEMM, GEMM Grouped | cutlass.float64, cutlass.float32, cutlass.float16, cutlass.bfloat16 | cutlass.RowMajor, cutlass.ColumnMajor |
| | cutlass.int8 | cutlass.RowMajor, cutlass.ColumnMajor, cutlass.RowMajorInterleaved32, cutlass.ColumnMajorInterleaved32 |
| Conv2d Fprop, Dgrad, Wgrad | cutlass.float64, cutlass.float32, cutlass.float16, cutlass.bfloat16 | cutlass.TensorNHWC |
| Conv2d Fprop | cutlass.int8 | cutlass.TensorNHWC, cutlass.TensorNC32HW32, cutlass.TensorC32RSK32 |
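For example, a sketch of fp16 GEMM operands; the alignment of 8 is an assumption that corresponds to 128-bit vectorized accesses for fp16 elements.
```python
# 8 fp16 elements x 16 bits = 128-bit memory access
A = TensorDescription(cutlass.float16, cutlass.RowMajor, 8)
B = TensorDescription(cutlass.float16, cutlass.ColumnMajor, 8)
C = TensorDescription(cutlass.float16, cutlass.RowMajor, 8)
```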

### Epilogue Functor
The epilogue functor defines the epilogue executed after the mainloop.
We expose the following epilogue functors.

| Epilogue Functor | Remark |
| -- | -- |
| LinearCombination | $D=\alpha \times Accum + \beta \times C$ |
| LinearCombinationClamp | $D=\alpha \times Accum + \beta \times C$; the output is clamped to the maximum value of the output data type |
| FastLinearCombinationClamp | $D=\alpha \times Accum + \beta \times C$; only used for problem sizes $K \le 256$ with cutlass.int8, accumulator data type `cutlass.int32`, and epilogue compute data type `cutlass.float32` |
| LinearCombinationGeneric | $D = activation(\alpha \times Accum + \beta \times C)$; available activations include `relu`, `leaky_relu`, `tanh`, `sigmoid`, `silu`, `hardswish`, and `gelu` |

The epilogue functors can be created as follows:
```python
# LinearCombination
epilogue_functor = LinearCombination(
    element_C, alignment_c, element_acc, element_epilogue_compute
)

# LinearCombinationClamp
epilogue_functor = LinearCombinationClamp(
    element_C, alignment_c, element_acc, element_epilogue_compute
)

# FastLinearCombinationClamp
epilogue_functor = FastLinearCombinationClamp(
    element_C, alignment_c
)

# LinearCombinationGeneric
epilogue_functor = LinearCombinationGeneric(
    relu(element_epilogue_compute), element_C, alignment_c,
    element_acc, element_epilogue_compute
)
```

We also provide an experimental feature "Epilogue Visitor Tree" for the GEMM operation. The details can be found in [EpilogueVisitorTree](tools/library/scripts/pycutlass/docs/source/md/EpilogueVisitorTree.md).

### GEMM Operation

The GEMM operation description can be created with
```python
operation = GemmOperationUniversal(
    {compute_capability}, tile_description,
    A, B, C, epilogue_functor,
    {swizzling_functor}, {visitor}
)
```
* `{compute_capability}` is an integer indicating the compute capability of the GPU. For A100, it is 80.
* `{swizzling_functor}` describes how threadblocks are scheduled on the GPU. This is used to improve L2 locality ([detail](https://developer.nvidia.com/blog/optimizing-compute-shaders-for-l2-locality-using-thread-group-id-swizzling/)). Currently we support `cutlass.{IdentitySwizzle1|IdentitySwizzle2|IdentitySwizzle4|IdentitySwizzle8|BatchedIdentitySwizzle}`. The last one is used for batched or array GEMM.
* `{visitor}`: a boolean indicating whether the epilogue visitor tree is used (a concrete sketch follows this list).
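As an illustrative sketch for an A100 (the swizzling functor choice here is an assumption; any supported identity swizzle would work):
```python
operation = GemmOperationUniversal(
    80,                        # compute capability of A100
    tile_description,
    A, B, C, epilogue_functor,
    cutlass.IdentitySwizzle1,  # threadblock swizzle (assumed choice)
    False                      # epilogue visitor tree not used
)
```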

### GEMM Grouped Operation
The GEMM Grouped operation description can be created with
```python
operation = GemmOperationGrouped(
    compute_capability, tile_description,
    A, B, C, epilogue_functor,
    swizzling_functor, {precompute_mode}
)
```
* `{precompute_mode}`: it can be `SchedulerMode.Host` or `SchedulerMode.Device`. See [examples/24_gemm_grouped](examples/24_gemm_grouped) for more details.

### Conv2d Operation
The Conv2d operation description can be created with
```python
operation = Conv2dOperation(
    {conv_kind}, {iterator_algorithm},
    compute_capability, tile_description,
    A, B, C, {stride_support},
    epilogue_functor, swizzling_functor
)
```
* `{conv_kind}` defines which convolution is executed. Available options include `fprop`, `dgrad`, and `wgrad` (see the sketch after this list).
* `{iterator_algorithm}` specifies the iterator algorithm used by the implicit GEMM in the convolution. The options are as follows:
    * `analytic`: functionally correct in all cases but lower performance
    * `optimized`: optimized for R <= 32, S <= 32, and unity-stride dgrad
    * `fixed_channels`: analytic algorithm optimized for a fixed channel count (C == AccessSize)
    * `few_channels`: analytic algorithm optimized for few channels (C divisible by AccessSize)
* `{stride_support}`: distinguishes among partial specializations that accelerate certain problems where the convolution stride is unit.
    * `strided`: arbitrary convolution stride
    * `unity`: unit convolution stride
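A hedged sketch of an fprop instantiation; the enum attribute spellings below are assumptions about the Python bindings (see `cutlass.conv.Operator`, `cutlass.conv.IteratorAlgorithm`, and `cutlass.conv.StrideSupport` in the API reference for the exact names).
```python
operation = Conv2dOperation(
    cutlass.conv.Operator.fprop,               # conv_kind (assumed spelling)
    cutlass.conv.IteratorAlgorithm.optimized,  # iterator algorithm (assumed spelling)
    80, tile_description,                      # compute capability, tiling
    A, B, C,
    cutlass.conv.StrideSupport.strided,        # stride support (assumed spelling)
    epilogue_functor, cutlass.IdentitySwizzle1
)
```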

***
## Code Emission and Compilation
After creating the operation description, the related host and device code can be compiled with
```python
import pycutlass

pycutlass.compiler.add_module([operation,])
```
Several operations can be compiled together. The `nvcc` at `$CUDA_INSTALL_PATH/bin` is used as the compiler backend by default, but you can also switch to [CUDA Python](https://nvidia.github.io/cuda-python/overview.html)'s `nvrtc` with
```python
pycutlass.compiler.nvrtc()
```
We also have an internal compiled-artifact manager that caches compiled kernels in both memory and disk. The `compiled_cache.db` in your workspace is the database that contains the binary files. You can delete this file if you want to recompile the kernels.
***
## Argument Processing
We provide argument wrappers to convert Python tensors to kernel parameters. Currently, [torch.Tensor](https://pytorch.org/), [numpy.ndarray](https://numpy.org/), and [cupy.ndarray](https://cupy.dev/) are supported.
### GEMM Arguments
The GEMM arguments can be created with
```python
arguments = GemmArguments(
    operation=operation, problem_size={problem_size},
    A={tensor_A}, B={tensor_B}, C={tensor_C}, D={tensor_D},
    output_op={output_op},
    gemm_mode={gemm_mode},
    split_k_slices={split_k_slices}, batch={batch}
)
```
* `problem_size` is a `cutlass.gemm.GemmCoord(M, N, K)` object that defines an $M \times N \times K$ matrix multiplication.
* `tensor_X`: user-provided tensors.
* `output_op`: the params for the epilogue functor.
* `gemm_mode`, `split_k_slices`, and `batch` are set as follows (a concrete sketch follows the table):

| gemm_mode | split_k_slices | batch | remark |
| -- | -- | -- | -- |
| cutlass.gemm.Mode.Gemm | number of split-K slices | - | the ordinary GEMM, or GEMM with serial split-K |
| cutlass.gemm.Mode.GemmSplitKParallel | number of split-K slices | - | GEMM with parallel split-K |
| cutlass.gemm.Mode.Batched | - | batch size | batched GEMM |
| cutlass.gemm.Mode.Array | - | batch size | array GEMM |
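For example, a hedged sketch of an ordinary fp32 GEMM with numpy tensors (shapes and dtype are illustrative; `batch` is omitted since the table marks it unused for `Mode.Gemm`):
```python
import numpy as np

M, N, K = 512, 256, 128
problem_size = cutlass.gemm.GemmCoord(M, N, K)

tensor_A = np.random.randn(M, K).astype(np.float32)
tensor_B = np.random.randn(K, N).astype(np.float32)
tensor_C = np.random.randn(M, N).astype(np.float32)
tensor_D = np.zeros_like(tensor_C)

arguments = GemmArguments(
    operation=operation, problem_size=problem_size,
    A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D,
    output_op=output_op,
    gemm_mode=cutlass.gemm.Mode.Gemm,
    split_k_slices=1
)
```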

### GEMM Grouped Arguments
The GEMM Grouped arguments can be created with
```python
arguments = GemmGroupedArguments(
    operation, {problem_sizes_coord}, {tensor_As}, {tensor_Bs}, {tensor_Cs}, {tensor_Ds},
    output_op=output_op
)
```
* `problem_sizes_coord` is a list of `cutlass.gemm.GemmCoord(M, N, K)`, one per problem size (see the sketch after this list).
* `tensor_Xs` is a list of user-provided tensors.
* `output_op`: the params of the epilogue functor.
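A minimal sketch with two groups (problem sizes and dtype are illustrative):
```python
import numpy as np

problem_sizes_coord = [
    cutlass.gemm.GemmCoord(128, 128, 64),
    cutlass.gemm.GemmCoord(256, 64, 32),
]

tensor_As, tensor_Bs, tensor_Cs, tensor_Ds = [], [], [], []
for ps in problem_sizes_coord:
    tensor_As.append(np.random.randn(ps.m(), ps.k()).astype(np.float32))
    tensor_Bs.append(np.random.randn(ps.k(), ps.n()).astype(np.float32))
    tensor_Cs.append(np.random.randn(ps.m(), ps.n()).astype(np.float32))
    tensor_Ds.append(np.zeros((ps.m(), ps.n()), dtype=np.float32))

arguments = GemmGroupedArguments(
    operation, problem_sizes_coord,
    tensor_As, tensor_Bs, tensor_Cs, tensor_Ds,
    output_op=output_op
)
```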

### Conv2d Arguments
The Conv2d arguments can be created with
```python
arguments = Conv2dArguments(
    operation, {problem_size}, {tensor_A},
    {tensor_B}, {tensor_C}, {tensor_D},
    {output_op},
    {split_k_mode},
    {split_k_slices}
)
```
* `problem_size`: it can be constructed with
```python
problem_size = cutlass.conv.Conv2dProblemSize(
    cutlass.Tensor4DCoord(N, H, W, C),
    cutlass.Tensor4DCoord(K, R, S, C),
    cutlass.Tensor4DCoord(pad[0], pad[1], pad[2], pad[3]),
    cutlass.MatrixCoord(stride[0], stride[1]),
    cutlass.MatrixCoord(dilation[0], dilation[1]),
    cutlass.conv.Mode.cross_correlation,
    split_k_slices, 1
)
```
* `tensor_X`: user-provided tensors.
* `output_op`: the params of the epilogue functor.
* `split_k_mode`: currently we support `cutlass.conv.SplitKMode.Serial` and `cutlass.conv.SplitKMode.Parallel`.
* `split_k_slices`: the number of split-K slices.

For an ordinary conv2d, just use `cutlass.conv.SplitKMode.Serial` with `split_k_slices=1`.
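Putting it together, a hedged sketch for an ordinary (serial, single-slice) fprop problem; tensor shapes follow the NHWC/KRSC coordinates above, and all values are illustrative:
```python
N, H, W, C = 1, 32, 32, 64   # input tensor: NHWC
K, R, S = 128, 3, 3          # filter tensor: KRSC

problem_size = cutlass.conv.Conv2dProblemSize(
    cutlass.Tensor4DCoord(N, H, W, C),
    cutlass.Tensor4DCoord(K, R, S, C),
    cutlass.Tensor4DCoord(1, 1, 1, 1),   # padding
    cutlass.MatrixCoord(1, 1),           # stride
    cutlass.MatrixCoord(1, 1),           # dilation
    cutlass.conv.Mode.cross_correlation,
    1, 1                                 # split_k_slices; trailing 1 as in the snippet above
)

arguments = Conv2dArguments(
    operation, problem_size,
    tensor_A, tensor_B, tensor_C, tensor_D,
    output_op,
    cutlass.conv.SplitKMode.Serial, 1
)
```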

### Getting output_op
The `output_op` can be created as follows:
```python
output_op = operation.epilogue_type(*([alpha, beta] + args.activation_args))
```
It is a list of arguments starting with the scaling factors `alpha` and `beta`.
The `output_op` of an EpilogueVisitorTree is slightly different. Please check [EpilogueVisitorTree](tools/library/scripts/pycutlass/docs/source/md/EpilogueVisitorTree.md) for details.

## Kernel Launching

With the arguments and operation, the kernel can be launched simply with
```python
operation.run(arguments)
```

## Sync results

We also provide a function to synchronize on the kernel execution. If you use `numpy`, it also copies the result back to the host. To do that, run
```python
arguments.sync()
```
If you use an EpilogueVisitorTree, please call
```python
output_op.sync()
```

## Reduction Kernel behind Parallel Split-K

If you use parallel split-K in GEMM or Conv2d, an additional reduction kernel is required. Please check [examples/40_cutlass_py](examples/40_cutlass_py) for details.
@ -1,6 +0,0 @@
Types
========

.. autoenum:: pycutlass.OperationKind
   :members:
@ -0,0 +1,4 @@
User Guide
=====================================

.. mdinclude:: ./md/basic_idea.md
@ -0,0 +1,4 @@
Epilogue Visitor Tree
=====================================

.. mdinclude:: ./md/EpilogueVisitorTree.md