Compare commits

...

240 Commits
v2.2.0 ... 2.11

Author SHA1 Message Date
66d9cddc83 New updates for 2.11 (#775)
* New updates.

* Minor profiler updates

Co-authored-by: Aniket Shivam <ashivam@nvidia.com>
2023-01-20 16:32:57 -05:00
d49bef88f9 Enable aarch64 support (#779) 2023-01-20 15:51:58 -05:00
8b42e751c6 streamk paper link (#765)
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2023-01-10 22:10:43 -05:00
eb7f99d3dd @hwu36 Adding the individual arXiv link for Stream-K paper. (#764)
* Stream-K individual paper entry.

* arXiv links updated.
2023-01-10 20:39:06 -05:00
764b840d6f streamk example and performance tuning (#760)
* streamk example and performance tuning

* one missing file

Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2023-01-10 16:10:02 -05:00
a1046d49c1 Adds missing semicolon (#759) 2023-01-09 21:50:46 -05:00
1cd994b4cf Update PUBLICATIONS.md
@neoblizz @dumerrill 

thesis covering streamk
2023-01-08 00:42:19 -05:00
7bdba07310 Add definitions for tag structs. (#752)
This commit changes the declarations of MMA operator class (SIMT, Tensor Core, WMMA Tensor Core) and operator type (multiply-add and so on) to definitions. This is done so that these tag structs are no longer incomplete types, which allows the `typeid` operator to be used on these tag structs. This is necessary for these tag structs to be used as type parameters in [GoogleTest typed tests](https://google.github.io/googletest/advanced.html#typed-tests).
2023-01-06 09:46:52 -05:00
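For illustration, a minimal standalone sketch (hypothetical tag names, not the actual CUTLASS tags) of why a tag struct must be a complete type before `typeid` can be applied to it, which is what GoogleTest typed tests rely on:

```
#include <iostream>
#include <typeinfo>

struct OpClassDeclaredOnly;   // declaration only: incomplete type
struct OpClassDefined {};     // definition: complete (empty) tag type

int main() {
  // typeid requires a complete class type, so the following line
  // would not compile:
  //   std::cout << typeid(OpClassDeclaredOnly).name() << "\n";

  // With a definition, the tag can be used with typeid and therefore
  // as a type parameter in GoogleTest typed tests.
  std::cout << typeid(OpClassDefined).name() << "\n";
  return 0;
}
```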
c54ede3a9e Add const overloads for iterator functions. (#753)
This commit adds `const`-correct overloads for `Array::{begin,end,rbegin,rend}`. These overloads are necessary for usage with [the GMock Container Matchers](http://google.github.io/googletest/reference/matchers.html#container-matchers), which cast the `Container` argument to a constant reference.
2023-01-06 09:46:34 -05:00
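As a rough illustration (a simplified stand-in, not the real `cutlass::Array`), the `const`-correct overloads look like the sketch below; GMock's container matchers bind the container to a `const&`, so only the `const` overloads are viable there:

```
#include <cstddef>

template <typename T, std::size_t N>
struct SimpleArray {
  T storage[N];

  // Non-const overloads: usable on mutable instances.
  T* begin() { return storage; }
  T* end()   { return storage + N; }

  // Const overloads: required when the array is accessed through a
  // const reference, e.g. by GMock container matchers.
  T const* begin() const { return storage; }
  T const* end()   const { return storage + N; }
};
```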
ff6e733fe1 restore the old epilogue for everything except streamk (#749)
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2023-01-04 11:02:55 -05:00
5989b7e1d7 Update PUBLICATIONS.md
Add coconet paper to the publication list.  @abhijangda
2023-01-04 09:18:38 -05:00
1e64f153b3 improve streamk load balance (#743)
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2022-12-25 13:56:33 -05:00
78b30d3191 Update README.md 2022-12-21 11:58:19 -05:00
59de82688b Update README.md 2022-12-21 11:57:55 -05:00
b85865d1ad Add missing #include directives (#741)
This commit adds two `#include` directives so that the definitions of `cutlass::gemm::warp::WarpSize` from "cutlass/gemm/warp/mma.h" and `cutlass::arch::OpClassSimt` from "cutlass/arch/mma.h" are visible to "cutlass/epilogue/threadblock/default_epilogue_simt.h". Without them, there are compiler errors when building the header standalone:

```
In file included from cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.cu:1:
./cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.h:351:32: error: no member named 'warp' in namespace 'cutlass::gemm'; did you mean simply 'warp'?
  static int const kWarpSize = cutlass::gemm::warp::WarpSize<arch::OpClassSimt>::value;
                               ^
./cutlass/include/cutlass/epilogue/warp/tile_iterator_simt.h:49:11: note: 'warp' declared here
namespace warp {
          ^
In file included from cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.cu:1:
./cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.h:351:53: error: no member named 'WarpSize' in namespace 'cutlass::epilogue::warp'
  static int const kWarpSize = cutlass::gemm::warp::WarpSize<arch::OpClassSimt>::value;
                                              ~~~~~~^
./cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.h:351:68: error: no member named 'OpClassSimt' in namespace 'cutlass::arch'
  static int const kWarpSize = cutlass::gemm::warp::WarpSize<arch::OpClassSimt>::value;
                                                             ~~~~~~^
./cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.h:351:82: error: no member named 'value' in the global namespace
  static int const kWarpSize = cutlass::gemm::warp::WarpSize<arch::OpClassSimt>::value;
                                                                               ~~^
./cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.h:367:5: error: use of class template 'OutputTileThreadMap' requires template arguments
    OutputTileThreadMap,
    ^
./cutlass/include/cutlass/epilogue/threadblock/output_tile_thread_map.h:134:8: note: template is declared here
struct OutputTileThreadMap : public OutputTileThreadMapHelpers<Iterations_, Delta_> {
       ^
In file included from cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.cu:1:
./cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.h:391:5: error: use of class template 'OutputTileThreadMap' requires template arguments
    OutputTileThreadMap,
    ^
./cutlass/include/cutlass/epilogue/threadblock/output_tile_thread_map.h:134:8: note: template is declared here
struct OutputTileThreadMap : public OutputTileThreadMapHelpers<Iterations_, Delta_> {
       ^
In file included from cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.cu:1:
./cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.h:405:5: error: unknown type name 'OutputTileIterator'; did you mean 'WarpTileIterator'?
    OutputTileIterator,
    ^
./cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.h:380:9: note: 'WarpTileIterator' declared here
  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorSimtDirect2dConv<
        ^
./cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.h:408:5: error: use of class template 'SharedLoadIterator' requires template arguments
    SharedLoadIterator,
    ^
./cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator.h:67:7: note: template is declared here
class SharedLoadIterator {
      ^
```
2022-12-21 11:40:20 -05:00
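The fix itself is small; as described in the commit message, it amounts to including the headers that define the referenced symbols before they are used (a sketch of the relevant lines):

```
// In cutlass/epilogue/threadblock/default_epilogue_simt.h
#include "cutlass/arch/mma.h"       // defines cutlass::arch::OpClassSimt
#include "cutlass/gemm/warp/mma.h"  // defines cutlass::gemm::warp::WarpSize

// ... later in the header the symbols then resolve as intended:
static int const kWarpSize =
    cutlass::gemm::warp::WarpSize<cutlass::arch::OpClassSimt>::value;
```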
3f2bb17722 minor changes (#730)
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2022-12-10 14:44:53 -05:00
38193d76e3 Updates for stream-k (#728)
Co-authored-by: Aniket Shivam <ashivam@nvidia.com>
2022-12-08 23:48:10 -05:00
1d7772f218 Add missing #include directive (#727) 2022-12-08 18:58:31 -05:00
df81d847d7 Make Python interface work for non-SM80 targets (#726)
* Make Python interface work for non-SM80 targets

* Remove line in README
2022-12-07 21:53:33 -05:00
d6117ca362 Relax stream K gemm alignment constraints (#717)
* Relax stream K gemm alignment constraints

The current alignment requirements are too strict. Make them identical
to the checks for the regular universal gemm.

* Revert "Relax stream K gemm alignment constraints"

This reverts commit 31e80a250e.

* Relax stream K gemm alignment constraints

The current alignment requirements are too strict. Make them identical
to the checks for the regular universal gemm.

Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2022-12-07 11:17:49 -05:00
9c0518608e Fix typos in conv problem sizes (#720)
* Fix typos in conv problem sizes

* Typos
2022-12-05 15:54:58 -05:00
9f1f37aa21 misc (#719)
* misc

* minor

Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2022-12-05 12:07:20 -05:00
84213b0b8e fix: make arch.h self contained (#714) 2022-12-01 19:25:48 -05:00
8567b87d65 Update quickstart.md (#704)
* Update quickstart.md

* Update doxygen_mainpage.md

* Update doxygen_mainpage.md

* Update terminology.md
2022-11-29 21:43:03 -05:00
c975e2ccbb release 2.11 (#703) 2022-11-19 09:02:15 -05:00
3c90f6aea6 add #pragma once for header file in example 42 (#698) 2022-11-15 22:50:24 -05:00
06eb90cc0d Fix identity sigmoid activation (#659)
* activation support Identity

* fix Sigmoid activation operator() with CUTLASS_HOST_DEVICE
2022-11-09 14:42:23 -05:00
168ea8b0e1 ensure singleton::get thread safe construct instance (#658)
* ensure singleton::get thread safe construct instance

* fix singleton return reference

Co-authored-by: xuweiqi <xuweiqi117@gmail.com>
2022-11-08 21:44:32 -05:00
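A minimal sketch of the pattern involved (not CUTLASS's actual `Singleton` class): with C++11 "magic statics", a function-local static is initialized exactly once even under concurrent first calls, and returning a reference avoids handing out copies:

```
#include <iostream>

class Singleton {
 public:
  static Singleton& get() {
    static Singleton instance;  // constructed exactly once, thread-safely
    return instance;            // return a reference, not a copy
  }
  Singleton(Singleton const&) = delete;
  Singleton& operator=(Singleton const&) = delete;

 private:
  Singleton() = default;
};

int main() {
  Singleton& a = Singleton::get();
  Singleton& b = Singleton::get();
  std::cout << (&a == &b) << "\n";  // prints 1: the same instance
  return 0;
}
```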
012c62c748 bug fixes and enhancement to gemm reductionK fusion (#682)
* add two missing files

* fix a bunch of bugs in gemm-reducek fusion and add a device interface

* small changes

Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2022-11-03 11:07:50 -04:00
FZC
cc85b64cf6 fix typo (#677) 2022-11-01 14:07:33 -04:00
1b4e24470a Example 43 - DualGemm (#670)
* Ex50 wip

* IS_PROFILING mode

* MultiStage2 - but is slower

* Add SwiGLU

* Support SplitKSerial reduction
Support not storing D0/D1
Cleanup code

* Option to disable bias

* Renumber example

* Fix build

* Remove references to pb_size_0 / pb_size_1

* Add support for bf16 inputs with float accum

* small changes

Co-authored-by: danthe3rd <danthe3rd>
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2022-10-26 14:04:42 -04:00
8c1bf9b784 Bump CUTLASS Python container version (#672)
* Update example 40 README

* Update CUTLASS Python README
2022-10-22 21:09:39 -04:00
7d0dd6706e Remove excessive includes from examples/41_multi_head_attention (#669)
The rationale behind this change is explained in #563
2022-10-21 22:23:15 -04:00
9b47403b2d Add missing CUTLASS_HOST_DEVICE (#671) 2022-10-21 22:20:38 -04:00
4db6a6140e ex42: Fused MHA imported from xFormers (#662)
* ex42: Fused MHA imported from xFormers

* Remove std:: references

* Support K>128 in the example

* Support causal option

* Support different head size for V, and different seqlength for KV

* Update FLOPS counter

* Remove bit_cast

* fix build: Replace M_LOG2E

* Add doc

* Revert "Remove bit_cast"

This reverts commit 9662fa86bb.

* Explicit casts to int32_t for windows build

Co-authored-by: danthe3rd <danthe3rd>
2022-10-17 10:49:33 -04:00
3bf95e90c2 Update labeler.yml 2022-10-13 08:03:28 -04:00
75fed7493e Update labeler.yml 2022-10-13 08:01:21 -04:00
98b73fc95d Update labeler.yml 2022-10-13 07:55:33 -04:00
4990e3686d Update labeler.yml 2022-10-13 07:52:38 -04:00
4b7365388c Update labeler.yml 2022-10-13 07:32:55 -04:00
0d8405588d Update labeler.yml 2022-10-12 15:32:38 -04:00
cb539dab78 Correct typos in comments (#639)
* Correct typos in comments

Correct comments in code on type of generated distribution. Improve Gaussian RNG to take advantage of Box Muller method

* Inline Box Muller

Added inline function for the Box Muller algorithm and updated code comments to be more concise

* Update tensor_fill.h

* Update tensor_fill.h

* small changes to pass tests

Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2022-09-30 22:51:30 -04:00
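For reference, the Box-Muller transform mentioned above maps two independent uniform samples to two independent standard-normal samples; a host-side sketch of the math (the CUTLASS change inlines an equivalent routine in `tensor_fill.h`):

```
#include <cmath>
#include <iostream>
#include <utility>

// z0 = sqrt(-2 ln u1) * cos(2*pi*u2), z1 = sqrt(-2 ln u1) * sin(2*pi*u2),
// with u1 in (0, 1] and u2 in [0, 1), yields two independent N(0, 1) samples.
std::pair<double, double> box_muller(double u1, double u2) {
  const double two_pi = 6.283185307179586;
  double r = std::sqrt(-2.0 * std::log(u1));
  return {r * std::cos(two_pi * u2), r * std::sin(two_pi * u2)};
}

int main() {
  auto [z0, z1] = box_muller(0.37, 0.81);  // example uniform inputs
  std::cout << z0 << " " << z1 << "\n";    // scale as mean + stddev * z
  return 0;
}
```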
dadc881a96 Bug fix for gemm broadcast (#650)
* gemm_universal_with_broadcast, +2 sources.

* Revert "gemm_universal_with_broadcast, +2 sources."

This reverts commit fb063251f2.

* gemm broadcast bug fix
2022-09-30 10:00:38 -04:00
f3eea3a4d7 Create labeler.yml 2022-09-29 15:08:44 -04:00
cd37e82492 change unused class member to local var (#646) 2022-09-28 23:52:35 -04:00
48a9ea223a Fix release version in the citation (#638) 2022-09-22 10:58:45 -04:00
7a458f00a6 fix(permute.h): incorrect comment in Tensor5DPermute20314 (#637)
* fix(permute.h): incorrect comment in `Tensor5DPermute20314`

* typo in usage in example 39
2022-09-22 09:21:13 -04:00
97bff52e8c add two missing files (#636)
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2022-09-21 15:42:42 -04:00
9f2e3faa69 fix call of GELU_Taylor in LinearCombinationGeneric (#634) 2022-09-20 21:00:55 -04:00
a821280dc7 Gemm broadcast (#632)
* gemm_universal_with_broadcast, +2 sources.

* Revert "gemm_universal_with_broadcast, +2 sources."

This reverts commit fb063251f2.

* gemm_universal_with_broadcast separated version.

* Update copyright banner.

* update banner
2022-09-20 10:37:12 -04:00
f73374a1eb fix:comment typo in example 23 (#633) 2022-09-19 09:54:14 -04:00
faab7536fc add comment (#628) 2022-09-17 21:40:30 -04:00
fc9ebc645b CUTLASS 2.10 bug fixes and minor updates. (#626) 2022-09-15 16:20:33 -04:00
2cc2c7ba1f Add set_k_partition function (#624)
A member function set_k_partition is required for the instantiation of cutlass::gemm::kernel::Gemm, even though SplitKSerial is false
2022-09-13 22:34:20 -04:00
50ceed7154 Minor README fix (#623)
* minor fix

* Minor fix
2022-09-12 22:40:25 -04:00
e773429f7e CUTLASS 2.10 updates (#622)
Co-authored-by: Aniket Shivam <ashivam@nvidia.com>
2022-09-12 21:26:30 -04:00
beae168f90 fix broken link (#620)
Co-authored-by: yuzhai <yuzhai@nvidia.com>
2022-09-06 16:32:44 -04:00
f29d8f7ca9 Include vector in base_grouped.h (#618) 2022-09-06 13:21:23 -04:00
b1d3f9b2fd upstream internal updates (#616)
Co-authored-by: yuzhai <yuzhai@nvidia.com>
2022-09-04 23:05:09 -04:00
b72cbf957d CUTLASS 2.10 (#615)
Co-authored-by: Aniket Shivam <ashivam@nvidia.com>
2022-09-03 18:48:46 -04:00
ca23ff7924 Fixed typo in class name (#608) 2022-08-29 20:51:52 -04:00
1c3d400b14 Added value_type trait to complex to make it an easier drop-in replacement for std::complex. (#607) 2022-08-28 01:12:40 -04:00
abafbf2afd Missing comma in trmm header (#604) 2022-08-25 16:07:33 -04:00
536b20763e Fixed typo in profiler README (#603) 2022-08-24 21:55:13 -04:00
497b499d9d Add residual support for shmem staging iterator used in back-to-back GEMM fusion. This allows support of problem_size_0_n that is not a multiple of 32. (#590)
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2022-08-15 11:19:24 -04:00
e66bfcb1f8 Fix for #596 (typo in example 03) (#597)
* [examples] Fix typos in SYRK and TRMM examples

* Fix typo in example 03
2022-08-09 09:58:36 -04:00
1617685a77 fix: fix types in example 06 (#587) 2022-07-29 12:46:06 -04:00
25ebf15d02 Ensure all arch::Mma specializations have ElementC set (#576)
Co-authored-by: danthe3rd <danthe3rd@users.noreply.github.com>
2022-07-22 23:53:03 -04:00
5d05808072 fix gather example (#574) 2022-07-19 16:18:17 -04:00
0b8cacd6f1 Remove redundant <fstream> includes (#563)
* Remove redundant <fstream> includes

* Fix fstream in examples/

* Fix <fstream> in test/

* Use consistent order for <fstream> (always after <iostream>)

* Remove an unneeded include in a file where std::ofstream usage is commented out

Co-authored-by: Ivan Komarov <dfyz@yandex-team.ru>
2022-07-19 15:23:54 -04:00
e7a61c761a fix race condition when h < stride_h or w < stride_w (#562)
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2022-07-12 16:37:08 -04:00
fb379eaa5b epilogue leaky relu support ScaleType (#564)
Co-authored-by: xuweiqi <xuweiqi117@gmail.com>
2022-07-11 17:30:55 -04:00
8a766804ad Fix doc in testbed_gemm_with_broadcast (#559) 2022-07-07 09:56:16 -04:00
1eb6355182 [activation] tanh (#550)
Co-authored-by: Bing Xu <bingxu@fb.com>
2022-07-02 08:00:45 -04:00
04a9777b87 Softmax (#546)
* add test layernorm g-mem version

* Delete include/configure directory

* Delete examples/test_layernorm directory

* Update gemm_with_softmax.h

* Update gemm_softmax.cu

* Update linear_combination.h

* Update fast_math.h

* remove redundant vars

Co-authored-by: yujia.zhai <yujia.zhai@bytedance.com>
Co-authored-by: yuzhai <yuzhai@nvidia.com>
2022-07-02 01:19:18 -04:00
e45e773436 Update linear_combination_generic.h (#472)
add `skip_elementwise_` to support serial splitk in `linear_combination_generic.h`
2022-06-28 07:29:38 -04:00
dae6b6893b Update CHANGELOG.md 2022-06-27 23:30:49 -04:00
ba18ea9c32 Update README.md 2022-06-27 23:25:26 -04:00
9ab9110168 add leaky relu (#542)
Authored-by: Haicheng Wu <haichengw@nvidia.com>
2022-06-26 10:07:50 -04:00
e5d4669f16 Update CHANGELOG.md (#543) 2022-06-25 13:23:49 -04:00
94f01f19d5 Add implicit gemm perf
plot from @manishucsd, presented in gtc'22 cutlass talk
2022-06-23 22:47:11 -04:00
fa56763c25 Fix occupancy calculation for grouped GEMM (#532) 2022-06-18 19:53:59 -04:00
25e26a6e51 fix bugs in linear_combination_generic.h missing include cutlass/epilogue/thread/scale_type.h (#531) 2022-06-17 23:35:14 -04:00
f248e9bdb4 Create CITATION.cff
Add initial CITATION.cff
2022-06-07 21:25:16 -04:00
dceefe4f64 Increment stride correctly in warp iterator. (#516)
Co-authored-by: peisun1115 <peis@google.com>
2022-06-06 12:33:36 -04:00
c3881d097e Fix a comment about LDSM layout. (#514)
Co-authored-by: peisun1115 <peis@google.com>
2022-06-04 23:04:00 -04:00
a29dfb1c63 Fix a bug to increment stride tile correctly (#503)
* Fix a bug to increment stride tile correctly

* Update regular_tile_access_iterator_tensor_op.h

Co-authored-by: peisun1115 <peis@google.com>
Co-authored-by: Haicheng Wu <57973641+hwu36@users.noreply.github.com>
2022-06-03 22:54:52 -04:00
0abaac84ea [examples] Fix typos in SYRK and TRMM examples (#507) 2022-06-03 22:52:41 -04:00
858c735856 Update gather_scatter_fusion.cu
Correct the reference code in gather/scatter example to put bias add in the correct place.
2022-05-18 13:15:25 -04:00
d6f58b2d14 Update functionality.md 2022-05-11 09:34:24 -04:00
c4cf0dad82 Fix init-self compiler warnings (#493)
Fix a few cases where a class member was initialized with itself.
These produce warnings under `-Winit-self` and can turn into hard
errors when warnings are treated as errors.
2022-05-11 00:35:28 -04:00
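For context, a minimal sketch (hypothetical struct, not the affected CUTLASS classes) of the self-initialization pattern that `-Winit-self` flags:

```
// The constructor parameter and the member have similar names; writing
// size_(size_) initializes the member from its own indeterminate value.
struct TileDesc {
  int size_;

  // Buggy form (flagged by -Winit-self):
  //   explicit TileDesc(int size) : size_(size_) {}

  // Fixed form: initialize the member from the constructor parameter.
  explicit TileDesc(int size) : size_(size) {}
};
```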
57551902d0 Update functionality.md
add some explanations to the functionality table.
2022-05-11 00:01:19 -04:00
1604ebaf10 Update generator.py
stop generating analytical conv kernels to reduce kernel number
2022-05-08 21:47:15 -04:00
6023038bae add verification of the reduction tensor (#489)
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2022-05-06 10:24:51 -07:00
ddd8f9cf41 update float < int32_t * 4 (#488)
Co-authored-by: 赵俊涛 <zhaojuntao@zhaojuntaos-MacBook-Pro.local>
2022-05-04 13:36:05 -04:00
ec2b4fd85d b2b bias vector support (#482)
* b2b bias vector support

* add files

Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2022-04-30 04:16:15 -07:00
86ce09aed1 2.9 fixes for nvrtc (#480)
* Use platform::is_same instead of std::is_same

* Don't hide cuComplex include from nvrtc

* Typo fixed

* Remove comment rename
2022-04-29 09:06:52 -04:00
21c1fa3849 add .github (#479)
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2022-04-28 12:36:59 -07:00
8c339ac039 Fix compilation in clang (#478)
- adds missing commas
- adjusts misaligned usage of CUTLASS_DEVICE between
  template declaration and specializations

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
2022-04-28 14:22:06 -04:00
e49f690fd7 Update linear_combination_generic.h 2022-04-28 14:04:53 -04:00
96dad61a75 Update CHANGELOG.md 2022-04-28 10:52:10 -04:00
cc2ea4c3fc Update README.md 2022-04-28 10:50:11 -04:00
a0de301283 Used relative paths for includes (#477) 2022-04-27 12:04:23 -07:00
319a389f42 Update CMakeLists.txt (#473)
* Update CMakeLists.txt

Add 128bit int support if using nvc++ to solve #310 

@jeffhammond, would you please give it a try?

* Update CMakeLists.txt

correct copy paste error
2022-04-27 07:02:26 -07:00
71def2f084 Use platform:: instead of std::abs and std::conditional (#452)
* Fixed template struct/class mismatch

* Use platform implementation instead of std::abs and std::conditional during nvrtc compilation

* Use platform implementation instead of std::abs and std::conditional during nvrtc compilation

* Revert absolute_value() usage
2022-04-25 14:40:22 -04:00
70f3ba57f5 Fix typo in shared memory layout description (#471) 2022-04-24 18:32:13 -04:00
dd77fadc70 Remove redundant offset def and init in shared_load_iterator.h (#456)
Signed-off-by: Fujun Han <fujun.han@iluvatar.ai>
2022-04-24 16:31:00 -04:00
be4578d517 Fixed template struct/class mismatch (#453) 2022-04-24 16:30:21 -04:00
d7b499deff Fix CUDA_PERROR_EXIT and print failing expression (#446)
`CUDA_PERROR_EXIT` can lead to incorrect usage (see e.g. [this description](https://www.cs.technion.ac.il/users/yechiel/c++-faq/macros-with-if.html)) because it contains an incomplete `if` expression. Consider:

```
if (condition)
    CUDA_PERROR_EXIT(cudaFree(x))
else
    free(x);
```

The author of the code forgot to add a semicolon after the macro. In that case, the `else` will bind to the `if` inside the macro definition, leading to code that the author did not intend or expect. If the author does use a semicolon, the code will not compile, which is awkward.

The change adds a `do while` around the `if`, which always requires a semicolon.

This PR also adds the text of the failing expression to the printed error message.
2022-04-24 16:29:43 -04:00
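A sketch of the resulting macro shape (simplified, with a hypothetical name rather than the exact CUTLASS definition): the `do { ... } while (0)` wrapper turns the macro into a single statement that demands a trailing semicolon and composes safely with `if`/`else`, and `#expr` stringizes the failing expression for the error message:

```
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Illustrative error-checking macro following the do { } while (0) pattern.
#define CUDA_PERROR_EXIT_SKETCH(expr)                                   \
  do {                                                                  \
    cudaError_t error_ = (expr);                                        \
    if (error_ != cudaSuccess) {                                        \
      std::fprintf(stderr, "'%s' failed: %s\n", #expr,                  \
                   cudaGetErrorString(error_));                         \
      std::exit(EXIT_FAILURE);                                          \
    }                                                                   \
  } while (0)
```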
310ed81ac3 fix description in example 12. (#444)
Co-authored-by: Exusial <Exusial>
2022-04-24 16:29:06 -04:00
4c0d6e1eb4 [BUGFIX]: Force unrolling a loop that doesn't have a compile-time constant (#441)
trip count is dangerous.

Signed-off-by: Peter Han <fujun.han@iluvatar.ai>
2022-04-24 16:28:32 -04:00
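A small illustrative sketch (not the patched kernel) of the point being made: `#pragma unroll` is only safe and meaningful when the loop's trip count is a compile-time constant; with a runtime bound, forcing full unrolling should be avoided:

```
__device__ void accumulate(float const* x, float* acc, int runtime_n) {
  constexpr int kN = 8;  // compile-time constant trip count

  #pragma unroll
  for (int i = 0; i < kN; ++i) {   // safe to force full unrolling
    acc[i] += x[i];
  }

  // Trip count known only at runtime: do not force full unrolling here.
  for (int i = 0; i < runtime_n; ++i) {
    acc[0] += x[i];
  }
}
```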
167ac54c65 Fix link to Python example (#469) 2022-04-23 15:37:38 -04:00
12f4108ac2 CUTLASS 2.9 (#468) 2022-04-23 15:02:38 -04:00
dd571f0edb [style] fix code indentation (#449)
* [docs] fix typo in media/docs/layout.md

* [docs] fix comment error

* fix typo in include/cutlass/arch/simd_61.h

* fix stride comment errors in TensorLayout

* fix indentation
2022-04-03 21:13:17 -04:00
6d0d265047 Update PUBLICATIONS.md (#447) 2022-04-03 21:03:28 -04:00
f11fa975a5 Update PUBLICATIONS.md
@tsuki
2022-03-23 21:04:43 -04:00
0e71d9b450 Transposed conv2d and wgrad split k examples (#413)
* add split k wgrad example

* wgrad done

* begin transposed conv2d example

* update transposed conv2d example and add ref check

* update doc for conv2d transpose example

* add license

* add wgrad doc

* more clarification on GEMM output type

* typo fix

* clean up indent

* address comments

* rename example numbers to 34 and 35

* GEMM -> Implicit GEMM

* Revert "rename example numbers to 34 and 35"

This reverts commit 551a808c22.

* transposed_conv2d is 34

* add compiler and device version check to exit gracefully

Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2022-03-23 14:52:54 -04:00
eb0d4c9213 [library] pass pointer of arguments to get_host_workspace_size() in gemm_universal() (#412)
Otherwise GemmUniversalOperation::get_host_workspace_size() will fail with a segmentation fault.
2022-03-22 12:36:34 -04:00
bc45e2c023 fixed datatype error of numeric_limit for uint1b_t (#419)
Co-authored-by: Haojin Yang <haojin.yang@.hpi.uni-potsdam.de>
2022-03-22 12:30:30 -04:00
095cbba57c Example 23 - Passing correct alpha and beta values with --parallel-split-k (#424)
When split-k is enabled, we should set alpha to 1 and beta to 0 for the
split-k gemm kernel.

The fix was from hwu36. I only fixed some minor typos along with his fix.
2022-03-22 12:27:34 -04:00
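Conceptually (a hypothetical helper, not the example's actual code), the argument handling described above looks like this: the partial-products GEMM runs with alpha = 1 and beta = 0, and the user-supplied alpha/beta are applied only by the reduction kernel that combines the K slices:

```
struct EpilogueScalars { float alpha; float beta; };

// Choose the scalars passed to the GEMM kernel itself.
EpilogueScalars gemm_scalars(bool parallel_split_k,
                             float user_alpha, float user_beta) {
  if (parallel_split_k) {
    return {1.0f, 0.0f};           // defer scaling to the reduction kernel
  }
  return {user_alpha, user_beta};  // regular (non-split-k) path
}
```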
8f1fe7a132 Fix separate compilation -dc (#433)
* Fix separate compilation `-dc`

- when CUTLASS is included in multiple compilation units
  compiled with `-dc`, the OOB_NAN_F16x8 device constant is
  instantiated multiple times, causing a
  Multiple definition of '_ZN7cutlass4arch13OOB_NAN_F16x8E' error.
  This PR makes this variable a local constant, as it is not
  modified at runtime

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>

* Fix

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>

* Test GH

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>

* Revert test GH

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
2022-03-22 12:21:18 -04:00
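A simplified sketch of the general fix pattern (illustrative names and values, not the exact CUTLASS code): a namespace-scope `__device__` constant defined in a header yields a definition in every translation unit and can break `-dc` (relocatable device code) links, whereas a function-local constant introduces no shared device symbol:

```
#include <cstdint>

// Before (in a header): every TU compiled with -dc emits a definition,
// producing "multiple definition" errors at device link time.
//   __device__ const uint32_t OOB_FILL[4] = { /* packed half NaNs */ };

// After: keep the value as a local constant in the function that uses it;
// it is never modified at runtime, so no shared device object is needed.
__device__ void fill_out_of_bounds(uint32_t* dst) {
  uint32_t const kPackedHalfNaN = 0x7eff7effu;  // illustrative bit pattern
  for (int i = 0; i < 4; ++i) {
    dst[i] = kPackedHalfNaN;
  }
}
```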
3ab1eacf09 Fix typo in profiler examples (#437) 2022-03-21 12:00:13 -04:00
cd39c75e25 Fix typo in docs, code comments (#429)
* [docs] fix typo in media/docs/layout.md

* [docs] fix comment error

* fix typo in include/cutlass/arch/simd_61.h

* fix stride comment errors in TensorLayout
2022-03-15 21:54:36 -04:00
b2e1e97cb1 Update PUBLICATIONS.md
ACM Trans on Graphics from nv research.
2022-03-01 22:37:18 -05:00
96a11a1ef3 Removed trivial copy constructors on parameter classes to enable devi… (#366)
* Removed trivial copy constructors on parameter classes to enable device-side launch of CUTLASS kernels

* Added SFINAE to the `TensorRef(NonConstTensorRef const&)` constructor to avoid making it a copy-constructor for device code

* std => platform

* fix affine2

* really fix affine2

Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2022-02-28 21:34:02 -05:00
e96f00586c Make cutlass::gemm::device::GemmArray usable (#295)
* Fix the build of cutlass/gemm/device/gemm_array.h and add a demo for GemmArray

* Add a reference to GemmArray to the docs

Co-authored-by: Ivan Komarov <dfyz@yandex-team.ru>
2022-02-17 20:01:05 -05:00
3cfa5db2a2 Actually use float accumulation in gemm_f16t_f16t_f16t_wmma_tensor_op… (#407)
* Actually use float accumulation in gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu

As title

* Update gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu

change the missing one

Co-authored-by: Haicheng Wu <57973641+hwu36@users.noreply.github.com>
2022-02-16 09:53:21 -05:00
1db6971a8d Remove unused gemm_k_iterations in GemmKernel::Params (#406)
Otherwise we get "gemm_k_iterations is uninitialized" warnings.
2022-02-16 09:52:45 -05:00
b954127297 Update PUBLICATIONS.md
@jackkosaian
2022-02-14 16:54:32 -05:00
d0d941efc7 [hardswish] correct implementation (#403)
* [hardswish] correct implementation

* seems working

* hardswish fp32/fp16x2 optimization

* [relu] half2 support

* add relu0; add multiply_add_relu0;

* cleanup

Co-authored-by: Bing Xu <bingxu@fb.com>
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2022-02-09 14:28:53 -05:00
8a951b2940 Enable convolution with fused epilogue for Volta Tensor Cores (#402)
* Enabled convolution with epilogue fusion for Volta Tensor Cores.

* Compilation fixes

* Disabled testing Volta on Ampere architectures.
2022-01-30 23:24:50 -05:00
1e4703cbab Support parallel split K mode for profiling (#277)
* Support parallel split K mode for profiling

Signed-off-by: Peter Han <fujun.han@iluvatar.ai>

* Parallel Split K support

  1. find gemm kernel by preference key
  2. switch m n for reduction kernel

Signed-off-by: Peter Han <fujun.han@iluvatar.ai>

* parallel splitk for fp16 gemm

* add one missing file

Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2022-01-27 10:37:37 -05:00
c3353add63 Merge pull request #388 from depaulmillz/fix/headersonly
Fix utils include not being installed in header only
2022-01-26 14:22:51 -06:00
ac8825b941 Minor fix to change from LIBRARY_INIT to LIBRARY 2022-01-26 15:17:46 -05:00
8fd94806e5 Update PUBLICATIONS.md
add mlsys 2022 paper.
2022-01-17 00:08:18 -05:00
d7c9cbf0b9 Fix typo in scripts/library.py (wrong data size for u8) (#393) 2022-01-07 13:29:56 -05:00
c2ee13a0fe Add epilogue functor for residual block fusion (#391)
* Add epilogue functor for residual block fusion

* Do not run split-k tests when ActivationOp is not Identity

* explain TestSplitK param

* return early
2021-12-29 22:53:40 -05:00
f78994bb40 add the missing pieces (#392)
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2021-12-25 04:29:54 -08:00
dceabd4c5a Support half precision sigmoid activation (#378)
* Support half precision sigmoid activation

* introduce a vectorized variant using fast_tanh

* move the math to fast_math.h

* fixed compile

* .raw() -> .to_half()

Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2021-12-22 14:45:06 -05:00
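A sketch of the identity behind the vectorized variant mentioned above: sigmoid(x) = 1 / (1 + exp(-x)) = 0.5 * (1 + tanh(x / 2)), which lets a fast tanh approximation (such as the one in `cutlass/fast_math.h`) stand in for the exponential. Shown here in plain float for clarity, not as the actual epilogue code:

```
#include <cmath>
#include <iostream>

float sigmoid_via_tanh(float x) {
  return 0.5f * (1.0f + std::tanh(0.5f * x));
}

int main() {
  for (float x : {-2.0f, 0.0f, 2.0f}) {
    float reference = 1.0f / (1.0f + std::exp(-x));  // direct definition
    std::cout << x << ": " << sigmoid_via_tanh(x)
              << " vs " << reference << "\n";
  }
  return 0;
}
```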
86fa1dc30b Fix utils include not being installed in header only 2021-12-21 12:10:26 -05:00
288af365db Added missing synchronization to avoid WAR hazards between tiles. (#386) 2021-12-20 08:34:08 -08:00
0dc3ba60b3 Refactor GELU and Sigmoid epilogue to use a common template (and add SiLu, Hardswish epilogue) (#379)
* Support half precision sigmoid activation

* introduce a vectorized variant using fast_tanh

* refactored sigmoid using the new interface

* refactored gelu

* add silu activation

* add hardswish

* remove sigmoid for now

* add description to silu and hardswish, and other doc update

* Do not ignore Round

* use constant N

* Set isHeavy = true in sigmoid and silu epilogue
2021-12-18 14:58:15 -05:00
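Reference formulas for the two activations added above (a plain-float sketch, not the templated CUTLASS epilogue functors): SiLU(x) = x * sigmoid(x) and Hardswish(x) = x * clamp(x + 3, 0, 6) / 6.

```
#include <algorithm>
#include <cmath>
#include <iostream>

float silu(float x) { return x / (1.0f + std::exp(-x)); }

float hardswish(float x) {
  return x * std::min(std::max(x + 3.0f, 0.0f), 6.0f) / 6.0f;
}

int main() {
  for (float x : {-4.0f, -1.0f, 0.0f, 1.0f, 4.0f}) {
    std::cout << x << ": silu=" << silu(x)
              << " hardswish=" << hardswish(x) << "\n";
  }
  return 0;
}
```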
ec4f7e5194 Updates to fused epilogue (#383)
* Enhancements and fixes to fused GEMM and Convolution epilogue.
* Need to explicitly list cudart as unit test library dependency.
2021-12-17 16:04:43 -05:00
4e666e1dfd Updated README and added issue templates. (#382) 2021-12-17 09:26:20 -05:00
3799e12f25 Merge pull request #381 from Peter9606/update-makefile-version
Update project version to 2.8.0 in CMakeLists.txt
2021-12-16 21:54:57 -05:00
fc3bc85db8 Update project version to 2.8.0 in CMakeLists.txt
Signed-off-by: Peter Han <fujun.han@iluvatar.ai>
2021-12-17 02:23:31 +00:00
49c0a58d50 Set theme jekyll-theme-minimal 2021-12-15 14:51:24 -05:00
5fe09c2d67 Updated GEMM performance plot with CUTLASS 2.8 compiled with CUDA 11.5 Toolkit (#375)
Updated GEMM performance plot with CUTLASS 2.8 compiled using CUDA 11.5 Toolkit.

GPUs under test:

    NVIDIA A100
    NVIDIA A2
    NVIDIA TitanV
    NVIDIA GeForce 2080 Ti
2021-12-06 14:21:33 -05:00
6b69c79ac3 Fixed contributor formatting. (#365) 2021-11-22 11:30:53 -08:00
62e438f450 Listed Matthew Nicely as the CUTLASS product manager.. (#364) 2021-11-19 17:51:21 -08:00
808c25337a CUTLASS 2.8 (#363)
CUTLASS 2.8
2021-11-19 13:26:35 -08:00
6fc5008803 Update quickstart.md
fix a broken link
2021-11-11 09:53:46 -05:00
a3bcc6981d Merge pull request #331 from reed-lau/feature/fix-wmma-shape-typo
fix wmma shape typo
2021-09-28 10:20:29 -04:00
3b28642801 fix wmma shape typo 2021-09-28 19:04:09 +08:00
538592dea4 example 23 gemm operand reduction fusion (#325) 2021-09-20 13:34:47 -07:00
2e07c4cc2f CUTLASS 2.7 (#318)
CUTLASS 2.7

Mainloop fusion for GEMM: summation over A or B
Strided DGRAD (optimized iterators)
Half-precision GELU_taylor activation functions
Use these when accumulation and epilogue compute types are all cutlass::half_t
Tuning and bug fixes to fused GEMM + GEMM example
Support for smaller than 128b aligned Convolutions: see examples
Caching of results to accelerate Convolution unit tests
Can be enabled or disabled by running cmake .. -DCUTLASS_TEST_ENABLE_CACHED_RESULTS=OFF
Corrections and bug fixes reported by the CUTLASS community
Thank you for filing these issues!

authored-by: Haicheng Wu haichengw@nvidia.com, Manish Gupta manigupta@nvidia.com, Dustyn Blasig dblasig@nvidia.com, Andrew Kerr akerr@nvidia.com
2021-09-20 11:02:22 -07:00
9ac255863f Merge pull request #246 from mengchihe/master
support unaligned input for conv2d fprop stage=2; fix for issue #242
2021-09-08 11:40:53 -04:00
59e2aa505a refine the implementation 2021-09-08 13:14:08 +00:00
4e8af93da1 Merge remote-tracking branch 'origin/master' into small_alignment 2021-09-07 20:39:38 +00:00
6c2f8f2fb8 CUTLASS 2.6.1 - functional and performance enhancements to strided DGRAD, fixes, and tuning
* cutlass 2.6 update

* remove debug prints

* cutlass 2.6.1 (minor update)

* Updated CHANGELOG.

* Minor edit to readme to indicate patch version.

* Minor edit to readme.

Co-authored-by:  Haicheng Wu <haichengw@nvidia.com>, Andrew Kerr <akerr@nvidia.com>
2021-09-03 10:26:15 -07:00
598e35401c Merge remote-tracking branch 'origin/master' into small_alignment 2021-08-16 07:49:08 -07:00
a01feb93d9 Merge pull request #308 from dongxiao92/patch-1
fix typo in doc
2021-08-08 11:54:42 -07:00
d36f331b44 fix typo in doc
fix typo
2021-08-08 16:44:22 +08:00
69abafb85a Merge pull request #306 from NVIDIA/fix-profiler-cmd-doc
Fix profiler cmd doc
2021-07-30 14:36:54 -04:00
68a078fbbf cleanup 2021-07-30 11:27:21 -07:00
10709dbb64 clean profiler cmd and doc 2021-07-30 11:02:17 -07:00
1227351079 Merge pull request #305 from NVIDIA/fix_epilogue_spill
fix epilogue register spill
2021-07-29 14:30:11 -07:00
a77c658439 fix epilogue register spill 2021-07-29 14:25:48 -07:00
4516b833ce Merge pull request #303 from Peter9606/doc_typo
Doc typo
2021-07-28 20:49:06 -04:00
64dd1e1915 Doc typo
Signed-off-by: Peter Han <fujun.han@iluvatar.ai>
2021-07-29 08:45:59 +08:00
1ac4559d12 Cutlass 2.6 Update 1 (#301)
* cutlass 2.6 update

* remove debug prints
2021-07-27 17:58:30 -07:00
e5d51840e8 CUTLASS 2.6 (#298)
CUTLASS 2.6
2021-07-23 00:40:53 -04:00
6c29fe20ba Merge pull request #285 from tjingrant/patch-1
Typo Fixes
2021-07-05 22:51:19 -04:00
e3c56b0d6b Update predicated_tile_iterator.h 2021-07-05 12:11:53 -04:00
4647c57243 Update predicated_tile_iterator.h 2021-07-05 12:06:41 -04:00
856d4db3fb Update basic_gemm.cu
fix the matrix malloc size
2021-06-15 09:08:36 -04:00
6a1064093f Merge pull request #274 from mani-ananth/master
Some pending Bug fixes
2021-06-02 13:17:39 -04:00
c5f1ef4dff update contributors 2021-06-02 10:11:42 -07:00
47ebfccbec bug fixes 2021-06-02 10:08:25 -07:00
ad9486684f Merge pull request #272 from BernardoCovas/master
Bug in reference conv3d
2021-05-28 17:18:27 -04:00
1d8372a8e2 fix typo in reference conv3d 2021-05-28 21:06:59 +01:00
9cb7d63424 Merge pull request #266 from mani-ananth/master
Fixes for public issue #265
2021-05-19 15:15:22 -04:00
da2f110906 Fixes for public issue #265 2021-05-19 10:16:52 -07:00
b68113f5be Merge pull request #264 from zheng95z/patch-3
Adds `NoBetaScaling` for `LinearCombination`
2021-05-17 10:03:30 -04:00
a68d7cd6f1 Adds NoBetaScaling for LinearCombination 2021-05-12 22:23:55 +08:00
38e8b29f56 Merge pull request #259 from hzfan/ignore_pr
Add gitignore
2021-05-10 20:07:53 -04:00
ee7349c94f fix 2021-05-10 16:39:04 +08:00
8cdd4293d4 add gitignore 2021-05-10 16:37:59 +08:00
f58b843951 Merge pull request #239 from KeDengMS/kedeng/gelu
Fixes to Gelu for half and fusion
2021-05-08 12:51:42 -04:00
5fc142296f Merge pull request #237 from Peter9606/issue_236_typo
Typo fix issue#236
2021-05-08 07:51:19 -04:00
233d69aa6d Merge pull request #235 from Peter9606/issue_233_tranpose_update
tranpose.h update based on issue#233
2021-05-07 07:14:30 -04:00
9840d25269 Merge pull request #256 from zheng95z/patch-2
Fixes some typos in utilities.md
2021-05-06 11:02:49 -04:00
b878c96421 Fixes some typos in utilities.md 2021-05-06 22:37:37 +08:00
8f8a80cad5 Merge pull request #251 from zheng95z/patch-1
add a missing 'device_memory::' before a function
2021-04-25 22:09:44 -04:00
a8f6f8eb07 add a missing 'device_memory::' before a function 2021-04-25 20:05:39 +08:00
f4b0a33633 add unit test for non int4 load 2021-04-23 14:33:46 +08:00
7c783adf53 Merge pull request #247 from xue-fc/patch-1
fix a wrong description
2021-04-22 09:27:40 -04:00
4000df9567 fix a wrong description 2021-04-22 20:28:28 +08:00
bb35a3ba6f support setting load granularity for conv2d fprop 2021-04-22 15:20:57 +08:00
7ec3a87f22 support unaligned input for conv2d fprop stage=2; fix for issue #242 2021-04-21 14:40:05 +08:00
0b74c8f473 Address CR 2021-04-19 23:36:06 +00:00
83036ed646 More clean up 2021-04-18 04:29:20 +00:00
b7e43f5eb9 Clean up 2021-04-18 04:24:25 +00:00
5c62d892fa Add test 2021-04-18 04:09:34 +00:00
41a31b404b Fixes to Gelu for half and fusion 2021-04-17 22:10:19 +00:00
7320aee17d Typo fix issue#236
Signed-off-by: Peter Han <fujun.han@iluvatar.ai>
2021-04-15 15:08:35 +08:00
2142a05d9d tranpose.h update based on issue#233
1. Add 'pragma once' preprocessor directive
2. Replace prmt PTX with __byte_perm intrinsic

Signed-off-by: Peter Han <fujun.han@iluvatar.ai>
2021-04-14 19:58:00 +08:00
c77a524459 Merge pull request #230 from mani-ananth/master
Fix for issue #221
2021-04-09 14:45:55 -04:00
fac6680f31 Merge branch 'master' of github.com:NVIDIA/cutlass 2021-04-09 11:36:31 -07:00
08993707da fixing functional bug in fused epilogue 2021-04-09 11:36:03 -07:00
c805593ebe Merge pull request #228 from mani-ananth/master
Fix for issue#224 and issue#225
2021-04-08 10:08:13 -04:00
26556d7206 fix a broken sparse gemm example. found by the community. 2021-04-07 13:32:55 -07:00
4839b6cb61 add 2stage fprop 3d into default file 2021-04-07 13:29:32 -07:00
d97214987a Merge pull request #220 from Peter9606/wrong-stride-array-definition
Bugfix: typo, make reduction device cases pass
2021-04-02 08:43:52 -04:00
b0bbc6d548 Merge pull request #219 from mani-ananth/master
Fix for issue #211
2021-04-02 08:42:09 -04:00
7074047a54 Bugfix: typo, make reduction device cases pass
Signed-off-by: Peter Han <fujun.han@iluvatar.ai>
2021-04-02 09:35:23 +08:00
75a4737cfe Fix for public issue #211
- Add a slice-K tile size to the profiler
- fix num warps calculations in implicit gemm header
2021-04-01 14:42:00 -07:00
8a3e4b8d02 Merge pull request #214 from Peter9606/separate-stream-error
Bugfix: memsetAsync uses wrong default stream
2021-03-24 12:09:01 -04:00
6a6b4028bd Revert wrong fix of params.update in GemmUniversalBase
Signed-off-by: Peter Han <fujun.han@iluvatar.ai>
2021-03-23 23:20:40 +08:00
92393b2676 Bugfix: memsetAsync uses wrong default stream
Signed-off-by: Peter Han <fujun.han@iluvatar.ai>
2021-03-23 21:11:42 +08:00
50bf00e5f2 Merge pull request #193 from Peter9606/public_shape_type_from_Mma_HFMA2
HFMA2 Convolutions for SM60 onwards
2021-03-05 21:38:59 -05:00
4cd004ead1 fix test name to optimized and instantiate large tile sizes to speed up unit tests 2021-03-05 13:32:36 -08:00
6c4539e372 Make arch tag of test cases more precise: SM60
Signed-off-by: Peter Han <fujun.han@iluvatar.ai>
2021-03-05 10:53:26 +08:00
a3639ab1a0 Append fp16 test case to verify Mma_HFMA2
Signed-off-by: Peter Han <fujun.han@iluvatar.ai>
2021-03-04 18:17:57 +08:00
169181f30f Make Shape public from Mma_HFMA2.
Signed-off-by: Peter Han <fujun.han@iluvatar.ai>
2021-03-04 11:05:16 +08:00
0f1056390d Create PUBLICATIONS.md (#189) 2021-03-03 11:17:40 -08:00
34a42e5620 Update generator.py (#192) 2021-03-02 12:21:48 -08:00
8f09b82b12 Merge pull request #187 from NVIDIA/cutlass_2.5
CUTLASS 2.5.0
2021-02-26 23:56:04 -06:00
200a5a5146 Enabled reduction unit tests. 2021-02-26 15:46:57 -05:00
746b7b3247 Enabled tensor reduction kernels. 2021-02-26 15:32:19 -05:00
abdf16a4d9 Updated release notes. 2021-02-26 13:55:04 -05:00
0e13748649 CUTLASS 2.5 2021-02-26 09:58:26 -05:00
ccb697bac7 cutlass 2.4 documentation only update 2020-11-23 06:59:45 -06:00
e6bcdc60cf fix broken links (#148) 2020-11-19 21:46:54 -08:00
6615010cd0 CUTLASS 2.4 (Implicit GEMM convolution) (#147)
CUTLASS 2.4 (Implicit GEMM Convolution)

Co-authored-by: Manish Gupta <manigupta@nvidia.com>, Haicheng Wu <haichengw@nvidia.com>, Dustyn Blasig <dblasig@nvidia.com>, Andrew Kerr <akerr@nvidia.com>
2020-11-19 21:25:25 -08:00
c2b80ad4e4 Merge pull request #135 from NVIDIA/cutlass_2.3_final
CUTLASS 2.3.0
2020-09-25 13:25:26 -05:00
37a8f9e598 CUTLASS 2.3.0 final. 2020-09-25 10:34:46 -07:00
c53f3339bb CUTLASS 2.3 initial commit (#134)
CUTLASS 2.3 adds GEMMs targeting Sparse Tensor Cores on the NVIDIA Ampere Architecture, fast SGEMM, small matrix classes, bug fixes, and performance enhancements.
2020-09-23 14:00:58 -07:00
4dac7490e6 Typoes (#107)
* Update splitk_gemm.cu

* Update gemm_bias_relu.cu

* Update mma_sm75.h
2020-07-13 14:25:52 -07:00
fd7e058d0c Added examples to enable the unity build (#102)
* Updated documentation of fused GEMM example and removed UNITY BUILD batch size. The default batch size when unity build is enabled tends to be favorable.
2020-06-17 07:09:18 -07:00
1455 changed files with 367055 additions and 21196 deletions

.github/ISSUE_TEMPLATE/bug_report.md vendored Normal file (+23 lines)

@ -0,0 +1,23 @@
---
name: Bug report
about: Create a bug report to help us improve CUTLASS
title: "[BUG]"
labels: "? - Needs Triage, bug"
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug is.
**Steps/Code to reproduce bug**
Follow this guide http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports to craft a minimal bug report. This helps us reproduce the issue you're having and resolve the issue more quickly.
**Expected behavior**
A clear and concise description of what you expected to happen.
**Environment details (please complete the following information):**
- Environment location: [Bare-metal, Docker, Cloud(specify cloud provider)]
**Additional context**
Add any other context about the problem here.


@ -0,0 +1,35 @@
---
name: Documentation request
about: Report incorrect or needed documentation to improve CUTLASS
title: "[DOC]"
labels: "? - Needs Triage, documentation"
assignees: ''
---
## Report incorrect documentation
**Location of incorrect documentation**
Provide links and line numbers if applicable.
**Describe the problems or issues found in the documentation**
A clear and concise description of what you found to be incorrect.
**Steps taken to verify documentation is incorrect**
List any steps you have taken:
**Suggested fix for documentation**
Detail proposed changes to fix the documentation if you have any.
---
## Report needed documentation
**Report needed documentation**
A clear and concise description of what documentation you believe is needed and why.
**Describe the documentation you'd like**
A clear and concise description of what you want to happen.
**Steps taken to search for needed documentation**
List any steps you have taken:


@ -0,0 +1,20 @@
---
name: Feature request
about: Suggest an idea for CUTLASS
title: "[FEA]"
labels: "? - Needs Triage, feature request"
assignees: ''
---
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I wish I could use CUTLASS to do [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Additional context**
Add any other context, code examples, or references to existing implementations about the feature request here.


@ -0,0 +1,10 @@
---
name: Submit question
about: Ask a general question about CUTLASS
title: "[QST]"
labels: "? - Needs Triage, question"
assignees: ''
---
**What is your question?**

.github/workflows/labeler.yml vendored Normal file (+11 lines)

@ -0,0 +1,11 @@
name: "Pull Request Labeler"
on:
- pull_request_target
jobs:
triage:
runs-on: ubuntu-latest
steps:
- uses: actions/labeler@main
with:
repo-token: "${{ secrets.GITHUB_TOKEN }}"


@ -0,0 +1,35 @@
name: Auto Assign New Issues to Triage Project
on:
  issues:
    types: [opened]
env:
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
jobs:
  assign_one_project:
    runs-on: ubuntu-latest
    name: Assign to New Issues to Triage Project
    steps:
    - name: Process bug issues
      uses: docker://takanabe/github-actions-automate-projects:v0.0.1
      if: contains(github.event.issue.labels.*.name, 'bug') && contains(github.event.issue.labels.*.name, '? - Needs Triage')
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        GITHUB_PROJECT_URL: https://github.com/NVIDIA/cutlass
        GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing'
    - name: Process feature issues
      uses: docker://takanabe/github-actions-automate-projects:v0.0.1
      if: contains(github.event.issue.labels.*.name, 'feature request') && contains(github.event.issue.labels.*.name, '? - Needs Triage')
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        GITHUB_PROJECT_URL: https://github.com/NVIDIA/cutlass
        GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing'
    - name: Process other issues
      uses: docker://takanabe/github-actions-automate-projects:v0.0.1
      if: contains(github.event.issue.labels.*.name, '? - Needs Triage') && (!contains(github.event.issue.labels.*.name, 'bug') && !contains(github.event.issue.labels.*.name, 'feature request'))
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        GITHUB_PROJECT_URL: https://github.com/NVIDIA/cutlass
        GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing'

.github/workflows/stale.yml vendored Normal file (+57 lines)

@ -0,0 +1,57 @@
name: Mark inactive issues and pull requests
on:
  schedule:
  - cron: "0 * * * *"
jobs:
  mark-inactive-30d:
    runs-on: ubuntu-latest
    steps:
    - name: Mark 30 day inactive issues and pull requests
      uses: actions/stale@v3
      with:
        repo-token: ${{ secrets.GITHUB_TOKEN }}
        stale-issue-message: >
          This issue has been labeled `inactive-30d` due to no recent activity in the past 30 days.
          Please close this issue if no further response or action is needed.
          Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed.
          This issue will be labeled `inactive-90d` if there is no activity in the next 60 days.
        stale-issue-label: "inactive-30d"
        exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue"
        days-before-issue-stale: 30
        days-before-issue-close: -1
        stale-pr-message: >
          This PR has been labeled `inactive-30d` due to no recent activity in the past 30 days.
          Please close this PR if it is no longer required.
          Otherwise, please respond with a comment indicating any updates.
          This PR will be labeled `inactive-90d` if there is no activity in the next 60 days.
        stale-pr-label: "inactive-30d"
        exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue"
        days-before-pr-stale: 30
        days-before-pr-close: -1
        operations-per-run: 50
  mark-inactive-90d:
    runs-on: ubuntu-latest
    steps:
    - name: Mark 90 day inactive issues and pull requests
      uses: actions/stale@v3
      with:
        repo-token: ${{ secrets.GITHUB_TOKEN }}
        stale-issue-message: >
          This issue has been labeled `inactive-90d` due to no recent activity in the past 90 days.
          Please close this issue if no further response or action is needed.
          Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed.
        stale-issue-label: "inactive-90d"
        exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue"
        days-before-issue-stale: 90
        days-before-issue-close: -1
        stale-pr-message: >
          This PR has been labeled `inactive-90d` due to no recent activity in the past 90 days.
          Please close this PR if it is no longer required.
          Otherwise, please respond with a comment indicating any updates.
        stale-pr-label: "inactive-90d"
        exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue"
        days-before-pr-stale: 90
        days-before-pr-close: -1
        operations-per-run: 50

.gitignore vendored Normal file (+2 lines)

@ -0,0 +1,2 @@
# PyCache files
__pycache__/


@ -1,6 +1,190 @@
# NVIDIA CUTLASS Changelog
# CUTLASS 2.x
## [2.11.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.11.0) (2022-11-19)
* [Stream-K](/examples/47_ampere_gemm_universal_streamk), which is a new general way to do split-K. It can not only improve performance, but can also significantly reduce the number of tile sizes that need to be profiled to find the best one.
* [Fused multi-head attention Kernel](/examples/41_fused_multi_head_attention). It has two variants: one uses batched GEMM for the fixed sequence length, and the other one uses group GEMM for the variable sequence length. Both versions just need one kernel.
* [Dual GEMM](/examples/45_dual_gemm), which can fuse A x B and A x C into one kernel. The two GEMMs have no producer-consumer dependency.
* Hopper improves [double precision matrix multiplication](/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm90.cu) by 2x compared to Ampere at iso-clocks. It is supported since CUDA 11.8.
* [BLAS3](/test/unit/gemm/device/hemm_cf64_cf64_cf64_tensor_op_f64_sm90.cu) functions with Hopper's new double precision matrix multiplication instructions.
* [ELL Block Sparse GEMM](/examples/43_ell_block_sparse_gemm), which uses an [ELL matrix](https://developer.nvidia.com/blog/accelerating-matrix-multiplication-with-block-sparse-format-and-nvidia-tensor-cores/) to describe the sparsity of the A matrix. B and output matrices are still dense. The block size can be arbitrary.
* Optimized [Group Conv](/examples/42_ampere_tensorop_group_conv) for SingleGroup mode, which requires that the output channel per group is a multiple of Threadblock tile N.
* [Optimized DepthWise Conv](/examples/46_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu). Two new modes are added
* [kOptimized](/test/unit/conv/device/depthwise_conv2d_fprop_direct_conv_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu) - use direct conv to compute instead of implicit GEMM.
* The restrictions are: 1) input channel, output channel, and group number should be multiples of (128 / sizeof(input element)). 2) The input filter size should be the same as the template parameter configuration.
* [kFixedStrideDilation](/test/unit/conv/device/depthwise_conv2d_fprop_direct_conv_fixed_stride_dilation_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu) - which puts stride and dilation into templates to further improve the performance. In this mode, the kernel keeps some inputs persistent in registers to squeeze out more performance, so large filter/stride/dilation values are not recommended.
* The restrictions are: 1) input channel, output channel, and group number should be multiples of (128 / sizeof(input element)). 2) The input filter size, stride, and dilation should be the same as the template parameter configuration.
* [Scripts](/examples/44_multi_gemm_ir_and_codegen) to fuse multiple back-to-back GEMM. Its implementation was discussed in a GTC'22 Spring [talk](https://www.nvidia.com/en-us/on-demand/session/gtcspring22-s41606/).
* [FP8 data type definition](/include/cutlass/float8.h) and [conversion routines](/include/cutlass/numeric_conversion.h#L1274-2115).
* Updates and bugfixes from the community (thanks!). Big shout out to Meta's [xFormers](https://github.com/facebookresearch/xformers).
* **Deprecation announcement:** CUTLASS plans to deprecate the following:
* Maxwell and Pascal GPU architectures
* Ubuntu 16.04
* CUDA 10.2
## [2.10.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.10.0) (2022-08-23)
* [CUTLASS Python](/examples/40_cutlass_py) now supports GEMM, CONV, Group GEMM for different data types as well as different epilogue flavours.
* Optimizations for CUTLASS's [Grouped GEMM](examples/24_gemm_grouped/gemm_grouped.cu) kernel. Threadblock scheduling part is improved. Some computation can be moved to the host side if applicable. [Grouped Syr2k](examples/38_syr2k_grouped/syr2k_grouped.cu) kernels are added, too.
* Optimizations for [GEMM+Softmax](examples/35_gemm_softmax). All the reduction computation is fused into the previous GEMM. More template arguments are provided to fine tune the performance.
* [Grouped GEMM for Multihead Attention](examples/41_multi_head_attention). This general group gemm based MHA does not require the sequence length of all GEMMs to be the same which makes it most useful for natural language processing.
* [GEMM + Layer norm fusion for Ampere](examples/37_gemm_layernorm_gemm_fusion/) splits the layernorm into two parts that can be fused into the GEMMs before and after it, respectively. In addition to using the square sum to compute the variance of the layernorm, [Shift-K](https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data) is provided in case the square sum raises numerical issues.
* [GEMM Epilogue Permutation Fusion](examples/39_gemm_permute) can apply user provided permutation layout mapping in the GEMM epilogue.
* [Grouped convolution targeting implicit GEMM](test/unit/conv/device/group_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) introduces the first group convolution implementation to CUTLASS. It is an Analytical implementation, not an Optimized one. The restrictions are: 1) the input and output channel numbers should be multiples of the group number. 2) split-K is not supported. The implementation has 2 modes:
* kSingleGroup: output channel per group is multiple of Threadblock tile N.
* kMultipleGroup: Threadblock tile N is multiple of output channel per group.
* [Depthwise separable convolution](test/unit/conv/device/depthwise_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu) introduces the first depthwise convolution, which is also Analytical for now. The restrictions are: 1) SIMT only; 2) no split-K; 3) the input channel count equals the output channel count equals the group number.
* Standalone [Layernorm](/tools/util/include/cutlass/util/device_layernorm.h) and [Pooling](/tools/util/include/cutlass/util/device_nhwc_pooling.h) kernels.
* [Back-to-back GEMM/CONV](examples/13_two_tensor_op_fusion) relaxes the requirement that the first GEMM K dimension needs to be the multiple of Threadblock Tile K dimension.
* Optimal performance using [**CUDA 11.6u2**](https://developer.nvidia.com/cuda-downloads)
* Updates and bugfixes from the community (thanks!)
## [2.9.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.9.0) (2022-04-21)
* [First layer Convolution kernels](/test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) specialized for small channel counts and reduced alignment
* [Few channels](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h) specialization for reduced alignment capabilities
* [Fixed channels](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h) further specialized when channel count perfectly matches the access vector size
* [Unit tests](/test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu)
* [Python-based instance emitter](/tools/library/scripts/generator.py) in the CUTLASS Library and support in the Profiler
* [BLAS3](https://docs.nvidia.com/cuda/cublas/index.html#cublas-level-3-function-reference) operators accelerated by Tensor Cores
* Supported types: f32, cf32, f64, cf64, tf32x3, complex tf32x3
* [HERK](/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu) with [emitter](/tools/library/scripts/rank_k_operation.py)
* [SYRK](/test/unit/gemm/device/syrk_f32n_f32t_tensor_op_fast_f32_sm80.cu) with [emitter](/tools/library/scripts/rank_k_operation.py)
* [SYMM](/test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu) with [emitter](/tools/library/scripts/symm_operation.py)
* [TRMM](/test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu) with [emitter](/tools/library/scripts/trmm_operation.py)
* [Unit tests](/test/unit/gemm/device/testbed_rank_k_universal.h)
* [CUTLASS Python](/examples/40_cutlass_py) demonstrating JIT compilation of CUTLASS kernels and a Python-based runtime using [CUDA Python](https://developer.nvidia.com/cuda-python)
* [Python-based runtime](/tools/library/scripts/rt.py) interoperable with existing emitters
* [GEMM + Softmax example](/examples/35_gemm_softmax)
* [Gather and Scatter Fusion with GEMM](/examples/36_gather_scatter_fusion) can gather inputs and scatter outputs based on indices vectors in the same GEMM kernel.
* It can select random rows in a row major matrix.
* It can select random columns in a column major matrix.
* [Back-to-back GEMM/CONV](examples/13_two_tensor_op_fusion) fully supports buffering the first GEMM/CONV results in the shared memory for the latter one to use. It can eliminate register spill when the tile size is big. Additionally, bias vector add is supported in the first GEMM/CONV.
* Supported kernels: GEMM and CONV.
* Supported types: fp16 and int8.
* Supported architectures: Turing and Ampere.
* [Transposed Convolution](/examples/34_transposed_conv2d) (a.k.a Deconvolution) support which reuses Dgrad implementation.
* [Utility functions](/tools/util/include/cutlass/util) that can pad NHWC and convert between NCHW and NHWC.
* [Small alignment implicit gemm](https://github.com/NVIDIA/cutlass/issues/242) support for Fprop/Dgrad/Wgrad so that padding is no longer mandated to use tensor cores in these kernels.
* Epilogue enhancement:
* Eliminate bank conflicts in int8 tensor core kernels.
* Half2 usage if epilogue compute type is fp16.
* More activation functions: Silu, Hardswish, Leaky Relu.
* New elementwise fusion pattern for [residual block](/include/cutlass/epilogue/thread/linear_combination_residual_block.h).
* [Group GEMM](/examples/24_gemm_grouped) thread block number calculation fix which helps to launch the intended number of threadblocks to fully occupy the GPUs.
* [Parallel GEMM splitk](https://github.com/NVIDIA/cutlass/pull/277) support in the CUTLASS profiler.
* Optimal performance using [**CUDA 11.6u2**](https://developer.nvidia.com/cuda-downloads)
* Updates and bugfixes from the community (thanks!)
## [2.8.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.8.0) (2021-11-19)
* **TF32x3:** emulated single-precision using Tensor Cores
* 45+ TFLOPs on NVIDIA A100
* [GEMM SDK example](/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu) (real)
* [COMPLEX GEMM SDK example](/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm.cu) (complex)
* [Implicit GEMM Convolution SDK example](/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu)
* **Mainloop fusion for Convolution:** convolution with fused per-channel scale-bias-relu
* [Conv Fprop SDK example](/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu)
* [Conv WGrad SDK example](/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu)
* [cutlass::conv::device::ImplicitGemmConvolutionFusion](/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h)
* **Grouped GEMM:** similar to batched GEMM with distinct problem size per group
* [SDK example](/examples/24_gemm_grouped) with performance comparison with Batched Strided GEMM
* [cutlass::gemm::device::GemmGrouped](/include/cutlass/gemm/device/gemm_grouped.h)
* [Implicit GEMM Convolution fusion](/examples/13_two_tensor_op_fusion/) supports staging the 1st convolution's output accumulator in shared memory on Turing. This allows more flexible warp tile sizes and less register pressure.
* Optimal performance using [**CUDA 11.5**](https://developer.nvidia.com/cuda-downloads)
* Updates from the community (thanks!)
* **Deprecation announcement:** CUTLASS plans to deprecate the following:
* Maxwell and Pascal GPU architectures
* Ubuntu 16.04
* CUDA 10.2
## [2.7.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.7.0) (2021-09-24)
* Mainloop fusion for GEMM: [summation over A or B](/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu)
* [Strided DGRAD (optimized iterators)](/include/cutlass/conv/kernel/default_conv2d_dgrad.h)
* [Half-precision GELU_taylor activation functions](/include/cutlass/epilogue/thread/activation.h#L196)
* Use these when accumulation and epilogue compute types are all `cutlass::half_t`
* Tuning and bug fixes to [fused GEMM + GEMM example](/examples/13_two_tensor_op_fusion/)
* Support for smaller than 128b aligned Convolutions: [see examples](test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu#L272)
* Caching of results to accelerate Convolution [unit tests](test/unit/conv/device/cache_testbed_output.h)
* Can be enabled or disabled by running `cmake .. -DCUTLASS_TEST_ENABLE_CACHED_RESULTS=OFF`
* Corrections and bug fixes reported by the CUTLASS community
* Thank you for filing these issues!
## [2.6.1](https://github.com/NVIDIA/cutlass/releases/tag/v2.6.1) (2021-09-03)
* Arbitrary padding and striding for CUTLASS Strided DGRAD Convolution operator (Analytic Iterators)
* Tuning for GEMMs fused with partial reductions
* Corrections and bug fixes reported by the CUTLASS community
* Thank you for filing these issues!
## [2.6.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.6.0) (2021-07-22)
* Optimal performance when compiled with the [CUDA 11.4 Toolkit](https://developer.nvidia.com/cuda-toolkit)
* Adopt the new L2 prefetch feature in [cp.async](/include/cutlass/arch/memory.h) and [global load](/include/cutlass/arch/memory_sm80.h)
* Fused operators with GEMM and Convolution
* [Fused broadcast in epilogue](test/unit/gemm/device/gemm_with_broadcast_f16n_f16n_f16n_tensorop_f32_sm75.cu)
* [Fused partial reduction in epilogue](/test/unit/gemm/device/gemm_with_reduction_f16n_f16n_f16n_tensorop_f32_sm75.cu)
* 64b tensor strides and leading dimensions support for GEMMs
* Affine rank=2 matrix layouts
* Row stride and column stride for matrices using [cutlass::layout::AffineRank2](/include/cutlass/layout/matrix.h)
* Support [FP64 tensor core](/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu) and SIMT GEMM.
* [Batched GEMV](/test/unit/gemm/device/gemv.cu) preview implementation
* [New strided Dgrad](test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu) implementation
* Accelerates over previous implementation by cutting down redundant math by 4x
* Support for the new `Dy` and `w` analytic iterators with the existing `cutlass::conv::device::ImplicitGemmConvolution` interface
* Quaternion-valued GEMM and Convolution in single- and double-precision (targeting CUDA Cores)
* Updates to [quaternion.h](/include/cutlass/quaternion.h) and [functional.h](/include/cutlass/functional.h)
* SDK Example for [GEMM](/examples/21_quaternion_gemm/quaternion_gemm.cu) and [Convolution](/examples/22_quaternion_gemm/quaternion_conv.cu)
* [Unit tests for GEMM](/test/unit/gemm/device/simt_qgemm_nn_sm50.cu) and [Convolution](/test/unit/conv/device/conv2d_fprop_implicit_gemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32_sm50.cu)
* Many improvements to the epilogue.
* Provides an [option](/include/cutlass/epilogue/threadblock/epilogue.h) to avoid fully unrolling the epilogue, which reduces code size and improves performance when using complicated elementwise operations
* Performance improvement for FP16 tensor core kernels
* Bug fixes
* Enhanced Clang support; the combination of Clang 13 and CUDA 11.4 can build and run kernels on Pascal and Ampere architectures.
* Updated minimum CUDA Toolkit requirement to 10.2
* [CUDA 11.4 Toolkit](https://developer.nvidia.com/cuda-toolkit) recommended
* Corrections and bug fixes reported by the CUTLASS community
* Thank you for filing these issues!
## [2.5.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.5.0) (2021-02-26)
* Tensor reductions
* _m_-to-_n_ reductions of tensors with affine layout
* [Specializations](/test/unit/reduction/device/tensor_reduce_contiguous.cu) for reductions including contiguous dimension
* [Specializations](/test/unit/reduction/device/tensor_reduce_strided.cu) for reductions excluding contiguous dimension
* Custom reduction functors such as `cutlass::logical_and`
* Large tensor support, up to 2^63 elements (however, each dimension is limited to an extent of 2^31)
* Optimizations for 3-D convolution
* [Optimized tile iterators](include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h) using precomputed delta table for 3-D convolution
* Full coverage of [forward](test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu) and [backwards](test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu) passes for 3D convolution
* [Fused Convolution+Convolution example](/examples/13_two_tensor_op_fusion/README.md)
* Corrections and bug fixes reported by the CUTLASS community
* Thank you for filing these issues!
## [2.4.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.4.0) (2020-11-19)
* Implicit GEMM convolution kernels supporting CUDA and Tensor Cores on NVIDIA GPUs
* Operators: forward (Fprop), backward data gradient (Dgrad), and backward weight gradient (Wgrad) convolution
* Data types: FP32, complex<FP32>, Tensor Float 32 (TF32), BFloat16 (BF16), Float16, Int4, Int8, Int32
* Spatial dimensions: 1-D, 2-D, and 3-D
* Layouts: NHWC, NCxHWx
* Implicit GEMM convolution components:
* Global memory iterators supporting Fprop, Dgrad, and Wgrad
* `MmaMultistage` for implicit GEMM convolution for NVIDIA Ampere architecture
* `MmaPipeline` for implicit GEMM convolution for NVIDIA Volta and Turing architectures
* [Documentation](/media/docs/implicit_gemm_convolution.md) describing Implicit GEMM Convolution algorithm and implementation
## [2.3.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.3.0) (2020-09-23)
* [NVIDIA Ampere Architecture features](https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/)
* [Sparse Tensor Core GEMM kernels](test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu):
* Direct access to Sparse Tensor Cores and maximum performance via [`mma.sp.sync`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma-and-friends)
* Fast SGEMM targeting GeForce RTX 30-series CUDA Cores
* Minor Features:
* [Activation functions](/include/cutlass/epilogue/thread/activation.h) such as [GeLU](/include/cutlass/epilogue/thread/linear_combination_gelu.h) and [Sigmoid](/include/cutlass/epilogue/thread/linear_combination_sigmoid.h)
* Small [matrix](/include/cutlass/matrix.h) and [quaternion](/include/cutlass/quaternion.h) template classes in device code
* [Floating-point constants](/include/cutlass/constants.h)
* NVIDIA Ampere GPU Architecture examples and documentation:
* [Tensor Float 32](/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu) and
* [Sparse Tensor Cores](/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu)
* Documentation added on CUTLASS [efficient row-major epilogue](/media/docs/gemm_api.md#efficient-epilogue)
## [2.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.2.0) (2020-06-08)
* [NVIDIA Ampere Architecture features](https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/)
@ -101,27 +285,33 @@
## Copyright
Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
```
Redistribution and use in source and binary forms, with or without modification, are permitted
provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this list of
conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of
conditions and the following disclaimer in the documentation and/or other materials
provided with the distribution.
* Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
to endorse or promote products derived from this software without specific prior written
permission.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
```

CITATION.cff Normal file

@ -0,0 +1,82 @@
cff-version: 1.2.0
title: CUTLASS
message: >-
If you use this software, please cite using the
following metadata.
type: software
authors:
- given-names: Andrew
email: akerr@nvidia.com
family-names: Kerr
affiliation: NVIDIA
- given-names: Haicheng
family-names: Wu
affiliation: NVIDIA
email: haichengw@nvidia.com
- given-names: Manish
family-names: Gupta
affiliation: Google
email: manigupta@google.com
- given-names: Dustyn
family-names: Blasig
email: dblasig@nvidia.com
affiliation: NVIDIA
- given-names: Pradeep
family-names: Ramani
email: prramani@nvidia.com
affiliation: NVIDIA
- given-names: Duane
family-names: Merrill
email: dumerrill@nvidia.com
affiliation: NVIDIA
- given-names: Aniket
family-names: Shivam
email: ashivam@nvidia.com
affiliation: NVIDIA
- given-names: Piotr
family-names: Majcher
email: pmajcher@nvidia.com
affiliation: NVIDIA
- given-names: Paul
family-names: Springer
email: pspringer@nvidia.com
affiliation: NVIDIA
- given-names: Markus
family-names: Hohnerbach
affiliation: NVIDIA
email: mhohnerbach@nvidia.com
- given-names: Jin
family-names: Wang
email: jinw@nvidia.com
affiliation: NVIDIA
- given-names: Matt
family-names: Nicely
email: mnicely@nvidia.com
affiliation: NVIDIA
repository-code: 'https://github.com/NVIDIA/cutlass'
abstract: >-
CUTLASS is a collection of CUDA C++ template
abstractions for implementing high-performance
matrix-multiplication (GEMM) and related
computations at all levels and scales within CUDA.
It incorporates strategies for hierarchical
decomposition and data movement similar to those
used to implement cuBLAS and cuDNN. CUTLASS
decomposes these "moving parts" into reusable,
modular software components abstracted by C++
template classes. These thread-wide, warp-wide,
block-wide, and device-wide primitives can be
specialized and tuned via custom tiling sizes, data
types, and other algorithmic policy. The resulting
flexibility simplifies their use as building blocks
within custom kernels and applications.
keywords:
- 'cutlass, tensor cores, cuda'
license: BSD-3-Clause
license-url: https://github.com/NVIDIA/cutlass/blob/v2.11.0/LICENSE.txt
version: '2.11.0'
date-released: '2022-11-19'
identifiers:
- type: url
value: "https://github.com/NVIDIA/cutlass/tree/v2.11.0"
description: The GitHub release URL of tag 2.11.0


@ -1,23 +1,29 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmake_minimum_required(VERSION 3.12.4 FATAL_ERROR)
@ -31,24 +37,35 @@ else()
endif()
message(STATUS "CMake Version: ${CMAKE_VERSION}")
set(IMPLICIT_CMAKE_CXX_STANDARD OFF CACHE BOOL "Do not explicitly specify -std=c++11 if set")
project(CUTLASS VERSION 2.2.0 LANGUAGES CXX)
project(CUTLASS VERSION 2.11.0 LANGUAGES CXX)
include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake)
if (CUDA_VERSION VERSION_LESS 10.2)
message(WARNING "CUTLASS ${CUTLASS_VERSION} requires CUDA 10.2 or higher, and strongly recommends CUDA 11.0 or higher.")
elseif (CUDA_VERSION VERSION_LESS 11.0)
message(WARNING "CUTLASS ${CUTLASS_VERSION} support for CUDA ${CUDA_VERSION} is deprecated, please use CUDA 11.0 or higher.")
endif()
find_package(Doxygen QUIET)
#
# CUTLASS 2.x requires C++11
#
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
if (NOT IMPLICIT_CMAKE_CXX_STANDARD)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
endif()
if(CUTLASS_NATIVE_CUDA)
set(CMAKE_CUDA_STANDARD 11)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
else()
list(APPEND CUTLASS_CUDA_NVCC_FLAGS --std=c++11)
if (NOT IMPLICIT_CMAKE_CXX_STANDARD)
list(APPEND CUTLASS_CUDA_NVCC_FLAGS --std=c++11)
endif()
endif()
if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
@ -61,17 +78,24 @@ set(CUTLASS_ENABLE_HEADERS_ONLY OFF CACHE BOOL "Enable only the header library")
if(CUTLASS_ENABLE_HEADERS_ONLY)
set(CUTLASS_ENABLE_EXAMPLES_INIT OFF)
set(CUTLASS_ENABLE_TOOLS_INIT OFF)
set(CUTLASS_ENABLE_TOOLS_INIT ON)
set(CUTLASS_ENABLE_LIBRARY_INIT OFF)
else()
set(CUTLASS_ENABLE_EXAMPLES_INIT ON)
set(CUTLASS_ENABLE_TOOLS_INIT ON)
set(CUTLASS_ENABLE_LIBRARY_INIT ON)
endif()
set(CUTLASS_TEST_UNIT_ENABLE_WARNINGS OFF CACHE BOOL "Enable warnings on waived unit tests.")
set(CUTLASS_ENABLE_EXAMPLES ${CUTLASS_ENABLE_EXAMPLES_INIT} CACHE BOOL "Enable CUTLASS Examples")
set(CUTLASS_ENABLE_TOOLS ${CUTLASS_ENABLE_TOOLS_INIT} CACHE BOOL "Enable CUTLASS Tools")
set(CUTLASS_ENABLE_LIBRARY ${CUTLASS_ENABLE_LIBRARY_INIT} CACHE BOOL "Enable CUTLASS Library")
set(CUTLASS_ENABLE_PROFILER ${CUTLASS_ENABLE_LIBRARY} CACHE BOOL "Enable CUTLASS Profiler")
set(CUTLASS_ENABLE_PERFORMANCE ${CUTLASS_ENABLE_PROFILER} CACHE BOOL "Enable CUTLASS Performance")
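To illustrate how these cache options compose, here is a minimal sketch (not part of this file) of a consuming project presetting them before adding the CUTLASS source tree; the project name, directory layout, and chosen values are assumptions for illustration only.
```
# Illustrative sketch only: a superproject selecting a header-only CUTLASS build.
cmake_minimum_required(VERSION 3.12.4)
project(cutlass_consumer LANGUAGES CXX CUDA)  # assumes a CUDA toolchain is available

# Preset the cache entries defined above before pulling in CUTLASS.
set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

# Assumes the CUTLASS checkout lives in ./cutlass relative to this file.
add_subdirectory(cutlass)

# Downstream targets can then link the CUTLASS interface target, e.g.
# target_link_libraries(my_app PRIVATE CUTLASS)
```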
if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME})
set(CUTLASS_ENABLE_TESTS_INIT ${CUTLASS_ENABLE_TOOLS_INIT})
set(CUTLASS_ENABLE_TESTS_INIT ${CUTLASS_ENABLE_LIBRARY})
else()
set(CUTLASS_ENABLE_TESTS_INIT OFF)
endif()
@ -101,6 +125,12 @@ endif()
if (NOT CUDA_VERSION VERSION_LESS 11.0)
list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 80)
endif()
if (NOT CUDA_VERSION VERSION_LESS 11.1 AND NOT CUDA_COMPILER MATCHES "[Cc]lang")
list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 86)
endif()
if (NOT CUDA_VERSION VERSION_LESS 11.8 AND NOT CUDA_COMPILER MATCHES "[Cc]lang")
list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 90)
endif()
set(CUTLASS_NVCC_ARCHS ${CUTLASS_NVCC_ARCHS_SUPPORTED} CACHE STRING "The SM architectures requested.")
set(CUTLASS_NVCC_ARCHS_ENABLED ${CUTLASS_NVCC_ARCHS} CACHE STRING "The SM architectures to build code for.")
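As a sketch of how the architecture list above is consumed (not part of the original file), a build can be narrowed to a single SM version to reduce compile time; the value 80 is only an example.
```
# Illustrative only: restrict code generation to SM80 by overriding the cache
# entry defined above (equivalent to passing -DCUTLASS_NVCC_ARCHS=80 at configure time).
set(CUTLASS_NVCC_ARCHS 80 CACHE STRING "The SM architectures requested." FORCE)
```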
@ -109,10 +139,6 @@ if (POLICY CMP0076)
cmake_policy(SET CMP0076 NEW)
endif()
if( NOT CMAKE_SIZEOF_VOID_P EQUAL 8 )
message(FATAL_ERROR "CUTLASS requires a 64-bit compiler!")
endif()
include(GNUInstallDirs)
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs)
@ -132,7 +158,12 @@ if (NOT (CMAKE_BUILD_TYPE OR CONFIGURATION_TYPES))
endif()
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CUTLASS_LIBRARY_DEBUG_POSTFIX ".debug" CACHE STRING "Default postfix value for debug libraries")
if (DEFINED CMAKE_DEBUG_POSTFIX)
set(CUTLASS_LIBRARY_DEBUG_POSTFIX_INIT ${CMAKE_DEBUG_POSTFIX})
else()
set(CUTLASS_LIBRARY_DEBUG_POSTFIX_INIT .debug)
endif()
set(CUTLASS_LIBRARY_DEBUG_POSTFIX ${CUTLASS_LIBRARY_DEBUG_POSTFIX_INIT} CACHE STRING "Default postfix value for debug libraries")
if(WIN32)
# On Windows we link against the shared (DLL) runtime. Change gtest settings to match this.
@ -154,6 +185,11 @@ if (${CUTLASS_NVCC_VERBOSE})
list(APPEND CUTLASS_CUDA_NVCC_FLAGS -v)
endif()
#
# CUTLASS NAMESPACE
#
set(CUTLASS_NAMESPACE "cutlass" CACHE STRING "Top level namespace of CUTLASS")
set(CUTLASS_NVCC_EMBED_CUBIN ON CACHE BOOL "Embed compiled CUDA kernel binaries into executables.")
set(CUTLASS_NVCC_EMBED_PTX ON CACHE BOOL "Embed compiled PTX into executables.")
set(CUTLASS_NVCC_KEEP OFF CACHE BOOL "Keep intermediate files generated by NVCC.")
@ -164,12 +200,28 @@ set(CUTLASS_ENABLE_F16C OFF CACHE BOOL "Enable F16C x86 extensions in host code.
#
set(CUTLASS_LIBRARY_OPERATIONS "all" CACHE STRING "Comma delimited list of operation name filters. Default '' means all operations are enabled.")
set(CUTLASS_LIBRARY_KERNELS "" CACHE STRING "Comma delimited list of kernel name filters. If unspecified, only the largest tile size is enabled. If 'all' is specified, all kernels are enabled.")
set(CUTLASS_LIBRARY_IGNORE_KERNELS "" CACHE STRING "Comma delimited list of kernel names to exclude from build.")
# Test Levels L0, L1, L2
set(CUTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.")
set(CUTLASS_TEST_ENABLE_CACHED_RESULTS ON CACHE BOOL "Enable caching and reuse of test results in unit tests")
set_property(CACHE CUTLASS_TEST_LEVEL PROPERTY STRINGS 0 1 2)
list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_TEST_LEVEL=${CUTLASS_TEST_LEVEL})
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -DCUTLASS_TEST_LEVEL=${CUTLASS_TEST_LEVEL})
if (CUTLASS_TEST_ENABLE_CACHED_RESULTS)
message(STATUS "Enable caching of reference results in conv unit tests")
list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_TEST_ENABLE_CACHED_RESULTS=1)
endif()
set(CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED ON CACHE BOOL "Enable/Disable rigorous conv problem sizes in conv unit tests")
if (CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED)
message(STATUS "Enable rigorous conv problem sizes in conv unit tests")
list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED=1)
endif()
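The pattern above turns configure-time cache entries into preprocessor definitions on the CUDA compile line. A hedged sketch (not part of the file) of selecting the most exhaustive test level while keeping cached reference results; the values are examples only.
```
# Illustrative only: these overrides feed the -DCUTLASS_TEST_LEVEL=... and
# -DCUTLASS_TEST_ENABLE_CACHED_RESULTS=1 flags appended above.
set(CUTLASS_TEST_LEVEL 2 CACHE STRING "Level of tests to compile." FORCE)
set(CUTLASS_TEST_ENABLE_CACHED_RESULTS ON CACHE BOOL "Enable caching and reuse of test results in unit tests" FORCE)
```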
#
# CUDA 10.1 introduces "mma" in PTX performing collective matrix multiply operations.
@ -181,6 +233,10 @@ else()
set(CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT ON)
endif()
# Trace levels for debugging
set(CUTLASS_DEBUG_TRACE_LEVEL "0" CACHE STRING "Level of debug tracing to perform.")
list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_DEBUG_TRACE_LEVEL=${CUTLASS_DEBUG_TRACE_LEVEL})
set(CUTLASS_ENABLE_TENSOR_CORE_MMA ${CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT} CACHE BOOL
"Enable PTX mma instruction for collective matrix multiply operations.")
@ -219,7 +275,7 @@ if (NOT MSVC AND CUTLASS_NVCC_KEEP)
# MSVC flow handles caching already, but for other generators we handle it here.
set(CUTLASS_NVCC_KEEP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp CACHE PATH "Location to store NVCC scratch files")
file(MAKE_DIRECTORY ${CUTLASS_NVCC_KEEP_DIR})
list(APPEND CUTLASS_CUDA_NVCC_FLAGS --keep) # --keep-dir may not work with nvcc for some directories.
list(APPEND CUTLASS_CUDA_NVCC_FLAGS --keep -v) # --keep-dir may not work with nvcc for some directories.
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -save-temps=${CUTLASS_NVCC_KEEP_DIR})
endif()
@ -241,6 +297,17 @@ if (NOT CMAKE_BUILD_TYPE MATCHES "Release")
list(APPEND CUTLASS_CUDA_NVCC_FLAGS -lineinfo)
endif()
#Report CUDA build flags
if (CUDA_COMPILER MATCHES "[Cc]lang")
if(CUTLASS_CUDA_CLANG_FLAGS)
message(STATUS "Using CLANG flags: ${CUTLASS_CUDA_CLANG_FLAGS}")
endif()
else()
if(CUTLASS_CUDA_NVCC_FLAGS)
message(STATUS "Using NVCC flags: ${CUTLASS_CUDA_NVCC_FLAGS}")
endif()
endif()
if(CUDA_COMPILER MATCHES "[Cc]lang")
if( NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang" )
message(FATAL_ERROR "Clang CUDA compilation requires Clang CXX compilation. Currently CMAKE_CXX_COMPILER is ${CMAKE_CXX_COMPILER_ID}" )
@ -250,7 +317,14 @@ if(CUDA_COMPILER MATCHES "[Cc]lang")
message(FATAL_ERROR "Clang 7.0+ required for GPU compilation")
endif()
# There are numerous Clang versions that can work with each CUDA toolkit, and the
# checks are not very useful, so we are turning them off and using testing to
# ensure the various combinations work properly.
list(APPEND CUTLASS_CUDA_CLANG_FLAGS --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -D__NV_NO_HOST_COMPILER_CHECK=1)
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -Wno-unknown-cuda-version)
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -mllvm -pragma-unroll-threshold=100000)
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -mllvm -unroll-threshold=5000)
list(APPEND CUTLASS_CUDA_CLANG_FLAGS -Wno-unused-command-line-argument)
@ -269,55 +343,115 @@ if(CUDA_COMPILER MATCHES "[Cc]lang")
link_libraries(nvidia::cudart)
endif()
# Support for 128-bit integers if using NVIDIA C++ compiler
if (${CMAKE_CXX_COMPILER_ID} MATCHES "PGI" OR ${CMAKE_CXX_COMPILER_ID} MATCHES "NVHPC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Mint128 ")
endif()
if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
# CMake 3.18 added support for CUDA_ARCHITECTURES target property. We will use this
# property for CMake 3.18+, so we request the NEW behavior for correct compatibility.
# https://cmake.org/cmake/help/v3.18/policy/CMP0104.html#policy:CMP0104
cmake_policy(SET CMP0104 NEW)
endif()
function(cutlass_apply_cuda_gencode_flags TARGET)
set(options)
set(oneValueArgs)
set(multiValueArgs SM_ARCHS)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if (__SM_ARCHS)
set(ARCHS_ENABLED ${__SM_ARCHS})
else()
set(ARCHS_ENABLED ${CUTLASS_NVCC_ARCHS_ENABLED})
endif()
set(NVCC_FLAGS)
set(CLANG_FLAGS)
foreach(ARCH ${CUTLASS_NVCC_ARCHS_ENABLED})
set(__CMAKE_CUDA_ARCHS)
foreach(ARCH ${ARCHS_ENABLED})
list(APPEND CLANG_FLAGS --cuda-gpu-arch=sm_${ARCH})
set(CODES)
if(CUTLASS_NVCC_EMBED_CUBIN)
list(APPEND CODES sm_${ARCH})
list(APPEND __CMAKE_CUDA_ARCHS ${ARCH}-real)
endif()
if(CUTLASS_NVCC_EMBED_PTX)
list(APPEND CODES compute_${ARCH})
list(APPEND __CMAKE_CUDA_ARCHS ${ARCH}-virtual)
endif()
list(JOIN CODES "," CODES_STR)
list(APPEND NVCC_FLAGS -gencode=arch=compute_${ARCH},code=[${CODES_STR}])
endforeach()
if (CUDA_COMPILER MATCHES "[Cc]lang")
target_compile_options(
${TARGET}
PRIVATE
$<$<COMPILE_LANGUAGE:CXX>:${CLANG_FLAGS}>
)
if (NOT __SM_ARCHS)
if (CUDA_COMPILER MATCHES "[Cc]lang")
target_compile_options(
${TARGET}
PRIVATE
$<$<COMPILE_LANGUAGE:CXX>:${CLANG_FLAGS}>
)
elseif(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
set_property(TARGET ${TARGET} PROPERTY CUDA_ARCHITECTURES ${__CMAKE_CUDA_ARCHS})
else()
target_compile_options(
${TARGET}
PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:${NVCC_FLAGS}>
)
endif()
else()
target_compile_options(
${TARGET}
PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:${NVCC_FLAGS}>
)
list(JOIN CLANG_FLAGS " " CLANG_FLAGS_STR)
list(JOIN NVCC_FLAGS " " STR_NVCC_FLAGS)
if (CUDA_COMPILER MATCHES "[Cc]lang")
if(${TARGET} MATCHES ".*\.cpp")
set_source_files_properties(${TARGET} PROPERTIES COMPILE_FLAGS ${CLANG_FLAGS_STR})
endif()
elseif(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
set_source_files_properties(${TARGET} PROPERTIES CUDA_ARCHITECTURES ${STR_NVCC_FLAGS})
else()
if(${TARGET} MATCHES ".*\.cu")
set_source_files_properties(${TARGET} PROPERTIES COMPILE_FLAGS ${STR_NVCC_FLAGS})
endif()
endif()
endif()
endfunction()
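A usage sketch for the function above; the target name and source file are hypothetical, and `SM_ARCHS` is the optional keyword parsed by the function to override `CUTLASS_NVCC_ARCHS_ENABLED`.
```
# Illustrative only: apply gencode flags for an explicit subset of architectures
# to a hypothetical target built outside the main library machinery.
add_library(custom_kernels STATIC custom_kernels.cu)
cutlass_apply_cuda_gencode_flags(custom_kernels SM_ARCHS 80 86)
```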
# Cache the flags so they are available when the function below is called anywhere globally.
set(__CUTLASS_CUDA_FLAGS ${CUTLASS_CUDA_FLAGS} CACHE INTERNAL "")
set(__CUTLASS_CUDA_FLAGS_RELEASE ${CUTLASS_CUDA_FLAGS_RELEASE} CACHE INTERNAL "")
set(__CUTLASS_CUDA_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} CACHE INTERNAL "")
set(__CUTLASS_CUDA_FLAGS_DEBUG ${CUTLASS_CUDA_FLAGS_DEBUG} CACHE INTERNAL "")
set(__CUTLASS_CUDA_CLANG_FLAGS ${CUTLASS_CUDA_CLANG_FLAGS} CACHE INTERNAL "")
set(__CUTLASS_CUDA_CLANG_FLAGS_RELEASE ${CUTLASS_CUDA_CLANG_FLAGS_RELEASE} CACHE INTERNAL "")
set(__CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO} CACHE INTERNAL "")
set(__CUTLASS_CUDA_CLANG_FLAGS_DEBUG ${CUTLASS_CUDA_CLANG_FLAGS_DEBUG} CACHE INTERNAL "")
set(__CUTLASS_CUDA_NVCC_FLAGS ${CUTLASS_CUDA_NVCC_FLAGS} CACHE INTERNAL "")
set(__CUTLASS_CUDA_NVCC_FLAGS_RELEASE ${CUTLASS_CUDA_NVCC_FLAGS_RELEASE} CACHE INTERNAL "")
set(__CUTLASS_CUDA_NVCC_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_NVCC_FLAGS_RELWITHDEBINFO} CACHE INTERNAL "")
set(__CUTLASS_CUDA_NVCC_FLAGS_DEBUG ${CUTLASS_CUDA_NVCC_FLAGS_DEBUG} CACHE INTERNAL "")
function(cutlass_apply_standard_compile_options TARGET)
if(CUDA_COMPILER MATCHES "[Cc]lang")
set(CUDA_COMPILE_LANGUAGE CXX)
set(_FLAGS ${CUTLASS_CUDA_FLAGS} ${CUTLASS_CUDA_CLANG_FLAGS})
set(_FLAGS_RELEASE ${CUTLASS_CUDA_FLAGS_RELEASE} ${CUTLASS_CUDA_CLANG_FLAGS_RELEASE})
set(_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO})
set(_FLAGS_DEBUG ${CUTLASS_CUDA_FLAGS_DEBUG} ${CUTLASS_CUDA_CLANG_FLAGS_DEBUG})
set(_FLAGS ${__CUTLASS_CUDA_FLAGS} ${__CUTLASS_CUDA_CLANG_FLAGS})
set(_FLAGS_RELEASE ${__CUTLASS_CUDA_FLAGS_RELEASE} ${__CUTLASS_CUDA_CLANG_FLAGS_RELEASE})
set(_FLAGS_RELWITHDEBINFO ${__CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${__CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO})
set(_FLAGS_DEBUG ${__CUTLASS_CUDA_FLAGS_DEBUG} ${__CUTLASS_CUDA_CLANG_FLAGS_DEBUG})
else()
set(CUDA_COMPILE_LANGUAGE CUDA)
set(_FLAGS ${CUTLASS_CUDA_FLAGS} ${CUTLASS_CUDA_NVCC_FLAGS})
set(_FLAGS_RELEASE ${CUTLASS_CUDA_FLAGS_RELEASE} ${CUTLASS_CUDA_NVCC_FLAGS_RELEASE})
set(_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${CUTLASS_CUDA_NVCC_FLAGS_RELWITHDEBINFO})
set(_FLAGS_DEBUG ${CUTLASS_CUDA_FLAGS_DEBUG} ${CUTLASS_CUDA_NVCC_FLAGS_DEBUG})
set(_FLAGS ${__CUTLASS_CUDA_FLAGS} ${__CUTLASS_CUDA_NVCC_FLAGS})
set(_FLAGS_RELEASE ${__CUTLASS_CUDA_FLAGS_RELEASE} ${__CUTLASS_CUDA_NVCC_FLAGS_RELEASE})
set(_FLAGS_RELWITHDEBINFO ${__CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${__CUTLASS_CUDA_NVCC_FLAGS_RELWITHDEBINFO})
set(_FLAGS_DEBUG ${__CUTLASS_CUDA_FLAGS_DEBUG} ${__CUTLASS_CUDA_NVCC_FLAGS_DEBUG})
endif()
target_link_libraries(${TARGET} PRIVATE CUTLASS)
target_compile_options(
${TARGET}
PRIVATE
@ -352,7 +486,7 @@ set_target_properties(CUTLASS PROPERTIES EXPORT_NAME cutlass)
set(CUTLASS_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include CACHE PATH "CUTLASS Header Library")
set(CUTLASS_GENERATOR_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tools/library/)
set(CUTLASS_GENERATOR_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tools/library CACHE INTERNAL "Location of generator scripts")
# The following utility directory is needed even if the tools build is disabled, so it exists here.
set(CUTLASS_TOOLS_UTIL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tools/util/include CACHE INTERNAL "")
@ -361,6 +495,10 @@ include_directories(${CUTLASS_INCLUDE_DIR})
target_compile_features(CUTLASS INTERFACE cxx_std_11)
if (NOT CUTLASS_NAMESPACE STREQUAL "cutlass")
target_compile_definitions(CUTLASS INTERFACE CUTLASS_NAMESPACE=${CUTLASS_NAMESPACE})
endif()
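A sketch (not part of the file) of how the namespace override propagates: once the cache entry differs from the default, the `INTERFACE` definition above reaches every target that links `CUTLASS`. The replacement namespace below is hypothetical.
```
# Illustrative only: build CUTLASS under a custom top-level namespace, e.g. to
# avoid symbol clashes when two CUTLASS versions coexist in one binary.
set(CUTLASS_NAMESPACE "my_cutlass" CACHE STRING "Top level namespace of CUTLASS" FORCE)
# Consumers of the CUTLASS interface target are then compiled with
# -DCUTLASS_NAMESPACE=my_cutlass.
```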
if (NOT DEFINED CUTLASS_REVISION)
find_package(Git QUIET)
@ -448,38 +586,253 @@ endif()
################################################################################
include(CTest)
enable_testing()
if (NOT TARGET test_all)
add_custom_target(test_all)
endif()
set(CUTLASS_INSTALL_TESTS ON CACHE BOOL "Install test executables")
set(CUTLASS_TEST_EXECUTION_ENVIRONMENT "" CACHE BOOL "Environment in which to invoke unit test executables")
set(CMAKE_TEST_INSTALL_PREFIX test CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.")
set(CUTLASS_TEST_INSTALL_PREFIX ${CMAKE_TEST_INSTALL_PREFIX}/cutlass CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.")
set(CUTLASS_TEST_INSTALL_BINDIR ${CUTLASS_TEST_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR} CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.")
set(CUTLASS_TEST_INSTALL_LIBDIR ${CUTLASS_TEST_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR} CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.")
install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX})
install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_BINDIR})
install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_LIBDIR})
install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ctest)
################################################################################
set(CUTLASS_ENABLE_CUBLAS OFF CACHE BOOL "cuBLAS usage for tests")
set(CUTLASS_ENABLE_CUDNN OFF CACHE BOOL "cuDNN usage for tests")
include(${CMAKE_CURRENT_SOURCE_DIR}/cuBLAS.cmake)
if (CUTLASS_ENABLE_CUBLAS)
target_compile_definitions(CUTLASS INTERFACE CUTLASS_ENABLE_CUBLAS=1)
endif()
include(${CMAKE_CURRENT_SOURCE_DIR}/cuDNN.cmake)
if (CUTLASS_ENABLE_CUDNN)
target_compile_definitions(CUTLASS INTERFACE CUTLASS_ENABLE_CUDNN=1)
endif()
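When these options are ON, the `INTERFACE` definitions above reach every target linking the `CUTLASS` interface library, so reference checks can be guarded on them. A sketch follows; the target and source file are hypothetical.
```
# Illustrative only: a verification executable that sees CUTLASS_ENABLE_CUBLAS
# and CUTLASS_ENABLE_CUDNN as preprocessor definitions when the options are ON.
add_executable(my_verify_test my_verify_test.cu)
target_link_libraries(my_verify_test PRIVATE CUTLASS)
```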
################################################################################
if(CUTLASS_ENABLE_TOOLS)
set(CUTLASS_CTEST_TEMPLATE_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/CTestTestfile.config.cmake)
set(CUTLASS_CTEST_GENERATED_FILES "" CACHE INTERNAL "")
function(cutlass_add_executable_tests NAME TARGET)
#
# Generates test rules for `make test`, `make test_all`, and `ctest` invoked from either the
# <CMAKE_BINARY_DIR> or the <CMAKE_INSTALL_PREFIX>/<CUTLASS_TEST_INSTALL_PREFIX> after installation.
#
# NAME: The base name for the test. Can be run with `make <NAME>` or `ctest -R 'c<NAME>'`.
# TARGET: The target corresponding to the executable under test.
# DISABLE_EXECUTABLE_INSTALL_RULE: An option, if given, that disables creating an install rule for TARGET.
# DEPENDS: A list of targets or files on which this test is dependent.
# DEPENDEES: A list of targets which should depend on this test.
# TEST_COMMAND_OPTIONS: A list of variables (i.e. by reference params) which contain command line arguments
# to pass to the test executable. A unique test with suffix _0, _1, ... is generated for each set of
# options given. If this option is not used, a single test with no arguments is generated.
# RESULT_CACHE_FILE: A file to be installed alongside the test executable with pre-computed
# test results to speed up test runtime.
#
set(options DISABLE_EXECUTABLE_INSTALL_RULE)
set(oneValueArgs DISABLE_TESTS RESULT_CACHE_FILE)
set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if (NOT DEFINED __DISABLE_TESTS)
set(__DISABLE_TESTS OFF)
endif()
if (__RESULT_CACHE_FILE)
add_custom_command(
TARGET ${TARGET}
POST_BUILD
COMMAND ${CMAKE_COMMAND}
ARGS -E copy ${__RESULT_CACHE_FILE} "$<TARGET_FILE_DIR:${TARGET}>"
)
endif()
if (NOT __DISABLE_EXECUTABLE_INSTALL_RULE AND CUTLASS_INSTALL_TESTS)
# file(RELATIVE_PATH CMAKE_CURRENT_BINARY_RELATIVE_DIR ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
install(
TARGETS ${TARGET}
RUNTIME DESTINATION ${CUTLASS_TEST_INSTALL_BINDIR}
)
if (__RESULT_CACHE_FILE)
install(
FILES ${__RESULT_CACHE_FILE}
DESTINATION ${CUTLASS_TEST_INSTALL_BINDIR}/
)
endif()
endif()
if (NOT __TEST_COMMAND_OPTIONS)
set(__TEST_COMMAND_OPTIONS " ")
endif()
list(LENGTH __TEST_COMMAND_OPTIONS CMD_COUNT)
set(CMD_IDX 0)
if (CMD_COUNT GREATER 1)
add_custom_target(${NAME} DEPENDS ${TARGET} ${__DEPENDS})
foreach(DEPENDEE ${__DEPENDEES})
add_dependencies(${DEPENDEE} ${NAME})
endforeach()
endif()
foreach(CMD_OPTIONS ${__TEST_COMMAND_OPTIONS})
if (CMD_COUNT GREATER 1)
set(TEST_NAME ${NAME}_${CMD_IDX})
else()
set(TEST_NAME ${NAME})
endif()
# The following rigmarole is needed to deal with spaces and possible quotes in
# command line arguments. The options are passed "by reference" as the actual
# variable names holding the real options. We then expand these in a way that
# preserves any quotes. Note, they have to be in this order for it to work for
# all the use cases below.
set(CMD_OPTIONS ${${CMD_OPTIONS}})
list(JOIN CMD_OPTIONS " " TEST_COMMAND_OPTIONS)
separate_arguments(CMD_OPTIONS)
add_custom_target(
${TEST_NAME}
COMMAND
${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${TARGET}> ${CMD_OPTIONS}
DEPENDS
${TARGET}
)
if (CMD_COUNT GREATER 1)
add_dependencies(${NAME} ${TEST_NAME})
endif()
foreach(DEPENDEE ${__DEPENDEES})
add_dependencies(${DEPENDEE} ${TEST_NAME})
endforeach()
add_test(
NAME c${TEST_NAME}
COMMAND ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${TARGET}> ${CMD_OPTIONS}
)
set_tests_properties(c${TEST_NAME} PROPERTIES DISABLED ${__DISABLE_TESTS})
if (CUTLASS_INSTALL_TESTS)
# To run the tests from an install package with tests enabled, we need to generate test files
# that don't rely on the current directory structure in build.
set(TEST_NAME c${TEST_NAME})
set(TEST_EXE $<TARGET_FILE_NAME:${TARGET}>)
set(TEST_EXE_WORKING_DIRECTORY ./${CMAKE_INSTALL_BINDIR})
configure_file("${CUTLASS_CTEST_TEMPLATE_FILE}" "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" @ONLY)
file(GENERATE
OUTPUT "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.cmake"
INPUT "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.config.cmake"
)
install(
FILES "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.cmake"
DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ctest/
)
set(CUTLASS_CTEST_GENERATED_FILES ${CUTLASS_CTEST_GENERATED_FILES};ctest/CTestTestfile.${TEST_NAME}.cmake CACHE INTERNAL "")
endif()
math(EXPR CMD_IDX "${CMD_IDX} + 1")
endforeach()
endfunction()
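A usage sketch for `cutlass_add_executable_tests` based on the parameter description above; the target name and command-line options are hypothetical. Note that `TEST_COMMAND_OPTIONS` takes variable names, which the function expands itself.
```
# Illustrative only: register two parameterizations of a hypothetical test
# executable; this generates test targets test_example_0 and test_example_1.
set(SMOKE_ARGS --problem-count=8)
set(FULL_ARGS --problem-count=256)
cutlass_add_executable_tests(
  test_example example_test_exe
  DEPENDEES test_all
  TEST_COMMAND_OPTIONS SMOKE_ARGS FULL_ARGS
  )
```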
if (CUTLASS_ENABLE_TOOLS)
add_subdirectory(tools)
if (CUTLASS_ENABLE_PROFILER)
add_dependencies(test_all test_profiler)
endif()
endif()
if(CUTLASS_ENABLE_EXAMPLES)
if (CUTLASS_ENABLE_EXAMPLES)
add_subdirectory(examples)
add_dependencies(test_all test_examples)
endif()
if(CUTLASS_ENABLE_TESTS)
include(CTest)
enable_testing()
if (CUTLASS_ENABLE_TESTS)
add_subdirectory(test)
add_dependencies(test_all test_unit)
endif()
if (CUTLASS_INSTALL_TESTS)
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/cmake")
file(WRITE "${CMAKE_BINARY_DIR}/cmake/CTestTestfile.cmake" "# Generated File\n")
foreach(GENERATED_FILE ${CUTLASS_CTEST_GENERATED_FILES})
file(APPEND "${CMAKE_BINARY_DIR}/cmake/CTestTestfile.cmake" "include(${GENERATED_FILE})\n")
endforeach()
install(
FILES "${CMAKE_BINARY_DIR}/cmake/CTestTestfile.cmake"
DESTINATION "${CUTLASS_TEST_INSTALL_PREFIX}/"
)
endif()
#? install(
#? FILES ${CMAKE_BINARY_DIR}/CTestTestfile.cmake
#? DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/
#? )
#?
#? install(
#? DIRECTORY
#? ${CMAKE_BINARY_DIR}/tools
#? ${CMAKE_BINARY_DIR}/test
#? DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/
#? FILES_MATCHING PATTERN "CTestTestfile.cmake"
#? )
################################################################################
include(CMakePackageConfigHelpers)
write_basic_package_version_file(
${CMAKE_CURRENT_BINARY_DIR}/NvidiaCutlassConfigVersion.cmake
COMPATIBILITY AnyNewerVersion)
install(
FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/NvidiaCutlassConfig.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/
FILES
${CMAKE_CURRENT_SOURCE_DIR}/cmake/NvidiaCutlassConfig.cmake
${CMAKE_CURRENT_BINARY_DIR}/NvidiaCutlassConfigVersion.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/NvidiaCutlass/
)
install(
EXPORT NvidiaCutlass
NAMESPACE nvidia::cutlass::
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/NvidiaCutlass/
FILE NvidiaCutlassTargets.cmake
)


@ -7,51 +7,63 @@
This is the official list of CUTLASS developers and contributors.
## DEVELOPERS
Andrew Kerr
Haicheng Wu
Manish Gupta
Dustyn Blasig
Pradeep Ramani
Naila Farooqui
Piotr Majcher
Paul Springer
Jin Wang
Scott Yokim
Markus Hohnerbach
Aditya Atluri
David Tanner
Andrew Kerr
Haicheng Wu
Manish Gupta
Dustyn Blasig
Pradeep Ramani
Cris Cecka
Vijay Thakkar
Aniket Shivam
Honghao Lu
Ethan Yan
Zhaodong Chen
Jack Kosaian
Yujia Zhai
Naila Farooqui
Piotr Majcher
Paul Springer
Jin Wang
Chinmay Talegaonkar
Shang Zhang
Scott Yokim
Markus Hohnerbach
Aditya Atluri
David Tanner
Manikandan Ananth
## CUTLASS Product Manager
Matthew Nicely
## CONTRIBUTORS
Timothy Costa
Julien Demouth
Brian Fahs
Michael Goldfarb
Mostafa Hagog
Fei Hu
Alan Kaatz
Tina Li
Timmy Liu
Duane Merrill
Kevin Siu
Markus Tavenrath
John Tran
Vicki Wang
Junkai Wu
Fung Xie
Albert Xu
Jack Yang
Xiuxia Zhang
Nick Zhao
Timothy Costa
Julien Demouth
Brian Fahs
Michael Goldfarb
Mostafa Hagog
Fei Hu
Alan Kaatz
Tina Li
Timmy Liu
Duane Merrill
Kevin Siu
Markus Tavenrath
John Tran
Vicki Wang
Junkai Wu
Fung Xie
Albert Xu
Jack Yang
Xiuxia Zhang
Nick Zhao
## ACKNOWLEDGEMENTS
Girish Bharambe
Cris Cecka
Luke Durant
Olivier Giroux
Stephen Jones
Rishkul Kulkarni
Bryce Lelbach
Joel McCormack
Kyrylo Perelygin
Girish Bharambe
Luke Durant
Olivier Giroux
Stephen Jones
Rishkul Kulkarni
Bryce Lelbach
Joel McCormack
Kyrylo Perelygin


@ -1,23 +1,29 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
if(CUDA_COMPILER MATCHES "[Cc]lang")
@ -204,7 +210,7 @@ include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
# paths by default, so we add it explicitly here.
function(cutlass_correct_source_file_language_property)
if(CUDA_COMPILER MATCHES "clang")
if(CUDA_COMPILER MATCHES "[Cc]lang")
foreach(File ${ARGN})
if(File MATCHES ".*\.cu$")
set_source_files_properties(${File} PROPERTIES LANGUAGE CXX)
@ -213,7 +219,13 @@ function(cutlass_correct_source_file_language_property)
endif()
endfunction()
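A usage sketch for the helper above; the source files and target are hypothetical. It only has an effect when `CUDA_COMPILER` matches clang, in which case `.cu` files are handed to the C++ driver.
```
# Illustrative only: mark .cu sources so that a Clang-as-CUDA build compiles
# them through the CXX language rules, as implemented by the function above.
cutlass_correct_source_file_language_property(kernel_a.cu kernel_b.cu)
add_library(my_kernels STATIC kernel_a.cu kernel_b.cu)
```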
set(CUTLASS_UNITY_BUILD_ENABLED OFF CACHE BOOL "Enable combined source compilation")
if (MSVC OR CUTLASS_LIBRARY_KERNELS MATCHES "all")
set(CUTLASS_UNITY_BUILD_ENABLED_INIT ON)
else()
set(CUTLASS_UNITY_BUILD_ENABLED_INIT OFF)
endif()
set(CUTLASS_UNITY_BUILD_ENABLED ${CUTLASS_UNITY_BUILD_ENABLED_INIT} CACHE BOOL "Enable combined source compilation")
set(CUTLASS_UNITY_BUILD_BATCH_SIZE 16 CACHE STRING "Batch size for unified source files")
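A sketch (not part of the file) of overriding the unity-build default chosen above; the batch size of 8 is an example only.
```
# Illustrative only: force-enable batched (unity) compilation with a smaller
# batch size than the default of 16 defined above.
set(CUTLASS_UNITY_BUILD_ENABLED ON CACHE BOOL "Enable combined source compilation" FORCE)
set(CUTLASS_UNITY_BUILD_BATCH_SIZE 8 CACHE STRING "Batch size for unified source files" FORCE)
```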
function(cutlass_unify_source_files TARGET_ARGS_VAR)
@ -273,10 +285,9 @@ function(cutlass_unify_source_files TARGET_ARGS_VAR)
set(${TARGET_ARGS_VAR} ${TARGET_SOURCE_ARGS} PARENT_SCOPE)
endfunction()
function(cutlass_add_library NAME)
set(options)
set(options SKIP_GENCODE_FLAGS)
set(oneValueArgs EXPORT_NAME)
set(multiValueArgs)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@ -292,7 +303,9 @@ function(cutlass_add_library NAME)
endif()
cutlass_apply_standard_compile_options(${NAME})
if (NOT __SKIP_GENCODE_FLAGS)
cutlass_apply_cuda_gencode_flags(${NAME})
endif()
target_compile_features(
${NAME}


@ -1,23 +1,27 @@
Copyright (c) 2017 - 2020, NVIDIA CORPORATION. All rights reserved.
Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the NVIDIA CORPORATION nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

PUBLICATIONS.md Normal file

@ -0,0 +1,30 @@
# Publications Using Cutlass
## 2023
- ["Stream-K: Work-centric Parallel Decomposition for Dense Matrix-Matrix Multiplication on the GPU"](https://arxiv.org/abs/2301.03598). Muhammad Osama, Duane Merrill, Cris Cecka, Michael Garland, John D. Owens. _arXiv_, January 2023.
## 2022
- ["GPU Load Balancing"](https://arxiv.org/abs/2212.08964). Muhammad Osama. _Doctoral dissertation, University of California, Davis_, December 2022.
- ["Bolt: Bridging the Gap between Auto-tuners and Hardware-native Performance"](https://arxiv.org/abs/2110.15238). Jiarong Xing, Leyuan Wang, Shang Zhang, Jack Chen, Ang Chen, Yibo Zhu. _Proceedings of the 5th MLSys Conference_, August 2022.
- ["Recovering single precision accuracy from Tensor Cores while surpassing the FP32 theoretical peak performance"](https://arxiv.org/abs/2203.03341). Hiroyuki Ootomo, Rio Yokota. _International Journal of High Performance Computing_, March 2022.
- ["Breaking the Computation and Communication Abstraction Barrier in Distributed Machine Learning Workloads"](https://arxiv.org/abs/2105.05720). Abhinav Jangda, Jun Huang, Guodong Liu, Amir Hossein Nodehi Sabet, Saeed Maleki, Youshan Miao, Madanlal Musuvathi, Todd Mytkowicz, Olli Sarikivi. _Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems_, February 2022.
## 2021
- ["Arithmetic-intensity-guided fault tolerance for neural network inference on GPUs"](https://dl.acm.org/doi/abs/10.1145/3458817.3476184). Jack Kosaian, K. V. Rashmi. _Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis_, November 2021.
- ["Real-time Neural Radiance Caching for Path Tracing"](https://d1qx31qr3h6wln.cloudfront.net/publications/paper_4.pdf). Thomas Muller, Fabrice Rousselle, Jan Novak, Alex Keller. _ACM Trans. Graph._, August 2021.
## 2020
- ["Scalable Knowledge Graph Analytics at 136 Petaflop/s"](https://www.computer.org/csdl/proceedings-article/sc/2020/999800a061/1oeORDgCM0g). Ramakrishnan Kannan, Piyush Sao, Hao Lu, Drahomira Herrmannova, Vijay Thakkar, Robert Patton, Richard Vuduc, Thomas Potok. _Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis_, November 2020.
- ["Accelerating Sparse DNN Models without Hardware-Support via Tile-Wise Sparsity
"](https://arxiv.org/abs/2008.13006). Cong Guo, Bo Yang Hsueh, Jingwen Leng, Yuxian Qiu, Yue Guan, Zehuan Wang, Xiaoying Jia, Xipeng Li, Minyi Guo, Yuhao Zhu. _Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis_, November 2020.
- ["Strassen's Algorithm Reloaded on GPUs"](https://dl.acm.org/doi/10.1145/3372419). Jianyu Huang, Chenhan D. Yu, Robert A. van de Geijn. _ACM Transactions on Mathematical Software_, March 2020.

README.md

@ -1,15 +1,15 @@
![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")
# CUTLASS 2.2
# CUTLASS 2.11
_CUTLASS 2.2 - June 2020_
_CUTLASS 2.11 - November 2022_
CUTLASS is a collection of CUDA C++ template abstractions for implementing
high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA.
It incorporates strategies for hierarchical decomposition and data movement similar
to those used to implement cuBLAS. CUTLASS decomposes these "moving parts" into
reusable, modular software components abstracted by C++ template classes. These
thread-wide, warp-wide, block-wide, and device-wide primitives can be specialized
high-performance matrix-multiplication (GEMM) and related computations at all levels
and scales within CUDA. It incorporates strategies for hierarchical decomposition and
data movement similar to those used to implement cuBLAS and cuDNN. CUTLASS decomposes
these "moving parts" into reusable, modular software components abstracted by C++ template
classes. These thread-wide, warp-wide, block-wide, and device-wide primitives can be specialized
and tuned via custom tiling sizes, data types, and other algorithmic policy. The
resulting flexibility simplifies their use as building blocks within custom kernels
and applications.
@ -18,97 +18,115 @@ To support a wide variety of applications, CUTLASS provides extensive support fo
mixed-precision computations, providing specialized data-movement and
multiply-accumulate abstractions for half-precision floating
point (FP16), BFloat16 (BF16), Tensor Float 32 (TF32),
single-precision floating point (FP32), double-precision floating
single-precision floating point (FP32),
[FP32 emulation via tensor core instruction](/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm),
double-precision floating
point (FP64) types, integer data types (4b and 8b), and binary data types (1b).
Furthermore, CUTLASS demonstrates warp-synchronous matrix multiply operations
CUTLASS demonstrates warp-synchronous matrix multiply operations
targeting the programmable, high-throughput _Tensor Cores_ implemented by
NVIDIA's Volta, Turing, and Ampere architectures.
CUTLASS implements high-performance Convolution via the implicit GEMM algorithm.
Implicit GEMM is the formulation of a convolution operation as a GEMM, thereby taking advantage of
CUTLASS's modular GEMM pipeline.
This allows CUTLASS to build convolutions by reusing highly optimized GEMM components at the warp level and below.
See the [Quick Start Guide](/media/docs/quickstart.md) to get started quickly.
See the [functionality listing](media/docs/functionality.md) for the list of operations
See the [functionality listing](/media/docs/functionality.md) for the list of operations
supported at each level of the execution model hierarchy.
# What's New in CUTLASS 2.2
# What's New in CUTLASS 2.11
CUTLASS 2.2 is a significant update to CUTLASS adding:
CUTLASS 2.11 is an update to CUTLASS adding:
- [Stream-K](/examples/47_ampere_gemm_universal_streamk), which is a new general way to do split-K. It can not only improve performance, but can also significantly reduce the number of tile sizes that need to be profiled to find the best one.
- [Fused multi-head attention kernel](/examples/41_fused_multi_head_attention). It has two variants: one for fixed sequence lengths, and another for variable sequence lengths.
- [Dual GEMM](/examples/45_dual_gemm). It can run two GEMMs that share the same left input matrix in one kernel.
- Hopper improves [double-precision matrix multiplication](/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm90.cu) by 2x compared to Ampere at iso-clocks. It is supported since CUDA 11.8.
- [BLAS3](/test/unit/gemm/device/hemm_cf64_cf64_cf64_tensor_op_f64_sm90.cu) functions with Hopper's new double-precision matrix multiplication instructions.
- [ELL Block Sparse GEMM](/examples/43_ell_block_sparse_gemm).
- [Optimized Group Conv](/examples/42_ampere_tensorop_group_conv).
- [Optimized DepthWise Conv](/examples/46_depthwise_simt_conv2dfprop).
- [Scripts](/examples/44_multi_gemm_ir_and_codegen) to fuse multiple back-to-back GEMMs.
- [FP8 data type definition](/include/cutlass/float8.h) and [conversion routines](/include/cutlass/numeric_conversion.h#L1274-2115).
- Updates and bugfixes from the community (thanks!). Big shout out to Meta's [xFormers](https://github.com/facebookresearch/xformers).
- **Deprecation announcement:** CUTLASS plans to deprecate the following in the next major release:
- Maxwell and Pascal GPU architectures
- Ubuntu 16.04
- CUDA 10.2
- C++ 11
- **Future requirement announcement:** CUTLASS plans to add the following requirements in the next major release:
- Minimum C++ standard - C++17
- Coverage of [NVIDIA Ampere Architecture features](https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/)
- Tensor Core-accelerated GEMMs targeting Tensor Float 32, BFloat16, and double-precision data types
- Deep software pipelines using asynchronous copy
- Described in [GTC 2020 Webinar (SR 21745)](https://developer.nvidia.com/gtc/2020/video/s21745)
- Intended to be compiled with [CUDA 11 Toolkit](https://developer.nvidia.com/cuda-toolkit)
# What's New in CUTLASS 2.1
CUTLASS 2.1 is a minor update to CUTLASS 2.0 adding:
- [Planar complex GEMM kernels](/examples/10_planar_complex/planar_complex.cu) targeting Volta and Turing Tensor Cores
- BLAS-style API to launch kernels compiled into the [CUTLASS Library](/media/docs/quickstart.md#cutlass-library)
# What's New in CUTLASS 2.0
CUTLASS 2.0 is a substantial refactoring from the previous version, intended to offer:
- Better performance over 1.x, particularly for kernels targeting Turing Tensor Cores
- Robust and durable templates that reliably span the design space
- Encapsulated functionality that may be reusable in other contexts
**See the [CHANGELOG](CHANGELOG.md) for a detailed listing of releases and updates.**
# Performance
<p align="center"><img src=/media/images/cutlass-2.8-gemm-performance.png></p>
CUTLASS primitives are very efficient. When used to construct device-wide GEMM kernels,
they exhibit performance comparable to cuBLAS for scalar GEMM
computations. The above figure shows CUTLASS performance relative to cuBLAS
for large matrix dimensions on an [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/),
an [NVIDIA A2](https://www.nvidia.com/en-us/data-center/products/a2/),
an [NVIDIA TitanV](https://www.nvidia.com/en-us/titan/titan-v/),
and an [NVIDIA GeForce 2080 Ti](https://www.nvidia.com/en-us/geforce/graphics-cards/rtx-2080-ti/)
compiled with the [CUDA 11.5 Toolkit](https://developer.nvidia.com/cuda-downloads). Tensor Core operations are implemented using CUDA's
[mma instruction](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma).
<p align="center"><img src=/media/images/cutlass-2.9-implicit-gemm-performance.png></p>
When using CUTLASS building blocks to construct device-wide implicit GEMM (Fprop, Dgrad, and Wgrad)
kernels, CUTLASS performance is also comparable to cuDNN when running ResNet-50 layers on an [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/)
as shown in the above figure. Tensor Core operations are still implemented using CUDA's
[mma instruction](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma).
# Compatibility
CUTLASS requires a C++11 host compiler and performs best when built with the [**CUDA 11.8 Toolkit**](https://developer.nvidia.com/cuda-toolkit).
It is also compatible with CUDA 11.x.
## Operating Systems
We have tested the following environments.
|**Operating System** | **Compiler** |
|-----------------|----------|
| Windows 10 | Microsoft Visual Studio 2017|
| | Microsoft Visual Studio 2019|
| Ubuntu 18.04 | GCC 7.5.0 |
| Ubuntu 20.04 | GCC 10.3.0 |
| Ubuntu 22.04 | GCC 11.2.0 |
Additionally, CUTLASS may be built with clang.
See [these instructions](media/docs/quickstart.md#clang) for more details.
## Hardware
CUTLASS runs successfully on the following NVIDIA GPUs, and it is expected to be efficient on
any Volta-, Turing-, or NVIDIA Ampere- architecture NVIDIA GPU.
|**GPU**|**CUDA Compute Capability**|**Minimum CUDA Toolkit**|**Minimum CUDA Toolkit Enabling Native Tensor Cores**|
|---|---|---|---|
|NVIDIA Tesla P100|6.0|9.2| |
|NVIDIA GeForce 1080|6.1|9.2| |
|NVIDIA TitanXP|6.1|9.2| |
|NVIDIA Tesla V100|7.0|9.2|10.1|
|NVIDIA TitanV|7.0|9.2|10.1|
|NVIDIA GeForce RTX 2080 Ti, 2080, 2070|7.5|10.0|10.2|
|NVIDIA Tesla T4|7.5|10.0|10.2|
|NVIDIA A100|8.0|11.0|11.0|
|NVIDIA A10 |8.6|11.1|11.1|
|NVIDIA GeForce 3090|8.6|11.1|11.1|
|NVIDIA H100 PCIe|9.0|11.8|Double-precision: 11.8; Mixed precision: 12.0|
# Documentation
CUTLASS is described in the following documents and the accompanying
[Doxygen documentation](https://nvidia.github.io/cutlass).
- [Quick Start Guide](/media/docs/quickstart.md) - build and run CUTLASS
- [Functionality](/media/docs/functionality.md) - summarizes functionality available in CUTLASS
- [Efficient GEMM in CUDA](media/docs/efficient_gemm.md) - describes how GEMM kernels may be implemented efficiently in CUDA
- [GEMM API](media/docs/gemm_api.md) - describes the CUTLASS GEMM model and C++ template concepts
- [Implicit GEMM Convolution](media/docs/implicit_gemm_convolution.md) - describes 2-D and 3-D convolution in CUTLASS
- [Code Organization](media/docs/code_organization.md) - describes the organization and contents of the CUTLASS project
- [Terminology](media/docs/terminology.md) - describes terms used in the code
- [Programming Guidelines](media/docs/programming_guidelines.md) - guidelines for writing efficient modern CUDA C++
- [CUTLASS Profiler](media/docs/profiler.md) - command-line driven profiling application
- [CUTLASS Utilities](media/docs/utilities.md) - additional templates used to facilitate rapid development
# Resources
- [CUTLASS: Software Primitives for Dense Linear Algebra at All Levels and Scales within CUDA](https://www.nvidia.com/en-us/on-demand/session/gtcsiliconvalley2018-s8854/)
- [Developing CUDA Kernels to Push Tensor Cores to the Absolute Limit on NVIDIA A100](https://www.nvidia.com/en-us/on-demand/session/gtcsj20-s21745/)
- [Accelerating Convolution with Tensor Cores in CUTLASS](https://www.nvidia.com/en-us/on-demand/session/gtcspring21-s31883/)
- [Accelerating Backward Data Gradient by Increasing Tensor Core Utilization in CUTLASS](https://www.nvidia.com/en-us/on-demand/session/gtcspring22-s41996/)
- [CUTLASS: Python API, Enhancements, and NVIDIA Hopper](https://www.nvidia.com/en-us/on-demand/session/gtcfall22-a41131/)
# Building CUTLASS
CUTLASS is a header-only template library and does not need to be built to be used by other
projects. CUTLASS unit tests, examples, and utilities can be built with CMake.
Make sure the `CUDACXX` environment variable points to NVCC in the CUDA Toolkit installed
on your system.
```bash
$ export CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
```
Create a build directory within the CUTLASS project, then run CMake. By default CUTLASS will build kernels
for CUDA architecture versions 5.0, 6.0, 6.1, 7.0, 7.5, 8.0, and 8.6. To reduce compile time you can specify
the architectures to build CUTLASS for by changing the CMake configuration setting
`CUTLASS_NVCC_ARCHS`.
```bash
$ mkdir build && cd build
$ cmake .. -DCUTLASS_NVCC_ARCHS=80 # compiles for NVIDIA's Ampere Architecture
```
From the `build/` directory, compile and run the CUTLASS unit tests by building the target `test_unit` with make.
The unit tests are organized as several binaries mirroring the top-level namespaces of CUTLASS,
and they may be executed in parallel via make's `-j` command line argument.
```bash
$ make test_unit -j
...
```
```
include/                     # client applications should target this directory
arch/ # direct exposure of architecture features (including instruction-level GEMMs)
conv/ # code specialized for convolution
epilogue/ # code specialized for the epilogue of gemm/convolution
gemm/ # code specialized for general matrix product computations
layout/ # layout definitions for matrices, tensors, and other mathematical objects in memory
platform/ # CUDA-capable Standard Library components
reduction/ # bandwidth-limited reduction kernels that do not fit the "gemm" model
thread/ # simt code that can be performed within a CUDA thread
transform/ # code specialized for layout, type, and domain transformations
```
[CUTLASS SDK examples](/examples) apply CUTLASS templates to implement basic computations.
```
examples/
00_basic_gemm/ # launches a basic GEMM with single precision inputs and outputs
01_cutlass_utilities/ # demonstrates CUTLASS Utilities for allocating and initializing tensors
02_dump_reg_smem/ # debugging utilities for printing register and shared memory contents
03_visualize_layout/ # utility for visualizing all layout functions in CUTLASS
04_tile_iterator/ # example demonstrating an iterator over tiles in memory
05_batched_gemm/ # example demonstrating CUTLASS's batched strided GEMM operation
06_splitK_gemm/ # example demonstrating CUTLASS's Split-K parallel reduction kernel
07_volta_tensorop_gemm/ # example demonstrating mixed precision GEMM using Volta Tensor Cores
08_turing_tensorop_gemm/ # example demonstrating integer GEMM using Turing Tensor Cores
10_planar_complex/ # example demonstrating planar complex GEMM kernels
11_planar_complex_array/ # example demonstrating planar complex kernels with batch-specific problem sizes
12_gemm_bias_relu/ # example demonstrating GEMM fused with bias and relu
13_fused_two_gemms/ # example demonstrating two GEMMs fused in one kernel
```
### Tools
```
tools/
library/ # CUTLASS Instance Library - contains instantiations of all supported CUTLASS templates
```
Instructions for building and running the Unit tests are described in the [Quick Start Guide](media/docs/quickstart.md).
The `tools/profiler/` directory contains a command-line utility for launching each of the GEMM kernels.
It can be built as follows:
```bash
$ make cutlass_profiler -j16
```
## Building all GEMM and Convolution kernels (_long_ build times)
By default, only one tile size is instantiated for each data type, math instruction, and layout.
To instantiate all, set the following environment variable when running CMake from an empty `build/` directory.
Beware, this results in *thousands* of kernels and long build times.
```bash
$ cmake .. -DCUTLASS_NVCC_ARCHS=75 -DCUTLASS_LIBRARY_KERNELS=all
...
$ make cutlass_profiler -j16
```
## Building a subset of GEMM and Convolution kernels (_reduced_ build times)
To compile strictly one kernel or a small set of kernels, a comma-delimited list of kernel names with
wildcard characters may be used to reduce the set of kernels. The following examples show building exactly one
or a subset of kernels for the NVIDIA Ampere and Turing architectures:
### Building a subset of Tensor Core GEMM kernels
To compile a subset of Tensor Core GEMM kernels with FP32 accumulation and FP16 input targeting NVIDIA Ampere and Turing architectures,
use the below cmake command line:
```bash
$ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*gemm_f16_*_nt_align8
...
$ make cutlass_profiler -j16
```
Example command line for profiling a subset of Tensor Core GEMM kernels is as follows:
```bash
./tools/profiler/cutlass_profiler --kernels=cutlass_tensorop_s*gemm_f16_*_nt_align8 --m=3456 --n=4096 --k=4096
...
=============================
Problem ID: 1
Provider: CUTLASS
OperationKind: gemm
Operation: cutlass_tensorop_s1688gemm_f16_256x128_32x2_nt_align8
Status: Success
Verification: ON
Disposition: Passed
reference_device: Passed
cuBLAS: Passed
Arguments: --gemm_kind=universal --m=3456 --n=4096 --k=4096 --A=f16:column --B=f16:row --C=f32:column --alpha=1 \
--beta=0 --split_k_slices=1 --batch_count=1 --op_class=tensorop --accum=f32 --cta_m=256 --cta_n=128 \
--cta_k=32 --stages=2 --warps_m=4 --warps_n=2 --warps_k=1 --inst_m=16 --inst_n=8 --inst_k=8 --min_cc=75 \
--max_cc=1024
Bytes: 118489088 bytes
FLOPs: 115992428544 flops
Runtime: 1.55948 ms
Memory: 70.7616 GiB/s
Math: 74378.8 GFLOP/s
=============================
...
```
### Building one CUDA Core GEMM kernel
To compile one SGEMM kernel targeting NVIDIA Ampere and Turing architectures, use the below cmake command line:
```bash
$ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sgemm_128x128_8x2_nn_align1
...
$ make cutlass_profiler -j16
```
Example command line for profiling a single SGEMM CUDA kernel is as follows:
```bash
$ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096
=============================
...
Memory: 24.934 GiB/s
Math: 17218.4 GFLOP/s
=============================
```
### Building a subset of Tensor Core Convolution kernels
To compile a subset of Tensor Core convolution kernels implementing forward propagation (fprop) with FP32 accumulation
and FP16 input targeting NVIDIA Ampere and Turing architectures, use the below cmake command line:
```bash
$ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*fprop_optimized_f16
...
$ make cutlass_profiler -j16
```
Example command line for profiling a subset of Tensor Core convolution kernels is as follows:
```bash
$ ./tools/profiler/cutlass_profiler --kernels=cutlass_tensorop_s*fprop_optimized_f16 --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3
...
=============================
Problem ID: 1
Provider: CUTLASS
OperationKind: conv2d
Operation: cutlass_tensorop_s16816fprop_optimized_f16_128x128_32x5_nhwc
Status: Success
Verification: ON
Disposition: Passed
reference_device: Passed
Arguments: --conv_kind=fprop --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 --p=224 --q=224 --pad_h=1 --pad_w=1 \
--stride_h=1 --stride_w=1 --dilation_h=1 --dilation_w=1 --Activation=f16:nhwc --Filter=f16:nhwc --Output=f32:nhwc \
--conv_mode=cross --iterator_algorithm=optimized --alpha=1 --beta=0 --split_k_mode=serial --split_k_slices=1 \
--eq_gemm_provider=none --op_class=tensorop --accum=f32 --cta_m=128 --cta_n=128 --cta_k=32 --stages=5 \
--warps_m=2 --warps_n=2 --warps_k=1 --inst_m=16 --inst_n=8 --inst_k=16 --min_cc=80 --max_cc=1024
Bytes: 1130659840 bytes
FLOPs: 118482796544 flops
Runtime: 0.711496 ms
Memory: 1479.99 GiB/s
Math: 166526 GFLOP/s
=============================
...
```
### Building one Convolution CUDA kernel
To compile and run one CUDA Core convolution kernel implementing forward propagation (fprop) with FP32 accumulation
and FP32 input targeting NVIDIA Ampere and Turing architectures, use the below cmake command line:
```bash
$ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc
...
$ make cutlass_profiler -j16
```
Example command line for profiling one CUDA Core convolution kernel:
```bash
$ ./tools/profiler/cutlass_profiler --kernels=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3
=============================
Problem ID: 1
Provider: CUTLASS
OperationKind: conv2d
Operation: cutlass_simt_sfprop_optimized_128x128_8x2_nhwc
Status: Success
Verification: ON
Disposition: Passed
reference_device: Passed
Arguments: --conv_kind=fprop --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 --p=224 --q=224 --pad_h=1 --pad_w=1 \
--stride_h=1 --stride_w=1 --dilation_h=1 --dilation_w=1 --Activation=f32:nhwc --Filter=f32:nhwc --Output=f32:nhwc \
--conv_mode=cross --iterator_algorithm=optimized --alpha=1 --beta=0 --split_k_mode=serial --split_k_slices=1 \
--eq_gemm_provider=none --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 --stages=2 --warps_m=4 \
--warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 --max_cc=1024
Bytes: 2055798784 bytes
FLOPs: 118482796544 flops
Runtime: 7.34266 ms
Memory: 260.752 GiB/s
Math: 16136.2 GFLOP/s
=============================
```
## More Details on Compiling CUTLASS Kernels and CUTLASS Profiler
- Please follow the links for more CMake examples on selectively compiling CUTLASS kernels:
- [GEMM CMake Examples](media/docs/quickstart.md#gemm-cmake-examples)
- [Implicit GEMM convolution CMake Examples](media/docs/quickstart.md#convolution-cmake-examples)
- [Further details about the CUTLASS Profiler are described here.](media/docs/profiler.md)
# About
The official list of CUTLASS developers and contributors is available here: [CONTRIBUTORS](CONTRIBUTORS.md).
# Copyright
Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
```
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
```

View File

@ -0,0 +1,21 @@
# Generated file
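# The test execution environment (for example, an emulator or wrapper command) may be
# overridden at test time through the CUTLASS_TEST_EXECUTION_ENVIRONMENT environment
# variable; otherwise the value configured by CMake below is used.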
if (DEFINED ENV{CUTLASS_TEST_EXECUTION_ENVIRONMENT})
set(_CUTLASS_TEST_EXECUTION_ENVIRONMENT $ENV{CUTLASS_TEST_EXECUTION_ENVIRONMENT})
else()
set(_CUTLASS_TEST_EXECUTION_ENVIRONMENT @CUTLASS_TEST_EXECUTION_ENVIRONMENT@)
endif()
if (NOT "@TEST_EXE_DIR@" STREQUAL "")
set(TEST_EXE_PATH @TEST_EXE_DIR@/@TEST_EXE@)
else()
set(TEST_EXE_PATH @TEST_EXE@)
endif()
add_test("@TEST_NAME@" ${_CUTLASS_TEST_EXECUTION_ENVIRONMENT} "${TEST_EXE_PATH}" @TEST_COMMAND_OPTIONS@)
if (NOT "@TEST_EXE_WORKING_DIRECTORY@" STREQUAL "")
set_tests_properties("@TEST_NAME@" PROPERTIES WORKING_DIRECTORY "@TEST_EXE_WORKING_DIRECTORY@")
endif()
set_tests_properties(@TEST_NAME@ PROPERTIES DISABLED @__DISABLE_TESTS@)

View File

@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/

View File

@ -1,3 +1,30 @@
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
message(STATUS "Configuring cublas ...")

cuDNN.cmake Normal file
View File

@ -0,0 +1,112 @@
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
if(DEFINED CUDNN_ENABLED)
set(CUTLASS_ENABLE_CUDNN ${CUDNN_ENABLED} CACHE BOOL "Enable CUTLASS to build with cuDNN library.")
endif()
if(DEFINED CUTLASS_ENABLE_CUDNN AND NOT CUTLASS_ENABLE_CUDNN)
return()
endif()
message(STATUS "Configuring cuDNN ...")
find_path(
_CUDNN_INCLUDE_DIR cudnn.h
PATHS
${CUDA_TOOLKIT_ROOT_DIR}/include
$ENV{CUDNN_PATH}/include
$ENV{CUDA_PATH}/include
${CUDNN_PATH}/include
/usr/include)
find_library(
_CUDNN_LIBRARY cudnn
HINTS
${CUDA_TOOLKIT_ROOT_DIR}/lib64
${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
${CUDA_TOOLKIT_ROOT_DIR}/lib
$ENV{CUDNN_PATH}/lib64
$ENV{CUDNN_PATH}/lib/x64
$ENV{CUDNN_PATH}/lib
$ENV{CUDA_PATH}/lib64
$ENV{CUDA_PATH}/lib/x64
$ENV{CUDA_PATH}/lib
${CUDNN_PATH}/lib64
${CUDNN_PATH}/lib/x64
${CUDNN_PATH}/lib
/usr/lib/x86_64-linux-gnu
/usr/lib)
if(_CUDNN_INCLUDE_DIR AND _CUDNN_LIBRARY)
message(STATUS "cuDNN: ${_CUDNN_LIBRARY}")
message(STATUS "cuDNN: ${_CUDNN_INCLUDE_DIR}")
set(CUDNN_FOUND ON CACHE INTERNAL "cuDNN Library Found")
else()
message(STATUS "cuDNN not found.")
set(CUDNN_FOUND OFF CACHE INTERNAL "cuDNN Library Found")
endif()
set(CUTLASS_ENABLE_CUDNN ${CUDNN_FOUND} CACHE BOOL "Enable CUTLASS to build with cuDNN library.")
if (CUTLASS_ENABLE_CUDNN AND NOT TARGET cudnn)
set(CUDNN_INCLUDE_DIR ${_CUDNN_INCLUDE_DIR})
set(CUDNN_LIBRARY ${_CUDNN_LIBRARY})
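# Import cuDNN as a STATIC library target on Windows and as a SHARED library target elsewhere.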
if(WIN32)
add_library(cudnn STATIC IMPORTED GLOBAL)
else()
add_library(cudnn SHARED IMPORTED GLOBAL)
endif()
add_library(nvidia::cudnn ALIAS cudnn)
set_property(
TARGET cudnn
PROPERTY IMPORTED_LOCATION
${CUDNN_LIBRARY})
target_include_directories(
cudnn
INTERFACE
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CUDNN_INCLUDE_DIR}>)
endif()
if(CUTLASS_ENABLE_CUDNN AND NOT CUDNN_FOUND)
message(FATAL_ERROR "CUTLASS_ENABLE_CUDNN enabled but cuDNN library could not be found.")
endif()
message(STATUS "Configuring cuDNN ... done.")

docs/_config.yml Normal file
View File

@ -0,0 +1 @@
theme: jekyll-theme-minimal

View File

@ -1,25 +1,33 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
00_basic_gemm
basic_gemm.cu

View File

@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
@ -148,7 +154,6 @@ cudaError_t CutlassSgemmNN(
/// Kernel to initialize a matrix with small integers.
__global__ void InitializeMatrix_kernel(
float *matrix,
int ldm,
int rows,
int columns,
int seed = 0) {
@ -157,7 +162,7 @@ __global__ void InitializeMatrix_kernel(
int j = threadIdx.y + blockIdx.y * blockDim.y;
if (i < rows && j < columns) {
int offset = i + j * ldm;
int offset = i + j * rows;
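// With the ldm parameter removed, matrices are assumed to be densely packed in
// column-major order, so the leading dimension is simply the number of rows.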
// Generate arbitrary elements.
int const k = 16807;
@ -169,7 +174,7 @@ __global__ void InitializeMatrix_kernel(
}
/// Simple function to initialize a matrix to arbitrary small integers.
cudaError_t InitializeMatrix(float *matrix, int ldm, int rows, int columns, int seed = 0) {
cudaError_t InitializeMatrix(float *matrix, int rows, int columns, int seed = 0) {
dim3 block(16, 16);
dim3 grid(
@ -177,7 +182,7 @@ cudaError_t InitializeMatrix(float *matrix, int ldm, int rows, int columns, int
(columns + block.y - 1) / block.y
);
InitializeMatrix_kernel<<< grid, block >>>(matrix, ldm, rows, columns, seed);
InitializeMatrix_kernel<<< grid, block >>>(matrix, rows, columns, seed);
return cudaGetLastError();
}
@ -185,10 +190,10 @@ cudaError_t InitializeMatrix(float *matrix, int ldm, int rows, int columns, int
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Allocates device memory for a matrix then fills with arbitrary small integers.
cudaError_t AllocateMatrix(float **matrix, int ldm, int rows, int columns, int seed = 0) {
cudaError_t AllocateMatrix(float **matrix, int rows, int columns, int seed = 0) {
cudaError_t result;
size_t sizeof_matrix = sizeof(float) * ldm * columns;
size_t sizeof_matrix = sizeof(float) * rows * columns;
// Allocate device memory.
result = cudaMalloc(reinterpret_cast<void **>(matrix), sizeof_matrix);
@ -209,7 +214,7 @@ cudaError_t AllocateMatrix(float **matrix, int ldm, int rows, int columns, int s
}
// Initialize matrix elements to arbitrary small integers.
result = InitializeMatrix(*matrix, ldm, rows, columns, seed);
result = InitializeMatrix(*matrix, rows, columns, seed);
if (result != cudaSuccess) {
std::cerr << "Failed to initialize matrix: "
@ -304,20 +309,20 @@ cudaError_t TestCutlassGemm(int M, int N, int K, float alpha, float beta) {
// Allocate matrices in GPU device memory with arbitrary seeds.
//
result = AllocateMatrix(&A, lda, M, K, 0);
result = AllocateMatrix(&A, M, K, 0);
if (result != cudaSuccess) {
return result;
}
result = AllocateMatrix(&B, ldb, K, N, 17);
result = AllocateMatrix(&B, K, N, 17);
if (result != cudaSuccess) {
cudaFree(A);
return result;
}
result = AllocateMatrix(&C_cutlass, ldc, M, N, 101);
result = AllocateMatrix(&C_cutlass, M, N, 101);
if (result != cudaSuccess) {
cudaFree(A);
@ -325,7 +330,7 @@ cudaError_t TestCutlassGemm(int M, int N, int K, float alpha, float beta) {
return result;
}
result = AllocateMatrix(&C_reference, ldc, M, N, 101);
result = AllocateMatrix(&C_reference, M, N, 101);
if (result != cudaSuccess) {
cudaFree(A);

View File

@ -1,25 +1,33 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
01_cutlass_utilities
cutlass_utilities.cu

View File

@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
@ -119,12 +125,12 @@ cudaError_t cutlass_hgemm_nn(
int K,
cutlass::half_t alpha,
cutlass::half_t const *A,
int lda,
cutlass::layout::ColumnMajor::Stride::Index lda,
cutlass::half_t const *B,
int ldb,
cutlass::layout::ColumnMajor::Stride::Index ldb,
cutlass::half_t beta,
cutlass::half_t *C,
int ldc) {
cutlass::layout::ColumnMajor::Stride::Index ldc) {
// Define the GEMM operation
using Gemm = cutlass::gemm::device::Gemm<

View File

@ -1,25 +1,33 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
02_dump_reg_shmem
dump_reg_shmem.cu

View File

@ -1,27 +1,31 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
*modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice,
*this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
*notice, this list of conditions and the following disclaimer in the
*documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its
*contributors may be used to endorse or promote products derived from this
*software without specific prior written permission.
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
*DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT,
*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
*DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
*OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TOR (INCLUDING
*NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
*EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
@ -69,7 +73,7 @@
template <typename Element, typename GmemIterator, typename SmemIterator>
__global__ void kernel_dump(typename GmemIterator::Params params,
typename GmemIterator::TensorRef ref) {
__shared__ Element shared_storage[EXAMPLE_MATRIX_ROW * EXAMPLE_MATRIX_COL];
extern __shared__ Element shared_storage[];
// Construct the global iterator and load the data to the fragments.
int tb_thread_id = threadIdx.y * blockDim.x + threadIdx.x;
@ -164,8 +168,11 @@ int main() {
dim3 grid(1, 1);
dim3 block(32, 1, 1);
int smem_size =
int(sizeof(Element) * EXAMPLE_MATRIX_ROW * EXAMPLE_MATRIX_COL);
kernel_dump<Element, GmemIterator, SmemIterator>
<<<grid, block>>>(params, matrix.device_ref());
<<<grid, block, smem_size, 0>>>(params, matrix.device_ref());
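// Note: with `extern __shared__`, the shared memory allocation is no longer fixed at
// compile time; its size (smem_size above) is supplied as the third kernel launch parameter.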
cudaError_t result = cudaDeviceSynchronize();

View File

@ -1,28 +1,40 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
set(TEST_COMMAND_00 RowMajor --extent=16,16)
cutlass_example_add_executable(
03_visualize_layout
visualize_layout.cpp
register_layout.cu
TEST_COMMAND_OPTIONS
TEST_COMMAND_00
)

View File

@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/

View File

@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
@ -55,27 +61,49 @@ void RegisterLayouts(std::map<std::string, std::unique_ptr<VisualizeLayoutBase>
new VisualizeLayout<cutlass::layout::ColumnMajorInterleaved<4>>},
{"RowMajorInterleaved<4>",
new VisualizeLayout<cutlass::layout::RowMajorInterleaved<4>>},
// All Ampere/Turing H/Integer matrix multiply tensor core kernels use the same swizzling
// layout implementation with different templates.
//
// mma.sync.aligned.m8n8k128.s32.b1.b1.s32 Interleaved-256
// mma.sync.aligned.m16n8k256.s32.b1.b1.s32 Interleaved-256
{"TensorOpMultiplicand<1,256>",
new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<1, 256>>},
// mma.sync.aligned.m8n8k128.s32.b1.b1.s32 TN kblock512
// mma.sync.aligned.m16n8k256.s32.b1.b1.s32 TN kblock512
{"TensorOpMultiplicand<1,512>",
new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<1, 512>>},
// mma.sync.aligned.m16n8k256.s32.b1.b1.s32 TN kblock1024
{"TensorOpMultiplicand<1,1024>",
new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<1, 1024>>},
// Integer matrix multiply.int4 8832 Interleaved-64
// Integer matrix multiply.int4 16864 Interleaved-64
{"TensorOpMultiplicand<4,64>",
new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<4, 64>>},
// Integer matrix multiply.int4 8832 TN kblock128
// Integer matrix multiply.int4 16864 TN kblock128
{"TensorOpMultiplicand<4,128>",
new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<4, 128>>},
// Integer matrix multiply.int4 16864 TN kblock256
{"TensorOpMultiplicand<4,256>",
new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<4, 256>>},
// Integer matrix multiply 8816 Interleaved-32
// Integer matrix multiply 16832 Interleaved-32
{"TensorOpMultiplicand<8,32>",
new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<8, 32>>},
// Integer matrix multiply 8816 TN kblock64
// Integer matrix multiply 16832 TN kblock64
{"TensorOpMultiplicand<8,64>",
new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<8, 64>>},
// Integer matrix multiply 16832 TN kblock128
{"TensorOpMultiplicand<8,128>",
new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<8, 128>>},
// Matrix Multiply 1688 TN kblock32
// Matrix multiply 16816 TN kblock32
{"TensorOpMultiplicand<16,32>",
new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<16, 32>>},
// Matrix multiply 1688 NT
// Matrix multiply 16816 NT
// Matrix multiply 16816 TN kblock64
{"TensorOpMultiplicand<16,64>",
new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<16, 64>>},
// Matrix multiply 1688.TF32 TN kblock16

View File

@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/

View File

@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
@ -32,6 +38,8 @@
#include <iomanip>
#include <memory>
#include <cutlass/cutlass.h>
#include "options.h"
#include "register_layout.h"
@ -87,7 +95,7 @@ void print_usage(std::ostream &out) {
"--extent=16,16 --vectorize=2 --output-shape=16,4\n"
<< "$ 03_visualize_layout \"VoltaTensorOpMultiplicandCrosswise<16,32>\" "
"--extent=32,64 --vectorize=4 --output-shape=64,4\n"
<< "$ 03_visualize_layout \"VotlaTensorOpMultiplicandCongruous<16>\" "
<< "$ 03_visualize_layout \"VoltaTensorOpMultiplicandCongruous<16>\" "
"--extent=64,32 --vectorize=8 --output-shape=64,4\n";
out << std::endl;
@ -133,6 +141,8 @@ int main(int argc, char const *arg[]) {
layout_it->second->print_csv(std::cout);
cudaFree(0); // Ensure CUDA is available.
return 0;
}

View File

@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/

View File

@ -1,25 +1,33 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
04_tile_iterator
tile_iterator.cu

View File

@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
@ -44,7 +50,6 @@
#include <iostream>
#include <sstream>
#include <vector>
#include <fstream>
// CUTLASS includes
#include "cutlass/transform/threadblock/predicated_tile_iterator.h"

View File

@ -1,25 +1,33 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
05_batched_gemm
batched_gemm.cu

View File

@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
@ -28,12 +34,16 @@
#include "cutlass/cutlass.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/gemm/device/gemm_array.h"
#include "cutlass/gemm/device/gemm_batched.h"
#pragma warning( disable : 4503)
/*
This example demonstrates how to use cutlass to compute a batched strided gemm.
This example demonstrates how to use cutlass to compute a batched strided gemm in two different ways:
1. By specifying pointers to the first matrices of the batch and the stride between the consecutive
matrices of the batch (this is called a strided batched gemm).
2. By copying pointers to all matrices of the batch to the device memory (this is called an array gemm).
In this example, both A and B are non-transpose, column-major matrices
batched_C = batched_A x batched_B
As an example, matrix C can be seen as
@ -71,7 +81,7 @@ matrix A can be seen as
---------------------------------------
batch 0 | batch 1
, where batch size is 2, M is 6 and K is 2
The stride (batch_stride_B) between the first element of two batches is lda * k
The stride (batch_stride_A) between the first element of two batches is lda * k
matrix B can be seen as
-----------------------------
@ -84,11 +94,50 @@ matrix B can be seen as
(1,1,0) | (1,1,1) | (1,1,2) |
-----------------------------
, where the batch size is 2, N is 3 and K is 2
The stride (batch_stride_C) between the first element of two batches is k
The stride (batch_stride_B) between the first element of two batches is k
*/
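To make the stride bookkeeping described above concrete, here is a small editorial sketch using the toy sizes from the pictures (the example's own run_batched_gemm routine, shown further down in this diff, sets up equivalent values; treat this block as an illustration rather than part of the file):

```
// Illustration only: the toy sizes from the pictures above.
int const m = 6, n = 3, k = 2, batch_count = 2;

// Column-major A (m x k): batches laid out side by side along the columns.
int const lda = m;
long long int batch_stride_A = static_cast<long long int>(lda) * k;   // lda * k

// Column-major B (k x n): batches interleaved along the leading dimension.
int const ldb = k * batch_count;
long long int batch_stride_B = static_cast<long long int>(k);         // k

// Column-major C (m x n): batches stored back to back (as the full example sets up).
int const ldc = m;
long long int batch_stride_C = static_cast<long long int>(ldc) * n;   // ldc * n
```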
cudaError_t cutlass_array_sgemm(
  int m,
  int n,
  int k,
  float alpha,
  float const * const *A,
  int lda,
  float const * const *B,
  int ldb,
  float * const *C,
  int ldc,
  float beta,
  int batch_count) {
  using Gemm = cutlass::gemm::device::GemmArray<
    float, cutlass::layout::ColumnMajor,
    float, cutlass::layout::ColumnMajor,
    float, cutlass::layout::ColumnMajor
  >;
  Gemm gemm_op;
  cutlass::Status status = gemm_op({
    {m, n, k},
    A, lda,
    B, ldb,
    C, ldc,
    C, ldc,
    {alpha, beta},
    batch_count
  });
  if (status != cutlass::Status::kSuccess) {
    return cudaErrorUnknown;
  }
  return cudaSuccess;
}
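The example's other path, cutlass_strided_batched_sgemm (its signature begins just below; the body falls outside this hunk), drives cutlass::gemm::device::GemmBatched, whose header is already included at the top of this file. A minimal sketch of that call, assuming the same single-precision, column-major operands (the helper name here is illustrative, not the file's exact body):

```
// Sketch: the strided-batched path, one base pointer per operand plus a fixed
// element stride between batches, in contrast to GemmArray above, which takes
// arrays of per-batch pointers.
cudaError_t strided_batched_sgemm_sketch(
  int m, int n, int k,
  float alpha,
  float const *A, int lda, long long int batch_stride_A,
  float const *B, int ldb, long long int batch_stride_B,
  float *C, int ldc, long long int batch_stride_C,
  float beta,
  int batch_count) {

  using Gemm = cutlass::gemm::device::GemmBatched<
    float, cutlass::layout::ColumnMajor,
    float, cutlass::layout::ColumnMajor,
    float, cutlass::layout::ColumnMajor
  >;

  Gemm gemm_op;

  cutlass::Status status = gemm_op({
    {m, n, k},
    {A, lda}, batch_stride_A,
    {B, ldb}, batch_stride_B,
    {C, ldc}, batch_stride_C,
    {C, ldc}, batch_stride_C,
    {alpha, beta},
    batch_count
  });

  return (status == cutlass::Status::kSuccess) ? cudaSuccess : cudaErrorUnknown;
}
```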
cudaError_t cutlass_strided_batched_sgemm(
int m,
int n,
@ -188,7 +237,11 @@ cudaError_t strided_batched_gemm_nn_reference(
return result;
}
int main() {
cudaError_t run_batched_gemm(bool use_array) {
const char* gemm_desc = use_array ? "array" : "strided batched";
std::cout << "Running " << gemm_desc << " gemm" << std::endl;
// Arbitrary problem size
int const m = 520;
@ -293,11 +346,69 @@ int main() {
}
// run cutlass
result = cutlass_strided_batched_sgemm(
m, n, k, alpha, A, lda, batch_stride_A, B, ldb, batch_stride_B, C, ldc, batch_stride_C,
beta, batch_count);
if (result != cudaSuccess)
return result;
  if (use_array) {
    // allocate the host memory for the pointers to the matrices of the batch
    std::vector<float*> host_ptr_A(batch_count);
    std::vector<float*> host_ptr_B(batch_count);
    std::vector<float*> host_ptr_C(batch_count);
    // permute the batch elements to emphasize that GemmArray does not depend on matrices being separated by a fixed stride
    std::vector<size_t> permutation = {14, 11, 3, 10, 1, 13, 9, 4, 6, 16, 8, 15, 7, 12, 0, 2, 5};
    for (size_t b_idx = 0; b_idx < batch_count; b_idx++) {
      host_ptr_A[b_idx] = A + permutation[b_idx] * batch_stride_A;
      host_ptr_B[b_idx] = B + permutation[b_idx] * batch_stride_B;
      host_ptr_C[b_idx] = C + permutation[b_idx] * batch_stride_C;
    }
    // allocate the corresponding device memory
    float const **ptr_A;
    float const **ptr_B;
    float **ptr_C;
    result = cudaMalloc(&ptr_A, batch_count * sizeof(float*));
    if (result != cudaSuccess) {
      std::cerr << "cudaMalloc result = " << result << std::endl;
      return result;
    }
    result = cudaMalloc(&ptr_B, batch_count * sizeof(float*));
    if (result != cudaSuccess) {
      std::cerr << "cudaMalloc result = " << result << std::endl;
      return result;
    }
    result = cudaMalloc(&ptr_C, batch_count * sizeof(float*));
    if (result != cudaSuccess) {
      std::cerr << "cudaMalloc result = " << result << std::endl;
      return result;
    }
    // copy the matrix pointers to the device
    result = cudaMemcpy(ptr_A, host_ptr_A.data(), batch_count * sizeof(float*), cudaMemcpyHostToDevice);
    if (result != cudaSuccess) {
      std::cerr << "cudaMemcpy result = " << result << std::endl;
      return result;
    }
    result = cudaMemcpy(ptr_B, host_ptr_B.data(), batch_count * sizeof(float*), cudaMemcpyHostToDevice);
    if (result != cudaSuccess) {
      std::cerr << "cudaMemcpy result = " << result << std::endl;
      return result;
    }
    result = cudaMemcpy(ptr_C, host_ptr_C.data(), batch_count * sizeof(float*), cudaMemcpyHostToDevice);
    if (result != cudaSuccess) {
      std::cerr << "cudaMemcpy result = " << result << std::endl;
      return result;
    }
    result = cutlass_array_sgemm(m, n, k, alpha, ptr_A, lda, ptr_B, ldb, ptr_C, ldc, beta, batch_count);
    if (result != cudaSuccess)
      return result;
  } else {
    result = cutlass_strided_batched_sgemm(
      m, n, k, alpha, A, lda, batch_stride_A, B, ldb, batch_stride_B, C, ldc, batch_stride_C,
      beta, batch_count);
    if (result != cudaSuccess)
      return result;
  }
// copy device memory to host
result = cudaMemcpy(result_C.data(), C, count_C * sizeof(float), cudaMemcpyDeviceToHost);
@ -314,7 +425,7 @@ int main() {
// Expect bit-level accuracy for this simple example
if (ref_C != result_C) {
std::cout << "CUTLASS strided batched gemm does not run correctly" << std::endl;
std::cout << "CUTLASS " << gemm_desc << " gemm does not run correctly" << std::endl;
return cudaErrorUnknown;
}
@ -335,9 +446,19 @@ int main() {
return result;
}
return result;
}
if (result == cudaSuccess) {
std::cout << "Passed." << std::endl;
int main() {
  cudaError_t result = cudaSuccess;
  for (bool use_array : {false, true}) {
    result = run_batched_gemm(use_array);
    if (result == cudaSuccess) {
      std::cout << "Passed." << std::endl;
    } else {
      break;
    }
  }
// Exit.

View File

@ -1,25 +1,33 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
06_splitK_gemm
splitk_gemm.cu

View File

@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
@ -49,7 +55,7 @@ composed from lower level ones. Multiple thread-tiles (tile size each thread com
to form warp-tiles (tile size each warp computes) and multiple warp tiles can be used to compute
threadblock-tile (tile size computed by a threadblock).
In thie example, we split variable initialization into
In this example, we split variable initialization into
1. Setting up data properties : describes how matrices are laid out in the memory and how the kernel
can view them (logical to physical mapping)
2. Setting up computation properties : describes how the above set matrices will be used to compute
@ -68,10 +74,10 @@ ElementAccumulator (float), ElementComputeEpilogue (float), ElementInputA (cutla
ElementInputB (cutlass::half_t), ElementOutput (float). Communicating just the data type is not
enough. As the data is laid out linearly in memory, we have to convey the layout of matrices. We do
that by initializing template variable LayoutInputA to column major cutlass variable, LayoutInputB
to row major and LayoutOutput to row major. Next, we setup rules to comptue alpha * X + beta * C
to row major and LayoutOutput to row major. Next, we setup rules to compute alpha * X + beta * C
which is called epilogue of the kernel. We initialize template variable EpilogueOp, which takes the
data type of output ElementOutput (int32_t), the number of elements per vector memory access (16),
data type of accumulator (int32_t) and data type of computation of linear combination (alpha * X +
data type of output ElementOutput (float), the number of elements per vector memory access (16),
data type of accumulator (float) and data type of computation of linear combination (alpha * X +
beta * C).
Now that we setup the properties of data, we have to setup properties of computation.
@ -79,7 +85,7 @@ Now that we setup the properties of data, we have to setup properties of computa
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x32,
64x64x4, 8x8x4 (MxNxK) respectively. When passed to instantiate CUTLASS GEMM kernel, it internally
deduce the amount of threads needed per thread-block, amount of shared memory, storing data in
bank-conflict free manner, and ton of other variables required to compose, intialize and launch a
bank-conflict free manner, and ton of other variables required to compose, initialize and launch a
high performance GEMM kernel. This is the beauty of CUTLASS, it relieves developer from
understanding and coding complicated hardware optimizations which can easily go wrong.
@ -89,7 +95,7 @@ is done which threadblock launched on an SM, CUDA SM architecture of GPU you wan
These are all put together to create a template variable which describes CUTLASS GEMM kernel using
cutlass::gemm::device::GemmSplitKParallel template.
The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it.
The next step is to initialize physical data, instantiate and initialize CUTLASS kernel and run it.
We use CUTLASS utilities to initialize, fill, and compare matrices, as they are simple and don't get
in the way of learning CUTLASS.
@ -97,7 +103,7 @@ Once all the matrices are initialized and filled with data, create arguments tup
kernel which takes problem size (M = 5120, N = 4096 and K = 4096), matrices, alpha, beta and the
important one, split k-dimension factor. Along with that, we query CUTLASS if any scratch-space
memory required by the kernel we instantiated. If yes, we create it and pass it along with other
arguments created to intialize CUTLASS kernel then, the kernel is launched.
arguments created to initialize CUTLASS kernel then, the kernel is launched.
In this example, we later on launch a reference gemm kernel (from CUTLASS utilities) to compare if
the output from CUTLASS kernel is same as reference GEMM kernel.
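Condensing the setup walked through above into one place, the sketch below shows how the pieces fit together for a parallel split-K GEMM. It mirrors the type aliases and tile shapes this example declares (half-precision inputs, float accumulation, Volta tensor ops); the helper name run_splitk_sketch and the split-K factor shown are illustrative, and the tensor fills and reference check are elided:

```
#include <cstdint>

#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm_splitk_parallel.h"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "cutlass/util/device_memory.h"

// Data types and layouts, as in the example.
using ElementInputA      = cutlass::half_t;
using ElementInputB      = cutlass::half_t;
using ElementOutput      = float;
using ElementAccumulator = float;

using LayoutInputA = cutlass::layout::ColumnMajor;
using LayoutInputB = cutlass::layout::RowMajor;
using LayoutOutput = cutlass::layout::RowMajor;

// Epilogue: alpha * X + beta * C, vectorized over 128-bit accesses.
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
    ElementOutput,
    128 / cutlass::sizeof_bits<ElementOutput>::value,
    ElementAccumulator,
    ElementAccumulator>;

using Gemm = cutlass::gemm::device::GemmSplitKParallel<
    ElementInputA, LayoutInputA,
    ElementInputB, LayoutInputB,
    ElementOutput, LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,          // use tensor cores
    cutlass::arch::Sm70,                     // Volta
    cutlass::gemm::GemmShape<128, 128, 32>,  // threadblock tile
    cutlass::gemm::GemmShape<64, 64, 32>,    // warp tile
    cutlass::gemm::GemmShape<8, 8, 4>,       // mma instruction tile
    EpilogueOp>;

cutlass::Status run_splitk_sketch(
    cutlass::gemm::GemmCoord problem_size,
    cutlass::TensorRef<ElementInputA, LayoutInputA> A,
    cutlass::TensorRef<ElementInputB, LayoutInputB> B,
    cutlass::TensorRef<ElementOutput, LayoutOutput> C,
    cutlass::TensorRef<ElementOutput, LayoutOutput> D,
    float alpha, float beta,
    int split_k_slices) {              // the K dimension is partitioned this many ways

  typename Gemm::Arguments arguments{
      problem_size, A, B, C, D, {alpha, beta}, split_k_slices};

  // The parallel split-K kernel needs scratch space for the partial results.
  size_t workspace_size = Gemm::get_workspace_size(arguments);
  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

  Gemm gemm_op;
  cutlass::Status status = gemm_op.initialize(arguments, workspace.get());
  if (status != cutlass::Status::kSuccess) return status;

  return gemm_op();   // launches the partitioned GEMM followed by the reduction
}
```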
@ -143,9 +149,6 @@ using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 32>; // <- warp tile M =
// This code section describes the size of MMA op
using ShapeMMAOp = cutlass::gemm::GemmShape<8, 8, 4>; // <- MMA Op tile M = 8, N = 8, K = 4
// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ??
// This code section describes ?
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
ElementOutput, // <- data type of output matrix
@ -205,7 +208,7 @@ int run() {
cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(
problem_size.mk()); // <- Create matrix A with dimensions M x K
cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(
problem_size.nk()); // <- Create matrix B with dimensions N x K
problem_size.kn()); // <- Create matrix B with dimensions K x N
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(
problem_size.mn()); // <- Create matrix C with dimensions M x N
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(

View File

@ -1,25 +1,33 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
07_volta_tensorop_gemm
volta_tensorop_gemm.cu

View File

@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
@ -67,7 +73,7 @@ beta * C).
Now that we setup the properties of data, we have to setup properties of computation.
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x32,
64x64x4, 8x8x4 (MxNxK) respectively. When passed to instantiate CUTLASS GEMM kernel, it internally
64x64x32, 8x8x4 (MxNxK) respectively. When passed to instantiate CUTLASS GEMM kernel, it internally
deduce the amount of threads needed per thread-block, amount of shared memory, storing data in
bank-conflict free manner, and ton of other variables required to compose, intialize and launch a
high performance GEMM kernel. This is the beauty of CUTLASS, it relieves developer from
@ -284,8 +290,12 @@ int run() {
// Instantiate CUTLASS kernel depending on templates
Gemm gemm_op;
// Check the problem size is supported or not
cutlass::Status status = gemm_op.can_implement(arguments);
CUTLASS_CHECK(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
cutlass::Status status = gemm_op.initialize(arguments, workspace.get());
status = gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(status);
// Launch initialized CUTLASS kernel
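For context, the guarded sequence this change produces looks roughly like the sketch below, continuing with the names already in scope in the example (gemm_op, arguments, workspace, and its CUTLASS_CHECK macro); it is an illustration of the call order, not the file's exact contents:

```
// Sketch of the full sequence after this change.
cutlass::Status status = gemm_op.can_implement(arguments);  // reject unsupported problem shapes up front
CUTLASS_CHECK(status);

status = gemm_op.initialize(arguments, workspace.get());    // bind arguments and workspace
CUTLASS_CHECK(status);

status = gemm_op();                                         // launch the GEMM kernel
CUTLASS_CHECK(status);
```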

View File

@ -1,25 +1,33 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
08_turing_tensorop_gemm
turing_tensorop_gemm.cu

View File

@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
@ -188,31 +194,6 @@ using Gemm = cutlass::gemm::device::Gemm<ElementInputA,
int run() {
// Turing Tensor Core operations exposed with mma.sync and ldmatrix are first available
// in CUDA 10.2.
//
// CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
return -1;
}
cudaDeviceProp props;
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (error != cudaSuccess) {
std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
return -1;
}
if (!((props.major * 10 + props.minor) >= 75)) {
std::cerr << "Turing Tensor Core operations must be run on a machine with compute capability at least 75."
<< std::endl;
// Return 0 so tests are considered passing if run on unsupported platforms.
return 0;
}
const int length_m = 5120;
const int length_n = 4096;
const int length_k = 4096;
@ -291,8 +272,12 @@ int run() {
// Instantiate CUTLASS kernel depending on templates
Gemm gemm_op;
// Check the problem size is supported or not
cutlass::Status status = gemm_op.can_implement(arguments);
CUTLASS_CHECK(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
cutlass::Status status = gemm_op.initialize(arguments, workspace.get());
status = gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(status);
// Launch initialized CUTLASS kernel
@ -337,18 +322,37 @@ int run() {
}
int main() {
bool notSupported = false;
// Turing Tensor Core operations exposed with mma.sync and ldmatrix are first available
// in CUDA 10.2.
//
// CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
notSupported = true;
}
// Returning zero so this test passes when built on older Toolkits.
cudaDeviceProp props;
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (error != cudaSuccess) {
std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
return -1;
}
if (!((props.major * 10 + props.minor) >= 75)) {
std::cerr << "Turing Tensor Core operations must be run on a machine with compute capability at least 75."
<< std::endl;
notSupported = true;
}
if (notSupported) {
// Returning zero so this test passes on older Toolkits. Its actions are a no-op.
return 0;
}
else {
return run();
}
return run();
}

View File

@ -0,0 +1,36 @@
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
09_turing_tensorop_conv2dfprop
turing_tensorop_conv2dfprop.cu
)

View File

@ -0,0 +1,771 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/**
This example shows how to run convolution kernels using functions and data structures
provided by CUTLASS, using Tensor Cores on an NVIDIA Turing GPU.
Writing a single high-performance convolution kernel is hard but doable, whereas writing
high-performance kernels at scale that work for multiple problem sizes with good abstractions is
really hard. CUTLASS solves this problem by providing simplified abstractions to compose
multiple sections of an implicit GEMM kernel. When used properly, the kernels can easily hit peak
performance of the GPU.
CUTLASS divides a kernel into hierarchical, composable sections, which means that at the thread, warp
and thread-block level, each computes on its own tile size, with higher-level tile sizes being
composed from lower-level ones. Multiple thread tiles (the tile size each thread computes) can be used
to form warp tiles (the tile size each warp computes), and multiple warp tiles can be used to compute
a threadblock tile (the tile size computed by a threadblock).
In this example, we split variable initialization into
1. Setting up data properties: describes how tensors are laid out in memory and how the kernel
can view them (logical-to-physical mapping)
2. Setting up computation properties: describes how the above tensors will be used to compute
the output of the convolution.
First, we set up the data types of the input tensor A, the weights tensor B and the output tensor C, along
with alpha and beta, as the equation for convolution is C = alpha * Conv(A, B) + beta * C. In CUTLASS,
the kernels first compute Conv(A, B) and leave the rest of the computation to the end of the kernel, as
alpha * X + beta * C is a simple element-wise operation on X (Conv(A, B)) and C. We call this the
epilogue of the kernel. Hence, we set up the data types for alpha and beta to be
ElementComputeEpilogue = float. We want to use MMA instructions on Turing, which support 4-bit
signed integers. However, int4b_t is not fully supported by the NVIDIA software stack, so CUTLASS introduces
cutlass::int4b_t. We use cutlass::int4b_t as the data type for elements in input tensors A and B. We
convey this to the CUTLASS kernel by initializing the template variables ElementAccumulator (int32_t),
ElementComputeEpilogue (float), ElementInputA (cutlass::int4b_t), ElementInputB (cutlass::int4b_t) and
ElementOutput (cutlass::int4b_t). Communicating just the data types is not enough. As the data is laid out
linearly in memory, we also have to convey the layout of the tensors. We do that by initializing the template
variables LayoutInputA, LayoutInputB and LayoutOutput to the CUTLASS TensorNHWC layout. Next, we set up the
rules to compute alpha * X + beta * C, which is called the epilogue of the kernel. We initialize the template
variable EpilogueOp, which takes the data type of the output ElementOutput (cutlass::int4b_t), the number of
elements per vectorized memory access (8), the data type of the accumulator (int32_t) and the data type of
the computation of the linear combination (alpha * X + beta * C).
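As a compact illustration of the paragraph above (editor's sketch; it mirrors the declarations that appear, together with the required headers, further down in this file):
using ElementAccumulator = int32_t;                 // accumulator type
using ElementComputeEpilogue = float;               // alpha/beta type
using ElementInputA = cutlass::int4b_t;             // element type of input tensor A
using ElementInputB = cutlass::int4b_t;             // element type of weights tensor B
using ElementOutput = cutlass::int4b_t;             // element type of output tensor C
using LayoutInputA = cutlass::layout::TensorNHWC;
using LayoutInputB = cutlass::layout::TensorNHWC;
using LayoutOutput = cutlass::layout::TensorNHWC;
using EpilogueOp = cutlass::epilogue::thread::LinearCombinationClamp<
    ElementOutput, 8, ElementAccumulator, ElementComputeEpilogue>;  // alpha * X + beta * C, clamped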
Now that we have set up the properties of the data, we have to set up the properties of the computation.
Second, we create template variables of the tile sizes for the thread-block, warp and mma-op of 128x128x128,
64x64x128 and 8x8x32 (MxNxK) respectively. When passed to instantiate the CUTLASS Implicit GEMM kernel, it
internally deduces the number of threads needed per thread-block, the amount of shared memory, how to store
data in a bank-conflict-free manner, and a ton of other variables required to compose, initialize and
launch a high-performance Implicit GEMM kernel. This is the beauty of CUTLASS: it relieves the developer
from understanding and coding complicated hardware optimizations, which can easily go wrong.
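Editor's sketch of the three tile-shape declarations described above (they appear with exactly these values further down in this file):
using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 128>;  // threadblock tile (MxNxK)
using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>;           // warp tile (MxNxK)
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>;       // Tensor Core instruction shape (MxNxK)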
CUTLASS also supports multiple MMA pipelines in a threadblock. What are MMA pipelines? An MMA pipeline
constitutes the whole process of loading input data from global memory to shared memory, loading data
from shared memory to registers, doing the matrix multiplication, and storing to global memory. The flow
sequence below shows a typical MMA pipeline.
tensor in global memory -> registers -> tile in shared memory -> registers -> mma -> registers ->
output to global memory
The problem with a single pipeline is that each stage is synchronous, which means each stage has to wait
until the previous one has finished executing. There are stages in the pipeline which do not have a fixed
latency, for example the loads from global memory and shared memory. Therefore, we can add one more
pipeline with a phase shift in the MMA kernel to hide the latency of the global and shared memory loads.
Finally, the pipelines in the kernel look like
(1) tensor in global memory -> (2) registers -> (3) tile in shared memory -> (4) registers -> (5)
mma -> (6) registers -> (7) output to global memory
(1) <null> -> (2) <null> -> (3) tensor in global memory -> (4) registers -> (5) tile in shared
memory -> (6) registers -> (7) mma -> (8) registers -> (9) output to global memory
This way, you can hide the latency of the second global memory load by doing computation on already-loaded
input data.
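In this example the pipeline depth described above is controlled by a single template parameter; an editor's note on the declaration used further down in this file:
constexpr int NumStages = 2;  // two stages = the classic double-buffered (software-pipelined) mainloop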
A few more template variables are initialized, such as which threadblock tile of the output matrix
is computed by which threadblock launched on an SM, and the CUDA SM architecture of the GPU you want to run on.
These are all put together to create a template variable which describes the CUTLASS Implicit GEMM
kernel using the cutlass::conv::device::ImplicitGemm template.
The next step is to initialize physical data, instantiate and initialize the CUTLASS kernel, and run it.
We use CUTLASS utilities to initialize, fill and compare tensors, as they are simple and do not get
in the way of learning CUTLASS.
Once all the tensors are initialized and filled with data, we create the arguments tuple to launch the CUTLASS
kernel, which takes the problem size (N = 1, H = 64, W = 64, C = 128), filter size (K = 64,
R = 3, S = 3, C = 128), padding, strides, dilation, tensors, alpha, beta and, importantly, the
split k-dimension factor. Along with that, we query CUTLASS as to whether any scratch-space
memory is required by the kernel we instantiated. If so, we create it and pass it along with the other
arguments to initialize the CUTLASS kernel; then the kernel is launched.
In this example, we later launch a reference convolution kernel (from the CUTLASS utilities) to
check whether the output from the CUTLASS kernel is the same as that of the reference implicit GEMM kernel.
*/
#include <iostream>
#include <fstream>
#include <sstream>
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "cutlass/util/command_line.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/convolution.h"
#include "cutlass/util/tensor_view_io.h"
#include "helper.h"
// The code section below describes datatype for input, output tensors and computation between
// elements
using ElementAccumulator = int32_t; // Data type of accumulator
using ElementComputeEpilogue = float; // Data type of epilogue computation (alpha, beta)
using ElementInputA = cutlass::int4b_t; // Data type of elements in input tensor
using ElementInputB = cutlass::int4b_t; // Data type of elements in input tensor
using ElementOutput = cutlass::int4b_t; // Data type of elements in output tensor
using LayoutInputA = cutlass::layout::TensorNHWC;
using LayoutInputB = cutlass::layout::TensorNHWC;
using LayoutOutput = cutlass::layout::TensorNHWC;
// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = cutlass::arch::OpClassTensorOp;
// This code section describes CUDA SM architecture number
using SmArch = cutlass::arch::Sm75;
// This code section describes the tile size a thread block will compute
using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 128>; // Threadblock tile shape
// This code section describes tile size a warp will compute
using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; // Warp tile shape
// This code section describes the size of MMA op
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; // TensorCore instruction shape
// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;
// Number of pipelines you want to use
constexpr int NumStages = 2;
// This code section describes the epilogue part of the kernel, we use default value
using EpilogueOp = cutlass::epilogue::thread::LinearCombinationClamp<
ElementOutput, // Data type of output matrix.
8, // The number of elements per vectorized
// memory access. This becomes the vector width of
// math instructions in the epilogue too.
ElementAccumulator, // Data type of accumulator
ElementComputeEpilogue>; // Data type for alpha/beta in linear combination
using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementInputA, LayoutInputA,
ElementInputB, LayoutInputB,
ElementOutput, LayoutOutput,
ElementAccumulator,
MMAOp,
SmArch,
ThreadblockShape,
WarpShape,
InstructionShape,
EpilogueOp,
SwizzleThreadBlock,
NumStages,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kAnalytic
>::Kernel;
using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
/////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {
bool help;
cutlass::Tensor4DCoord input_size;
cutlass::Tensor4DCoord filter_size;
cutlass::Tensor4DCoord padding;
cutlass::MatrixCoord conv_stride;
cutlass::MatrixCoord dilation;
bool reference_check;
bool measure_performance;
int iterations;
bool save_workspace;
ElementComputeEpilogue alpha;
ElementComputeEpilogue beta;
bool benchmark;
std::string tag;
Options():
help(false),
input_size(1, 32, 32, 32),
filter_size(32, 3, 3, 32),
padding(1, 1, 1, 1),
conv_stride(1, 1),
dilation(1, 1),
reference_check(false),
measure_performance(true),
iterations(20),
save_workspace(false),
alpha(1),
beta(0),
benchmark(false) { }
// Verify the problem size is compatible with the CUTLASS Convolution implementation.
bool valid() {
//
// CUTLASS attempts to load 128b vectors of int4b_t elements. Consequently,
// all pointers, strides, and tensor extents must be divisible by 32 elements.
//
int const kAlignment = 32;
if ((input_size.c() % kAlignment) ||
(filter_size.n() % kAlignment)) {
// misaligned tensors
return false;
}
// Invalid padding
if ((padding.h() != filter_size.h() / 2) ||
(padding.w() != filter_size.w() / 2)) {
return false;
}
return true;
}
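// Editor's note (illustrative, assuming cutlass::sizeof_bits from cutlass/numeric_types.h): the
// 32-element alignment used in valid() above follows from CUTLASS issuing 128-bit vectorized
// accesses of 4-bit elements: 128 / cutlass::sizeof_bits<ElementInputA>::value == 32.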
/// Updates input and filter sizes
void update(
cutlass::Tensor4DCoord input_size,
cutlass::Tensor4DCoord filter_size) {
this->input_size = input_size;
this->filter_size = filter_size;
padding.n() = filter_size.h() / 2;
padding.h() = filter_size.h() / 2;
padding.w() = filter_size.w() / 2;
padding.c() = filter_size.w() / 2;
}
// Parses the command line
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
}
if (cmd.check_cmd_line_flag("ref-check")) {
reference_check = true;
}
if (cmd.check_cmd_line_flag("perf-check")) {
measure_performance = true;
}
if (cmd.check_cmd_line_flag("save-workspace")) {
save_workspace = true;
}
if (cmd.check_cmd_line_flag("benchmark")) {
benchmark = true;
}
cmd.get_cmd_line_argument("n", input_size.n());
cmd.get_cmd_line_argument("h", input_size.h());
cmd.get_cmd_line_argument("w", input_size.w());
cmd.get_cmd_line_argument("c", input_size.c());
cmd.get_cmd_line_argument("k", filter_size.n());
cmd.get_cmd_line_argument("r", filter_size.h());
cmd.get_cmd_line_argument("s", filter_size.w());
filter_size.c() = input_size.c();
cmd.get_cmd_line_argument("alpha", alpha);
cmd.get_cmd_line_argument("beta", beta);
cmd.get_cmd_line_argument("iterations", iterations);
cmd.get_cmd_line_argument("tag", tag);
if (filter_size.h() == 3 && filter_size.w() == 3) {
padding = {1, 1, 1, 1};
}
else {
filter_size.h() = 1;
filter_size.w() = 1;
padding = {0, 0, 0, 0};
}
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "09_turing_tensorop_conv2dfprop example\n\n"
<< " This example uses Turing's Tensor Core operators on int4 data types to compute\n"
<< " forward convolution on tensors of layout NHWC.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement.\n\n"
<< " --n=<int> Input tensor extent N\n"
<< " --h=<int> Input tensor extent H\n"
<< " --w=<int> Input tensor extent W\n"
<< " --c=<int> Input tensor extent C\n"
<< " --k=<int> Filter extent K\n"
<< " --r=<int> Filter extent R\n"
<< " --s=<int> Filter extent S\n\n"
<< " --alpha=<float> Epilogue scalar alpha\n"
<< " --beta=<float> Epilogue scalar beta\n\n"
<< " --ref-check If set (true), reference check on the host is computed\n"
<< " --perf-check If set (true), performance is measured.\n"
<< " --benchmark If set (true), performance benchmarking on several layers and batch-size.\n"
<< " --iterations=<int> Number of profiling iterations to perform.\n"
<< " --save-workspace If set, workspace is written to a text file.\n"
<< " --tag=<string> String to replicate across the first column in the results table\n";
out << "\n\nExamples:\n\n"
<< "$ ./examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n"
<< "$ ./examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check\n\n";
return out;
}
/// Computes the output tensor size (NPQK)
cutlass::Tensor4DCoord output_size() const {
return cutlass::Tensor4DCoord(
input_size.n(),
(input_size.h() + padding.n() + padding.h() - filter_size.h()) / conv_stride.row() + 1,
(input_size.w() + padding.w() + padding.c() - filter_size.w()) / conv_stride.column() + 1,
filter_size.n());
}
/// Compute performance in GFLOP/s
double gflops(double runtime_s) const {
// Number of multiply-adds = NPQK * CRS
int64_t fmas = output_size().product() * int64_t(filter_size.h() * filter_size.w() * filter_size.c());
// Two flops per multiply-add
return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
}
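// Editor's note (illustrative, using the default Options above): input 1x32x32x32 (NHWC),
// filter 32x3x3x32 (KRSC), padding 1, unit stride => output 1x32x32x32 (NPQK), so
// fmas = (1*32*32*32) * (3*3*32) = 32768 * 288 = 9,437,184 multiply-adds,
// i.e. 2 * 9,437,184 flops, roughly 0.019 GFLOP per convolution.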
};
/////////////////////////////////////////////////////////////////////////////////////////////////
struct Result {
double runtime_ms;
double gflops;
cutlass::Status status;
cutlass::Status reference_check;
cudaError_t error;
Result():
runtime_ms(0),
gflops(0),
status(cutlass::Status::kSuccess),
reference_check(cutlass::Status::kInvalid),
error(cudaSuccess) { }
static std::ostream & print_header(std::ostream &out, Options const &options) {
if (!options.tag.empty()) {
out << "Name,";
}
out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPs";
return out;
}
std::ostream & print(std::ostream &out, int idx, Options const &options) {
if (!options.tag.empty()) {
out << options.tag << ",";
}
out
<< "conv_" << idx << ","
<< options.input_size.n() << ","
<< options.input_size.h() << ","
<< options.input_size.w() << ","
<< options.input_size.c() << ","
<< options.filter_size.n() << ","
<< options.filter_size.h() << ","
<< options.filter_size.w() << ","
<< runtime_ms << ","
<< gflops;
return out;
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Runs one benchmark
Result profile_convolution(Options const &options) {
Result result;
//
// Allocate host-device tensors using the CUTLASS Utilities.
//
cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(options.input_size);
cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(options.filter_size);
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(options.output_size());
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_c(options.output_size());
//
// Initialize tensors
//
// Fill tensor A on host with uniform-distribution random data
cutlass::reference::host::TensorFillRandomUniform(
tensor_a.host_view(),
1,
ElementInputA(7),
ElementInputA(-8),
0);
// Fill tensor B on host with uniform-distribution random data
cutlass::reference::host::TensorFillRandomUniform(
tensor_b.host_view(),
1,
ElementInputB(7),
ElementInputB(-8),
0);
// Fill tensor C on host with zeros
cutlass::reference::host::TensorFill(
tensor_c.host_view());
// Fill tensor C for reference on host with zeros
cutlass::reference::host::TensorFill(
tensor_ref_c.host_view());
// Copy data from host to GPU
tensor_a.sync_device();
tensor_b.sync_device();
tensor_c.sync_device();
tensor_ref_c.sync_device();
//
// Define arguments for CUTLASS Convolution
//
// mode (kCrossCorrelation or kConvolution)
cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation;
// Split K dimension into 1 partition
int split_k_slices = 1;
// Construct Conv2dProblemSize with user defined output size
cutlass::conv::Conv2dProblemSize problem_size(
options.input_size,
options.filter_size,
options.padding,
options.conv_stride,
options.dilation,
options.output_size(),
mode,
split_k_slices);
// Construct ImplicitGemm::Argument structure with conv2d
// problem size, data pointers, and epilogue values
typename ImplicitGemm::Arguments arguments{
problem_size,
tensor_a.device_ref(),
tensor_b.device_ref(),
tensor_c.device_ref(),
tensor_c.device_ref(),
{options.alpha, options.beta},
};
//
// Initialize CUTLASS Convolution
//
ImplicitGemm implicit_gemm_op;
size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
// Allocate workspace memory
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
result.status = implicit_gemm_op.can_implement(arguments);
CUTLASS_CHECK(result.status);
result.status = implicit_gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(result.status);
//
// Launch initialized CUTLASS kernel
//
result.status = implicit_gemm_op();
CUTLASS_CHECK(result.status);
//
// Optional reference check
//
if (options.reference_check) {
std::cout << "Verification on host...\n";
// Compute with reference implementation
cutlass::reference::host::Conv2dFprop<
ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementComputeEpilogue,
ElementAccumulator,
cutlass::NumericConverterClamp<ElementOutput, ElementComputeEpilogue>
>(
problem_size,
tensor_a.host_ref(),
tensor_b.host_ref(),
tensor_c.host_ref(),
tensor_ref_c.host_ref(),
options.alpha,
options.beta
);
// Check if output from CUTLASS kernel and reference kernel are equal or not
tensor_c.sync_host();
bool passed = cutlass::reference::host::TensorEquals(
tensor_c.host_view(),
tensor_ref_c.host_view());
if (!passed) {
result.reference_check = cutlass::Status::kErrorInternal;
std::cout << "ERROR - results miscompared.\n";
}
else {
result.reference_check = cutlass::Status::kSuccess;
std::cout << "Passed.\n";
}
}
else {
result.reference_check = cutlass::Status::kInvalid;
}
if (options.save_workspace) {
std::stringstream ss;
ss << "09_tensor_conv_workspace_conv2dfprop_"
<< options.input_size.n() << "x" << options.input_size.h() << "x" << options.input_size.w() << "x" << options.input_size.c()
<< "_"
<< options.filter_size.n() << "x" << options.filter_size.h() << "x" << options.filter_size.w() << "x" << options.filter_size.c()
<< ".dat";
std::ofstream output_workspace(ss.str());
output_workspace
<< "Input = \n" << tensor_a.host_view() << "\n\n"
<< "Filters = \n" << tensor_b.host_view() << "\n\n";
if (options.reference_check) {
output_workspace << "Reference = \n" << tensor_ref_c.host_view() << "\n\n";
}
output_workspace << "Computed = \n" << tensor_c.host_view() << std::endl;
std::cout << "Results written to '" << ss.str() << "'." << std::endl;
}
//
// Performance measurement
//
if (options.measure_performance) {
cudaEvent_t events[2];
for (auto & event : events) {
result.error = cudaEventCreate(&event);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
return result;
}
}
// Record an event at the start of a series of convolution operations.
result.error = cudaEventRecord(events[0]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
return result;
}
// Launch a sequence of implicit GEMM operations on the device
for (int iteration = 0; iteration < options.iterations; ++iteration) {
result.status = implicit_gemm_op();
CUTLASS_CHECK(result.status);
}
// Record an event when the convolutions have been launched.
result.error = cudaEventRecord(events[1]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
return result;
}
// Wait for work on the device to complete.
result.error = cudaEventSynchronize(events[1]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
return result;
}
// Measure elapsed runtime
float runtime_ms = 0;
result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
return result;
}
// Print average runtime and GFLOPs.
result.runtime_ms = double(runtime_ms) / double(options.iterations);
result.gflops = options.gflops(result.runtime_ms / 1000.0);
// Cleanup
for (auto event : events) {
(void)cudaEventDestroy(event);
}
}
return result;
}
/////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
// Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
//
// CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
return 0;
}
cudaDeviceProp props;
CUDA_CHECK(cudaGetDeviceProperties(&props, 0));
if (!(props.major > 7 || (props.major == 7 && props.minor >= 5))) {
std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75."
<< std::endl;
return 0;
}
Options options;
options.parse(argc, args);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
if (options.benchmark) {
// Benchmark several layers
int batch_sizes[] = {1, 32, 64, 128, 256, 512};
struct Benchmark {
int h, w, c, k, r, s;
} layers[] = {
{56, 56, 64, 256, 1, 1},
{56, 56, 64, 64, 1, 1},
{56, 56, 64, 64, 3, 3},
{56, 56, 256, 64, 1, 1},
{56, 56, 256, 512, 1, 1},
{56, 56, 256, 128, 1, 1},
{28, 28, 128, 128, 3, 3},
{28, 28, 128, 512, 1, 1},
{28, 28, 512, 128, 1, 1},
{28, 28, 512, 1024, 1, 1},
{28, 28, 512, 256, 1, 1},
{14, 14, 256, 256, 3, 3},
{14, 14, 256, 1024, 1, 1},
{14, 14, 1024, 256, 1, 1},
{14, 14, 1024, 2048, 1, 1},
{14, 14, 1024, 512, 1, 1},
{7, 7, 512, 512, 3, 3},
};
Result::print_header(std::cout, options) << std::endl;
int idx = 1;
for (auto const &layer : layers) {
for (auto N : batch_sizes) {
options.update({N, layer.h, layer.w, layer.c}, {layer.k, layer.r, layer.s, layer.c});
Result result = profile_convolution(options);
result.print(std::cout, idx, options) << std::endl;
}
++idx;
}
}
else {
// Execute one problem size
if (!options.valid()) {
std::cerr << "Invalid problem." << std::endl;
return -1;
}
Result result = profile_convolution(options);
Result::print_header(std::cout, options) << std::endl;
result.print(std::cout, 1, options) << std::endl;
}
return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,26 +1,34 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Planar Complex GEMM example
cutlass_example_add_executable(
10_planar_complex

View File

@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
@ -50,7 +56,7 @@
To build strictly the planar complex kernels needed for general application, execute the following
CMake command in an empty build directory.
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75" \
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
-DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex
This builds all planar complex GEMM variants for the Volta, Turing and Ampere architectures.
@ -59,7 +65,7 @@
specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for
the 'CN' layout configuration (conjugate A operand with both A and B as column-major).
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75" \
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
-DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_f16*cn
$ make 10_planar_complex
@ -68,7 +74,6 @@
*/
#include <iostream>
#include <fstream>
#include <sstream>
#include "cutlass/cutlass.h"
@ -167,15 +172,15 @@ struct Options {
<< " This example uses the CUTLASS Library to execute Planar Complex GEMM computations.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement.\n\n"
<< " --m <int> GEMM M dimension\n"
<< " --n <int> GEMM N dimension\n"
<< " --k <int> GEMM K dimension\n"
<< " --batch <int> Number of GEMM operations executed in one batch\n"
<< " --alpha <f32> Epilogue scalar alpha (real part)\n"
<< " --alpha_i <f32> Epilogue scalar alpha (imaginary part)\n"
<< " --beta <f32> Epilogue scalar beta (real part)\n\n"
<< " --beta_i <f32> Epilogue scalar beta (imaginary part)\n\n"
<< " --iterations <int> Number of profiling iterations to perform.\n\n";
<< " --m=<int> GEMM M dimension\n"
<< " --n=<int> GEMM N dimension\n"
<< " --k=<int> GEMM K dimension\n"
<< " --batch=<int> Number of GEMM operations executed in one batch\n"
<< " --alpha=<f32> Epilogue scalar alpha (real part)\n"
<< " --alpha_i=<f32> Epilogue scalar alpha (imaginary part)\n"
<< " --beta=<f32> Epilogue scalar beta (real part)\n\n"
<< " --beta_i=<f32> Epilogue scalar beta (imaginary part)\n\n"
<< " --iterations=<int> Number of profiling iterations to perform.\n\n";
out << "\n\nExamples:\n\n"
<< "$ ./examples/10_planar_complex/10_planar_complex --batch=7 --m=1024 --n=512 --k=1024 \\\n"
@ -275,10 +280,10 @@ public:
int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2;
int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2;
int lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
int ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
int ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
int ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k();
int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n();
@ -526,6 +531,11 @@ int main(int argc, char const **args) {
return 0;
}
}
else {
// NVIDIA Ampere Architecture GPUs (SM80 and later) are fully supported on CUDA 11 Toolkit and beyond.
//
// fall through
}
//
// Parse options

View File

@ -1,26 +1,34 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Planar Complex Array GEMM example
cutlass_example_add_executable(
11_planar_complex_array

View File

@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
@ -48,7 +54,7 @@
To build strictly the planar complex kernels needed for general application, execute the following
CMake command in an empty build directory.
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75" \
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
-DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex
This builds all planar complex GEMM variants for the Volta, Turing and Ampere architectures.
@ -57,7 +63,7 @@
specified as follows. This only builds planar complex GEMMs targeting Tensor Cores for
the 'CN' layout configuration (conjugate A operand with both A and B as column-major).
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75" \
$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" \
-DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_f16_s*gemm_planar_complex_array_f16*cn
$ make 11_planar_complex_array
@ -66,7 +72,6 @@
*/
#include <iostream>
#include <fstream>
#include <sstream>
#include "cutlass/cutlass.h"
@ -165,15 +170,15 @@ struct Options {
<< " This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement.\n\n"
<< " --m <int> GEMM M dimension\n"
<< " --n <int> GEMM N dimension\n"
<< " --k <int> GEMM K dimension\n"
<< " --batch <int> Number of GEMM operations executed in one batch\n"
<< " --alpha <f32> Epilogue scalar alpha (real part)\n"
<< " --alpha_i <f32> Epilogue scalar alpha (imaginary part)\n"
<< " --beta <f32> Epilogue scalar beta (real part)\n\n"
<< " --beta_i <f32> Epilogue scalar beta (imaginary part)\n\n"
<< " --iterations <int> Number of profiling iterations to perform.\n";
<< " --m=<int> GEMM M dimension\n"
<< " --n=<int> GEMM N dimension\n"
<< " --k=<int> GEMM K dimension\n"
<< " --batch=<int> Number of GEMM operations executed in one batch\n"
<< " --alpha=<f32> Epilogue scalar alpha (real part)\n"
<< " --alpha_i=<f32> Epilogue scalar alpha (imaginary part)\n"
<< " --beta=<f32> Epilogue scalar beta (real part)\n\n"
<< " --beta_i=<f32> Epilogue scalar beta (imaginary part)\n\n"
<< " --iterations=<int> Number of profiling iterations to perform.\n";
out << "\n\nExamples:\n\n"
<< "$ ./examples/11_planar_complex_array/11_planar_complex_array\n\n";
@ -292,10 +297,11 @@ public:
int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2;
int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2;
int lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
int ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
int ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
int ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k();
int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n();
@ -586,6 +592,11 @@ int main(int argc, char const **args) {
return 0;
}
}
else {
// NVIDIA Ampere Architecture GPUs (SM80 and later) are fully supported on CUDA 11 Toolkit and beyond.
//
// fall through
}
//
// Parse options

View File

@ -1,25 +1,33 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
12_gemm_bias_relu
gemm_bias_relu.cu


@@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
@@ -48,11 +54,20 @@ using ElementInputA = cutlass::half_t; // <- data type of elements
using ElementInputB = cutlass::half_t; // <- data type of elements in input matrix B
using ElementOutput = float; // <- data type of elements in output matrix D
// The code section below describes matrix layout of input and output matrices. Column Major for
// Matrix A, Row Major for Matrix B and Row Major for Matrix C
// Note that if the output is column major, the bias has to be per row, i.e. every row has a different bias.
// If the output is row major, the bias has to be per column, i.e. every column has a different bias.
// Some additional notes are listed below:
//
// Note this example only works for ColumnMajor output because
// 1) we only have a row major epilogue.
// 2) we swap A and B if the output is column major, so that we can still use the
// row major epilogue.
// 3) the Mx1 bias vector becomes 1xM after the swapping/transposing.
// 4) we can use the existing OutputIterator to load the 1xM bias vector.
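// In other words, D = A x B is equivalent to D^T = B^T x A^T: writing a ColumnMajor D is the same
// as writing a RowMajor D^T, so swapping the A and B operands lets the existing RowMajor epilogue
// be reused, and the Mx1 bias column becomes a 1xM bias row in that transposed view.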
using LayoutInputA = cutlass::layout::ColumnMajor;
using LayoutInputB = cutlass::layout::ColumnMajor;
using LayoutOutput = cutlass::layout::RowMajor;
using LayoutOutput = cutlass::layout::ColumnMajor;
// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = cutlass::arch::OpClassTensorOp;
@@ -73,17 +88,18 @@ using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSw
// Define the epilogue operation as LinearCombinationRelu. This is approximately equal to
//
// d_ij = max(0, alpha * sum_k(a_ik * b_kj) + beta * c_ij )
// d_ij = max(0, alpha * sum_k(a_ik * b_kj) + c_ij )
//
using EpilogueOp = cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput, // <- data type of output matrix
128 / cutlass::sizeof_bits<ElementOutput>::value, // <- this is the number of elements per
// vectorized memory access. For half
// precision, it's 8 elements. This becomes
// the vector width of math instructions in
// epilogue too
ElementAccumulator, // <- data type of accumulator
ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function
ElementOutput, // <- data type of output matrix
128 / cutlass::sizeof_bits<ElementOutput>::value, // <- this is the number of elements per
// vectorized memory access. For half
// precision, it's 8 elements. This becomes
// the vector width of math instructions in
// epilogue too
ElementAccumulator, // <- data type of accumulator
ElementComputeEpilogue, // <- data type for alpha in linear combination function
cutlass::epilogue::thread::ScaleType::NoBetaScaling>; // <- alpha x C + bias
// Number of pipelines you want to use
constexpr int NumStages = 2;
@@ -106,21 +122,6 @@ using Gemm = cutlass::gemm::device::Gemm<ElementInputA,
int run() {
cudaDeviceProp props;
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (error != cudaSuccess) {
std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
return -1;
}
if (!(props.major * 10 + props.minor >= 75)) {
std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75."
<< std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
const int length_m = 5120;
const int length_n = 4096;
const int length_k = 4096;
@@ -132,7 +133,7 @@ int run() {
cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(
problem_size.mk()); // <- Create matrix A with dimensions M x K
cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(
problem_size.nk()); // <- Create matrix B with dimensions N x K
problem_size.kn()); // <- Create matrix B with dimensions K x N
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c_bias(
{problem_size.m(), 1}); // <- Create matrix C with dimensions M x 1
@@ -175,9 +176,8 @@ int run() {
tensor_d.sync_device();
tensor_ref_d.sync_device();
// Initialize alpha and beta for dot product computation
// Initialize alpha for dot product computation
ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
ElementComputeEpilogue beta = ElementComputeEpilogue(0);
// Split K dimension into 1 partitions
int split_k_slices = 1;
@@ -193,7 +193,7 @@ int run() {
// to project away the N dimension by setting the stride to zero.
tensor_d.device_ref(), // <- reference to matrix D on device
{alpha, beta}, // <- tuple of alpha and beta
{alpha}, // <- alpha
split_k_slices}; // <- k-dimension split factor
// Using the arguments, query for extra workspace required for matrix multiplication computation
@@ -205,8 +205,12 @@ int run() {
// Instantiate CUTLASS kernel depending on templates
Gemm gemm_op;
// Check whether the problem size is supported or not
cutlass::Status status = gemm_op.can_implement(arguments);
CUTLASS_CHECK(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
cutlass::Status status = gemm_op.initialize(arguments, workspace.get());
status = gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(status);
// Launch initialized CUTLASS kernel
@@ -234,7 +238,6 @@ int run() {
tensor_a.device_ref(),
tensor_b.device_ref(),
0,
tensor_c_bias.device_ref(),
tensor_ref_d.device_ref());
// Wait for kernels to finish
@@ -249,7 +252,7 @@ int run() {
for (int j = 0; j < problem_size.n(); ++j) {
tensor_ref_d.at({i, j}) = std::max(
ElementOutput(0),
ElementOutput(tensor_ref_d.at({i, j}) + beta * tensor_c_bias.at({i, 0}))
ElementOutput(tensor_ref_d.at({i, j}) + tensor_c_bias.at({i, 0}))
);
}
}
@@ -266,17 +269,35 @@ int run() {
}
int main() {
bool notSupported = false;
// Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
//
// CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
notSupported = true;
}
cudaDeviceProp props;
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (error != cudaSuccess) {
std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
return -1;
}
if (!(props.major * 10 + props.minor >= 75)) {
std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75."
<< std::endl;
notSupported = true;
}
if (notSupported) {
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
else {
return run();
}
return run();
}


@@ -1,33 +0,0 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
13_fused_two_gemms
fused_gemm.cu
)
target_include_directories(
13_fused_two_gemms
PRIVATE
.
)


@@ -1,74 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/**
*/
#include "b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h"
#include "b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h"
int run() {
cudaDeviceProp props;
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (error != cudaSuccess) {
std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
return -1;
}
if (!(props.major * 10 + props.minor >= 75)) {
std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75."
<< std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
run_nonfused_gemm_f16();
run_fused_gemm_f16();
run_nonfused_gemm_s8();
run_fused_gemm_s8();
#endif
return 0;
}
int main() {
// Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
//
// CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples.
if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
else {
return run();
}
}


@@ -1,289 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
#include "cutlass/arch/arch.h"
#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
#include "threadblock/b2b_mma_pipelined.h"
////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace gemm {
namespace threadblock {
////////////////////////////////////////////////////////////////////////////////
template <
/// Element type for A matrix operand
typename ElementA_,
/// Layout type for A matrix operand
typename LayoutA_,
/// Access granularity of A matrix in units of elements
int kAlignmentA,
/// Element type for B matrix operand
typename ElementB_,
/// Layout type for B matrix operand
typename LayoutB_,
/// Access granularity of B matrix in units of elements
int kAlignmentB,
/// Element type for internal accumulation
typename ElementAccumulator_,
/// Layout type for C and D matrix operands
typename LayoutC_,
/// Operator class tag
typename OperatorClass_,
/// Tag indicating architecture to tune for
typename ArchTag_,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape0_,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape1_,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape0_,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape1_,
/// Instruction-level tile size (concept: GemmShape)
typename InstructionShape_,
/// Number of stages used in the pipelined mainloop
int Stages,
/// Operation performed by GEMM
typename Operator,
/// Epilogue output operator
typename EpilogueOutputOp,
/// Store the accumulators in row major or column major. Row major is used
/// when output layout is interleaved.
bool AccumulatorsInRowMajor = false>
struct DefaultB2bMma;
////////////////////////////////////////////////////////////////////////////////
/// Specialization for row-major output
template <
/// Element type for A matrix operand
typename ElementA,
/// Layout type for A matrix operand
typename LayoutA,
/// Access granularity of A matrix in units of elements
int kAlignmentA,
/// Element type for B matrix operand
typename ElementB,
/// Layout type for B matrix operand
typename LayoutB,
/// Access granularity of B matrix in units of elements
int kAlignmentB,
/// Element type for internal accumulation
typename ElementAccumulator,
/// Tag indicating architecture to tune for
typename OperatorClass,
/// Tag indicating architecture to tune for
typename ArchTag,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape0,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape1,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape0,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape1,
/// Instruction-level tile size (concept: GemmShape)
typename InstructionShape,
/// Operation performed by GEMM
typename Operator,
/// Epilogue output operator
typename EpilogueOutputOp>
struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
kAlignmentB, ElementAccumulator, layout::RowMajor,
OperatorClass, ArchTag,
ThreadblockShape0, ThreadblockShape1,
WarpShape0, WarpShape1,
InstructionShape, 2, Operator, EpilogueOutputOp, false> {
// Define the MmaCore components
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
OperatorClass, 2, Operator>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
OperatorClass, 2, Operator>;
// Define iterators over tiles from the A operand
using IteratorA0 =
cutlass::transform::threadblock::PredicatedTileIterator<
cutlass::MatrixShape<MmaCore0::Shape::kM, MmaCore0::Shape::kK>,
ElementA, LayoutA, 1, typename MmaCore0::IteratorThreadMapA, kAlignmentA>;
// Define iterators over tiles from the B operand
using IteratorB0 =
cutlass::transform::threadblock::PredicatedTileIterator<
cutlass::MatrixShape<MmaCore0::Shape::kK, MmaCore0::Shape::kN>,
ElementB, LayoutB, 0, typename MmaCore0::IteratorThreadMapB, kAlignmentB>;
// Use fragment iterator for A operand
using AccumulatorLayout = cutlass::layout::ColumnMajor;
using FragmentIteratorA1 =
cutlass::gemm::warp::MmaTensorOpFragmentIterator<
cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
MmaCore1::Shape::kK, //kBlocksColumn
ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp, true>;
// Define iterators over tiles from the B operand
using IteratorB1 =
cutlass::transform::threadblock::PredicatedTileIterator<
cutlass::MatrixShape<MmaCore1::Shape::kK, MmaCore1::Shape::kN>,
ElementB, LayoutB, 0, typename MmaCore1::IteratorThreadMapB>;
// Define the threadblock-scoped pipelined matrix multiply
using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaPipelined<
typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA,
IteratorB0, typename MmaCore0::SmemIteratorB,
typename MmaCore1::Shape, FragmentIteratorA1,
IteratorB1, typename MmaCore1::SmemIteratorB,
ElementAccumulator, layout::RowMajor,
EpilogueOutputOp,
typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy>;
};
////////////////////////////////////////////////////////////////////////////////
/// Specialization for column-major-interleaved output
template <
/// Element type for A matrix operand
typename ElementA,
/// Layout type for A matrix operand
typename LayoutA,
/// Access granularity of A matrix in units of elements
int kAlignmentA,
/// Element type for B matrix operand
typename ElementB,
/// Layout type for B matrix operand
typename LayoutB,
/// Access granularity of B matrix in units of elements
int kAlignmentB,
/// Element type for internal accumulation
typename ElementAccumulator,
/// Tag indicating architecture to tune for
typename OperatorClass,
/// Tag indicating architecture to tune for
typename ArchTag,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape0,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape1,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape0,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape1,
/// Instruction-level tile size (concept: GemmShape)
typename InstructionShape,
/// Operation performed by GEMM
typename Operator,
/// Epilogue output operator
typename EpilogueOutputOp,
/// Number of Interleaved K
int InterleavedK>
struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
kAlignmentB, ElementAccumulator,
layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, ArchTag,
ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
InstructionShape, 2, Operator, EpilogueOutputOp, true> {
// Define the MmaCore components
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementAccumulator,
layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, 2, Operator,
true>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementAccumulator,
layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, 2, Operator,
true>;
static_assert(kAlignmentA == 128 / sizeof_bits<ElementA>::value,
"Alignment must match thread data map's vector length");
static_assert(kAlignmentB ==128 / sizeof_bits<ElementB>::value,
"Alignment must match thread data map's vector length");
// Define iterators over tiles from the A operand
using IteratorA0 = cutlass::transform::threadblock::PredicatedTileIterator<
cutlass::MatrixShape<MmaCore0::Shape::kM, MmaCore0::Shape::kK>, ElementA,
LayoutA, 1, typename MmaCore0::IteratorThreadMapA>;
// Define iterators over tiles from the B operand
using IteratorB0 = cutlass::transform::threadblock::PredicatedTileIterator<
cutlass::MatrixShape<MmaCore0::Shape::kK, MmaCore0::Shape::kN>, ElementB,
LayoutB, 0, typename MmaCore0::IteratorThreadMapB>;
// Use fragment iterator for A operand
using AccumulatorLayout = cutlass::layout::RowMajor; //AccumulatorsInRowMajor = true
using FragmentIteratorA1 =
cutlass::gemm::warp::MmaTensorOpFragmentIterator<
cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
MmaCore1::Shape::kK, //kBlocksColumn
ElementAccumulator, ElementA, AccumulatorLayout,
InstructionShape, EpilogueOutputOp, true /*only handle beta=0 for 1st Gemm epilogue*/>;
// Define iterators over tiles from the B operand
using IteratorB1 =
cutlass::transform::threadblock::PredicatedTileIterator<
cutlass::MatrixShape<MmaCore1::Shape::kK, MmaCore1::Shape::kN>,
ElementB, LayoutB, 0, typename MmaCore1::IteratorThreadMapB>;
// Define the threadblock-scoped pipelined matrix multiply
using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaPipelined<
typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA,
IteratorB0, typename MmaCore0::SmemIteratorB,
typename MmaCore1::Shape, FragmentIteratorA1,
IteratorB1, typename MmaCore1::SmemIteratorB,
ElementAccumulator, layout::ColumnMajorInterleaved<InterleavedK>,
EpilogueOutputOp,
typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy>;
};
////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace gemm
} // namespace cutlass
////////////////////////////////////////////////////////////////////////////////


@@ -0,0 +1,82 @@
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include_directories(
.
)
add_custom_target(13_fused_two_gemms)
add_custom_target(13_fused_two_convs)
add_custom_target(13_two_tensor_op_fusion
DEPENDS 13_fused_two_gemms
13_fused_two_convs
)
foreach(FUSION_CONV_EXAMPLE
fused_two_convs_f16_sm75_rf
fused_two_convs_f16_sm75_shmem
fused_two_convs_f16_sm80_rf
fused_two_convs_f16_sm80_shmem
fused_two_convs_s8_sm75_rf
fused_two_convs_s8_sm75_shmem
fused_two_convs_s8_sm80_rf
fused_two_convs_s8_sm80_shmem
)
cutlass_example_add_executable(
13_${FUSION_CONV_EXAMPLE}
${FUSION_CONV_EXAMPLE}.cu
)
add_dependencies(13_fused_two_convs 13_${FUSION_CONV_EXAMPLE})
endforeach()
foreach(FUSION_GEMM_EXAMPLE
fused_two_gemms_f16_sm75_rf
fused_two_gemms_f16_sm75_shmem
fused_two_gemms_f16_sm80_rf
fused_two_gemms_f16_sm80_shmem
fused_two_gemms_s8_sm75_rf
fused_two_gemms_s8_sm75_shmem
fused_two_gemms_s8_sm80_rf
fused_two_gemms_s8_sm80_shmem
)
cutlass_example_add_executable(
13_${FUSION_GEMM_EXAMPLE}
${FUSION_GEMM_EXAMPLE}.cu
)
add_dependencies(13_fused_two_gemms 13_${FUSION_GEMM_EXAMPLE})
endforeach()


@@ -0,0 +1,118 @@
# Introduction
This example shows fusing two back-to-back GEMMs/Convolutions into one kernel.
<p align="center"><img src=/media/images/13_example_fusion.png></p>
When running two unfused GEMM/Conv operations, each operation loads one input
activation matrix and one weight matrix (or filter matrix) from memory, and then
stores the result activation matrix back to memory.
When the two GEMM/Conv operations are fused together, the mainloops of the two
GEMMs/Convs run back to back in a single kernel. The output accumulator of the
1st GEMM/Conv will be stored in the register file and reused as the activation
input of the 2nd GEMM/Conv. This saves a round trip to memory for the activation
matrix.
This example computes the following:
- 1st GEMM/Conv: D0 = relu(alpha0 .\* A0 \*\* B0)
- 2nd GEMM/Conv: D1 = relu(alpha1 .\* D0 \*\* B1 + beta1 .\* C1)
In the above equations, the operator \*\* can be either a matrix multiplication or a convolution operation.
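For reference, below is a minimal host-side C++ sketch of the math of these two operations for the GEMM case. It is only an illustration of the formulas above, not the CUTLASS kernels: the function names (`gemm_ref`, `b2b_gemm_ref`), the row-major layout, and the use of plain single-precision loops are assumptions made for clarity.

```
#include <algorithm>
#include <vector>

// Naive row-major GEMM reference: C = alpha * (A x B) + beta * C, with A MxK, B KxN, C MxN.
void gemm_ref(int M, int N, int K, float alpha,
              const std::vector<float>& A, const std::vector<float>& B,
              float beta, std::vector<float>& C) {
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      float acc = 0.0f;
      for (int k = 0; k < K; ++k) {
        acc += A[i * K + k] * B[k * N + j];
      }
      C[i * N + j] = alpha * acc + beta * C[i * N + j];
    }
  }
}

// Back-to-back reference: D0 = relu(alpha0 * A0 x B0), D1 = relu(alpha1 * D0 x B1 + beta1 * C1).
// On return, C1 holds D1. Note that the K dimension of the 2nd GEMM equals N0.
void b2b_gemm_ref(int M, int N0, int K0, int N1,
                  const std::vector<float>& A0, const std::vector<float>& B0,
                  const std::vector<float>& B1, std::vector<float>& C1,
                  float alpha0, float alpha1, float beta1) {
  std::vector<float> D0(M * N0, 0.0f);
  gemm_ref(M, N0, K0, alpha0, A0, B0, 0.0f, D0);
  for (float& x : D0) x = std::max(x, 0.0f);       // ReLU after the 1st GEMM

  gemm_ref(M, N1, N0, alpha1, D0, B1, beta1, C1);  // 2nd GEMM consumes D0 as its activation
  for (float& x : C1) x = std::max(x, 0.0f);       // ReLU after the 2nd GEMM
}
```

In the fused kernel the intermediate `D0` never makes the round trip through global memory; in this host sketch it is simply a temporary buffer.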
# Implementation Details
In order to run two GEMM/Convs in a single kernel, the example requires that the same number of
threadblocks be used across the two GEMMs/Convs. This also ensures the same threadblock tile M across
the two GEMMs/Convs.
In order to reuse the output accumulator (stored in register-file) of the 1st GEMM as the
input activation, the example enforces the following two constraints:
- thread_block_tile_N = problem_N
<p align="center"><img src=/media/images/13_example_block_resident_fusion.png></p>
This constraint ensures that each threadblock loads the entire weight/filter matrix in
addition to its own input activation tile. Therefore the input activation tile of the
2nd GEMM/Conv only depends on the output activation tile of the 1st GEMM/Conv, and the
operation can be fully block-resident.
- warp_tile_N = thread_block_tile_N
<p align="center"><img src=/media/images/13_example_rf_resident_fusion.png></p>
This constraint ensures that each warp loads the entire weight/filter kBlock in
addition to its own input activation tile. Therefore the input activation warp tile of the
2nd GEMM/Conv only depends on the output warp accumulator of the 1st GEMM/Conv in the
register file, and the operation can be fully register-file-resident.
On the other hand, this constraint can be relaxed if the output accumulator of the 1st GEMM/Conv
is staged in shared memory and then used as the input of the 2nd GEMM/Conv. In this case, each
input warp tile can be loaded from shared memory and does not need to be RF-resident, so each warp
does not need to store the entire input matrix of the 2nd GEMM in its register file. This is
illustrated in the diagram below.
<p align="center"><img src=/media/images/13_example_shmem_resident_fusion.png></p>
When applying the above constraint to convolutions, the 2nd Convolution kernel must not have halos,
so that the data used by each threadblock does not depend on any other threadblock. Typically this
requires that the 2nd Convolution use a 1x1 filter without any padding. A sketch of tile shapes
satisfying these constraints is shown below.
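As an illustration, here is a hypothetical set of CUTLASS tile shapes that satisfies both constraints for the RF-resident GEMM fusion. The concrete problem sizes and shape values are assumptions chosen only to make the constraints visible, not necessarily the shapes used by the example binaries.

```
#include "cutlass/gemm/gemm.h"

// Assume GEMM0 has problem size (M x 64 x 576) and GEMM1 has problem size (M x 128 x 64);
// the K dimension of GEMM1 equals the N dimension of GEMM0.
//
// Constraint 1 (block-resident): ThreadblockShapeX::kN == problem N of GEMM X.
// Constraint 2 (RF-resident):    WarpShapeX::kN        == ThreadblockShapeX::kN.
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;   // kN = 64  == N of GEMM0
using WarpShape0        = cutlass::gemm::GemmShape<32, 64, 32>;   // warp covers the full N of GEMM0
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;  // kN = 128 == N of GEMM1
using WarpShape1        = cutlass::gemm::GemmShape<32, 128, 32>;  // warp covers the full N of GEMM1
```

With these shapes, each threadblock owns a 64-row slice of the output in both GEMMs, and the full N extent of each weight matrix is resident in the threadblock (constraint 1) and in each warp's register file (constraint 2).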
# Build and run
- Run cmake at top-level CUTLASS
- `make 13_two_tensor_op_fusion`
- Run individual benchmarks
- `./examples/13_two_tensor_op_fusion/13_fused_two_convs_f16_sm75_rf`
- `./examples/13_two_tensor_op_fusion/13_fused_two_convs_f16_sm75_shmem`
- `./examples/13_two_tensor_op_fusion/13_fused_two_convs_f16_sm80_rf`
- `./examples/13_two_tensor_op_fusion/13_fused_two_convs_f16_sm80_shmem`
- `./examples/13_two_tensor_op_fusion/13_fused_two_convs_s8_sm75_rf`
- `./examples/13_two_tensor_op_fusion/13_fused_two_convs_s8_sm75_shmem`
- `./examples/13_two_tensor_op_fusion/13_fused_two_convs_s8_sm80_rf`
- `./examples/13_two_tensor_op_fusion/13_fused_two_convs_s8_sm80_shmem`
- `./examples/13_two_tensor_op_fusion/13_fused_two_gemms_f16_sm75_rf`
- `./examples/13_two_tensor_op_fusion/13_fused_two_gemms_f16_sm75_shmem`
- `./examples/13_two_tensor_op_fusion/13_fused_two_gemms_f16_sm80_rf`
- `./examples/13_two_tensor_op_fusion/13_fused_two_gemms_f16_sm80_shmem`
- `./examples/13_two_tensor_op_fusion/13_fused_two_gemms_s8_sm75_rf`
- `./examples/13_two_tensor_op_fusion/13_fused_two_gemms_s8_sm75_shmem`
- `./examples/13_two_tensor_op_fusion/13_fused_two_gemms_s8_sm80_rf`
- `./examples/13_two_tensor_op_fusion/13_fused_two_gemms_s8_sm80_shmem`
# Copyright
Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
```
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
```


@@ -0,0 +1,719 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#include <iostream>
#include <fstream>
#include <sstream>
#include "cutlass/cutlass.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "cutlass/reduction/device/reduce_split_k.h"
#include "cutlass/reduction/thread/reduction_operators.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/device/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/reference/host/convolution.h"
#include "cutlass/util/reference/device/convolution.h"
#include "cutlass/util/reference/device/tensor_relu.h"
#include "cutlass/core_io.h"
#include "cutlass/util/tensor_view_io.h"
#include "reference/device/tensor_scale_bias.h"
#include "helper.h"
#define CHECK_GT(val1, val2) \
if((val1) <= (val2)) \
std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_GT failed\n";
#define CHECK_TRUE(val) \
if(!(val)) \
std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_TRUE failed\n";
template <typename Conv2d0_, typename Conv2d1_>
class B2bNonFusedConv2dRun {
public:
using Conv2d0 = Conv2d0_;
using Conv2d1 = Conv2d1_;
using ElementAccumulator = typename Conv2d0::ElementAccumulator;
using ElementCompute = typename Conv2d0::ElementCompute;
static cutlass::conv::Operator const kConvolutionalOperator = Conv2d0::kConvolutionalOperator;
static_assert(kConvolutionalOperator == Conv2d1::kConvolutionalOperator,
"Fused convolution operators must be the same");
public:
/// Initialization
cutlass::Distribution::Kind init_A;
cutlass::Distribution::Kind init_B;
cutlass::Distribution::Kind init_C;
cutlass::Distribution::Kind init_Bias;
uint64_t seed;
cutlass::HostTensor<typename Conv2d0::ElementA, typename Conv2d0::LayoutA> tensor_A0;
cutlass::HostTensor<typename Conv2d0::ElementB, typename Conv2d0::LayoutB> tensor_B0;
cutlass::HostTensor<typename Conv2d0::ElementC, typename Conv2d0::LayoutC> tensor_C0;
cutlass::HostTensor<typename Conv2d0::ElementCompute, typename Conv2d0::LayoutC> tensor_Bias0;
cutlass::HostTensor<typename Conv2d0::ElementC, typename Conv2d0::LayoutC> tensor_D0_computed;
cutlass::HostTensor<typename Conv2d0::ElementC, typename Conv2d0::LayoutC> tensor_D0_reference;
cutlass::HostTensor<typename Conv2d1::ElementB, typename Conv2d1::LayoutB> tensor_B1;
cutlass::HostTensor<typename Conv2d1::ElementC, typename Conv2d1::LayoutC> tensor_C1;
cutlass::HostTensor<typename Conv2d1::ElementCompute, typename Conv2d0::LayoutC> tensor_Bias1;
cutlass::HostTensor<typename Conv2d1::ElementC, typename Conv2d1::LayoutC> tensor_D1_computed;
cutlass::HostTensor<typename Conv2d1::ElementC, typename Conv2d1::LayoutC> tensor_D1_reference;
public:
B2bNonFusedConv2dRun(
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_Bias_ = cutlass::Distribution::Uniform,
uint64_t seed_ = 2080
):
init_A(init_A_), init_B(init_B_), init_C(init_C_), init_Bias(init_Bias_), seed(seed_) {
}
/// Helper to initialize a tensor view
template <typename Element, typename Layout>
void initialize_tensor(
cutlass::TensorView<Element, Layout> view,
cutlass::Distribution::Kind dist_kind,
uint64_t seed) {
if (dist_kind == cutlass::Distribution::Uniform) {
int scope;
int bits = cutlass::sizeof_bits<Element>::value;
if (bits <= 16) {
scope = 2;
}
else {
scope = 8;
}
cutlass::reference::host::TensorFillRandomUniform(
view, seed, scope, -scope, 0);
}
else if (dist_kind == cutlass::Distribution::Identity) {
cutlass::reference::host::TensorFillIdentity(view);
}
else if (dist_kind == cutlass::Distribution::Gaussian) {
cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
}
else if (dist_kind == cutlass::Distribution::Sequential) {
cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
}
else if (dist_kind == cutlass::Distribution::AllZeros) {
cutlass::reference::host::TensorFill(view, Element(0));
}
else if (dist_kind == cutlass::Distribution::AllOnes) {
cutlass::reference::host::TensorFill(view, Element(1));
}
else {
std::cerr << "Not implemented\n";
}
}
void initialize(
cutlass::conv::Conv2dProblemSize const &problem_size_0,
cutlass::conv::Conv2dProblemSize const &problem_size_1,
uint64_t seed = 2019) {
tensor_A0.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size_0));
tensor_B0.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0));
tensor_C0.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_Bias0.resize({1, 1, 1, problem_size_0.K});
tensor_D0_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_D0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_B1.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1));
tensor_C1.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
tensor_Bias1.resize({1, 1, 1, problem_size_1.K});
tensor_D1_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
tensor_D1_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
initialize_tensor(tensor_A0.host_view(), init_A, seed);
initialize_tensor(tensor_B0.host_view(), init_B, seed * 17);
initialize_tensor(tensor_C0.host_view(), init_C, seed * 39);
initialize_tensor(tensor_Bias0.host_view(), init_Bias, seed * 83);
initialize_tensor(tensor_B1.host_view(), init_B, seed * 18);
initialize_tensor(tensor_C1.host_view(), init_C, seed * 40);
initialize_tensor(tensor_Bias1.host_view(), init_Bias, seed * 84);
tensor_A0.sync_device();
tensor_B0.sync_device();
tensor_C0.sync_device();
tensor_Bias0.sync_device();
tensor_D0_computed.sync_device();
tensor_D0_reference.sync_device();
tensor_B1.sync_device();
tensor_C1.sync_device();
tensor_Bias1.sync_device();
tensor_D1_computed.sync_device();
tensor_D1_reference.sync_device();
}
/// Executes one test
bool run(
cutlass::conv::Conv2dProblemSize const &problem_size_0,
cutlass::conv::Conv2dProblemSize const &problem_size_1,
cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial,
ElementCompute alpha0 = ElementCompute(1),
ElementCompute beta0 = ElementCompute(0),
ElementCompute alpha1 = ElementCompute(1),
ElementCompute beta1 = ElementCompute(0),
bool relu = true,
int warm_ups = 1,
int runs = 100) {
initialize(problem_size_0, problem_size_1);
// configure the operator
Conv2d0 conv2d_op_0;
Conv2d1 conv2d_op_1;
typename Conv2d0::Arguments conv2d_args_0(
problem_size_0,
tensor_A0.device_ref(),
tensor_B0.device_ref(),
{tensor_Bias0.device_data(), typename Conv2d0::LayoutC::Stride(0)},
tensor_D0_computed.device_ref(),
{alpha0, beta0},
split_k_mode
);
typename Conv2d1::Arguments conv2d_args_1(
problem_size_1,
tensor_D0_computed.device_ref(),
tensor_B1.device_ref(),
{tensor_Bias1.device_data(), typename Conv2d1::LayoutC::Stride(0)},
tensor_D1_computed.device_ref(),
{alpha1, beta1},
split_k_mode
);
cutlass::Status status = conv2d_op_0.initialize(conv2d_args_0);
CUTLASS_CHECK(status);
status = conv2d_op_1.initialize(conv2d_args_1);
CUTLASS_CHECK(status);
for(int i = 0; i < warm_ups; i++) {
status = conv2d_op_0();
CUTLASS_CHECK(status);
status = conv2d_op_1();
CUTLASS_CHECK(status);
}
//
// Run Conv2d
//
cudaEvent_t start, stop1, stop2;
cudaEventCreate(&start);
cudaEventCreate(&stop1);
cudaEventCreate(&stop2);
cudaEventRecord(start);
for(int i = 0; i < runs; i++) {
// run conv2d operator
status = conv2d_op_0();
CUTLASS_CHECK(status);
}
cudaEventRecord(stop1);
for(int i = 0; i < runs; i++) {
// run conv2d operator
status = conv2d_op_1();
CUTLASS_CHECK(status);
}
cudaEventRecord(stop2);
cudaDeviceSynchronize();
float conv2d0Time, conv2d1Time, totalTime;
cudaEventElapsedTime(&conv2d0Time, start, stop1);
cudaEventElapsedTime(&conv2d1Time, stop1, stop2);
cudaEventElapsedTime(&totalTime, start, stop2);
std::cout << "conv2d 0 time " << conv2d0Time / (float)runs << " ms\n";
std::cout << "conv2d 1 time " << conv2d1Time / (float)runs << " ms\n";
std::cout << "Non-fusion time " << totalTime / (float)runs << " ms\n";
tensor_D0_computed.sync_host();
tensor_D1_computed.sync_host();
bool passed = false;
cutlass::reference::device::Conv2d<
typename Conv2d0::ElementA,
typename Conv2d0::LayoutA,
typename Conv2d0::ElementB,
typename Conv2d0::LayoutB,
typename Conv2d0::ElementC,
typename Conv2d0::LayoutC,
ElementCompute,
ElementAccumulator
>(
kConvolutionalOperator,
problem_size_0,
tensor_A0.device_ref(),
tensor_B0.device_ref(),
{tensor_Bias0.device_data(), typename Conv2d0::LayoutC::Stride(0)},
tensor_D0_reference.device_ref(),
alpha0,
beta0);
if(relu) {
cutlass::reference::device::TensorReLu(tensor_D0_reference.device_view());
}
cutlass::reference::device::Conv2d<
typename Conv2d1::ElementA,
typename Conv2d1::LayoutA,
typename Conv2d1::ElementB,
typename Conv2d1::LayoutB,
typename Conv2d1::ElementC,
typename Conv2d1::LayoutC,
ElementCompute,
ElementAccumulator
>(
kConvolutionalOperator,
problem_size_1,
tensor_D0_reference.device_ref(),
tensor_B1.device_ref(),
{tensor_Bias1.device_data(), typename Conv2d1::LayoutC::Stride(0)},
tensor_D1_reference.device_ref(),
alpha1,
beta1);
if(relu) {
cutlass::reference::device::TensorReLu(tensor_D1_reference.device_view());
}
cudaError_t result = cudaDeviceSynchronize();
CHECK_TRUE(result == cudaSuccess);
// sync host (copy device data to host) for dumping error output in case of mismatches
tensor_D0_reference.sync_host();
tensor_D1_reference.sync_host();
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_computed.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_reference.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_computed.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_reference.host_view()), 0);
passed = cutlass::reference::host::TensorEquals(
tensor_D1_computed.host_view(),
tensor_D1_reference.host_view());
CHECK_TRUE(passed);
if (!passed) {
std::stringstream fname;
fname << "error_B2bImplicitGemm_device_nonfused.txt";
std::cerr << "Dumping results in " << fname.str() << "\n";
std::ofstream results(fname.str());
results << problem_size_0 << std::endl;
results << problem_size_1 << std::endl;
results
<< "\nA0:\n" << tensor_A0.host_view() << "\n"
<< "\nB0:\n" << tensor_B0.host_view() << "\n"
<< "\nC0:\n" << tensor_C0.host_view() << "\n"
<< "\nBias0:\n" << tensor_Bias0.host_view() << "\n"
<< "\nD0 reference:\n" << tensor_D0_reference.host_view() << "\n"
<< "\nD0 computed:\n" << tensor_D0_computed.host_view() << "\n"
<< "\nB1:\n" << tensor_B1.host_view() << "\n"
<< "\nC1:\n" << tensor_C1.host_view() << "\n"
<< "\nBias1:\n" << tensor_Bias1.host_view() << "\n"
<< "\nD1 reference:\n" << tensor_D1_reference.host_view() << "\n"
<< "\nD1 computed:\n" << tensor_D1_computed.host_view();
}
return passed;
}
};
template <typename B2bConv2d_>
class B2bFusedConv2dRun {
public:
using B2bConv2d = B2bConv2d_;
using ElementAccumulator = typename B2bConv2d::ElementAccumulator;
using ElementCompute = typename B2bConv2d::ElementCompute;
static cutlass::conv::Operator const kConvolutionalOperator = B2bConv2d::kConvolutionalOperator;
public:
/// Initialization
cutlass::Distribution::Kind init_A;
cutlass::Distribution::Kind init_B;
cutlass::Distribution::Kind init_C;
cutlass::Distribution::Kind init_Scale;
cutlass::Distribution::Kind init_Bias;
uint64_t seed;
cutlass::HostTensor<typename B2bConv2d::ElementA, typename B2bConv2d::LayoutA> tensor_A0;
cutlass::HostTensor<typename B2bConv2d::ElementB, typename B2bConv2d::LayoutB> tensor_B0;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_C0;
cutlass::HostTensor<typename B2bConv2d::ElementScaleBias, typename B2bConv2d::LayoutScaleBias> tensor_Scale0;
cutlass::HostTensor<typename B2bConv2d::ElementScaleBias, typename B2bConv2d::LayoutScaleBias> tensor_Bias0;
cutlass::HostTensor<ElementAccumulator, typename B2bConv2d::LayoutC> tensor_Z0_reference;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_D0_reference;
cutlass::HostTensor<typename B2bConv2d::ElementB, typename B2bConv2d::LayoutB> tensor_B1;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_C1;
cutlass::HostTensor<typename B2bConv2d::ElementCompute, typename B2bConv2d::LayoutC> tensor_Bias1;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_D1_computed;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_D1_reference;
public:
B2bFusedConv2dRun(
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_Scale_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_Bias_ = cutlass::Distribution::Uniform,
uint64_t seed_ = 2080
):
init_A(init_A_), init_B(init_B_), init_C(init_C_),
init_Scale(init_Scale_), init_Bias(init_Bias_), seed(seed_) {
}
/// Helper to initialize a tensor view
template <typename Element, typename Layout>
void initialize_tensor(
cutlass::TensorView<Element, Layout> view,
cutlass::Distribution::Kind dist_kind,
uint64_t seed) {
if (dist_kind == cutlass::Distribution::Uniform) {
int scope;
int bits = cutlass::sizeof_bits<Element>::value;
if (bits <= 16) {
scope = 2;
}
else {
scope = 8;
}
cutlass::reference::host::TensorFillRandomUniform(
view, seed, scope, -scope, 0);
}
else if (dist_kind == cutlass::Distribution::Identity) {
cutlass::reference::host::TensorFillIdentity(view);
}
else if (dist_kind == cutlass::Distribution::Gaussian) {
cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
}
else if (dist_kind == cutlass::Distribution::Sequential) {
cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
}
else if (dist_kind == cutlass::Distribution::AllZeros) {
cutlass::reference::host::TensorFill(view, Element(0));
}
else if (dist_kind == cutlass::Distribution::AllOnes) {
cutlass::reference::host::TensorFill(view, Element(1));
}
else {
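// No initialization is performed for any other distribution kind.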
}
}
void initialize(
cutlass::conv::Conv2dProblemSize const &problem_size_0,
cutlass::conv::Conv2dProblemSize const &problem_size_1,
ElementCompute alpha0,
ElementCompute alpha1,
uint64_t seed = 2019) {
tensor_A0.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size_0));
tensor_B0.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0));
tensor_C0.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
if(alpha0 == ElementCompute(0)) //per-channel scale
tensor_Scale0.resize({1, problem_size_0.K});
tensor_Bias0.resize({1, problem_size_0.K});
tensor_Z0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_D0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_B1.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1));
tensor_C1.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
tensor_Bias1.resize({1, 1, 1, problem_size_1.K});
tensor_D1_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
tensor_D1_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
initialize_tensor(tensor_A0.host_view(), init_A, seed);
initialize_tensor(tensor_B0.host_view(), init_B, seed * 17);
initialize_tensor(tensor_C0.host_view(), init_C, seed * 39);
if(alpha0 == ElementCompute(0)) //per-channel scale
initialize_tensor(tensor_Scale0.host_view(), init_Scale, seed * 61);
initialize_tensor(tensor_Bias0.host_view(), init_Bias, seed * 83);
initialize_tensor(tensor_B1.host_view(), init_B, seed * 18);
initialize_tensor(tensor_C1.host_view(), init_C, seed * 40);
initialize_tensor(tensor_Bias1.host_view(), init_Bias, seed * 84);
tensor_A0.sync_device();
tensor_B0.sync_device();
tensor_C0.sync_device();
if(alpha0 == ElementCompute(0)) //per-channel scale
tensor_Scale0.sync_device();
tensor_Bias0.sync_device();
tensor_D0_reference.sync_device();
tensor_B1.sync_device();
tensor_C1.sync_device();
tensor_Bias1.sync_device();
tensor_D1_computed.sync_device();
tensor_D1_reference.sync_device();
}
/// Executes one test
bool run(
cutlass::conv::Conv2dProblemSize const &problem_size_0,
cutlass::conv::Conv2dProblemSize const &problem_size_1,
cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial,
ElementCompute alpha0 = ElementCompute(1),
ElementCompute beta0 = ElementCompute(0),
ElementCompute alpha1 = ElementCompute(1),
ElementCompute beta1 = ElementCompute(0),
bool relu = true,
int warm_ups = 1,
int runs = 100) {
initialize(problem_size_0, problem_size_1, alpha0, alpha1);
// configure the operator
B2bConv2d b2b_conv2d_op;
typename B2bConv2d::Arguments b2b_conv2d_args(
problem_size_0,
problem_size_1,
tensor_A0.device_ref(),
tensor_B0.device_ref(),
tensor_C0.device_ref(),
tensor_Scale0.device_ref(),
tensor_Bias0.device_ref(),
tensor_B1.device_ref(),
{tensor_Bias1.device_data(), typename B2bConv2d::LayoutC::Stride(0)},
tensor_D1_computed.device_ref(),
{alpha0, beta0},
{alpha1, beta1},
split_k_mode
);
cutlass::Status status = b2b_conv2d_op.can_implement(b2b_conv2d_args);
if(status != cutlass::Status::kSuccess) {
std::cout << "Problem sizes not supported.\n"
<< "Requirments:\n"
<< " problem_size_0.N*P*Q = problem_size_1.N*P*Q\n"
<< " problem_size_0.K = problem_size_1.C\n"
<< " problem_size_1.R = problem_size_1.S = 1\n"
<< " ThreadblockShape0::kN = problem_size_0.K\n"
<< " ThreadblockShape1::kN = problem_size_1.K" << std::endl;
}
CUTLASS_CHECK(status);
status = b2b_conv2d_op.initialize(b2b_conv2d_args);
CUTLASS_CHECK(status);
for(int i = 0; i < warm_ups; i++) {
status = b2b_conv2d_op();
CUTLASS_CHECK(status);
}
//
// Run the Conv2d
//
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
for(int i = 0; i < runs; i++) {
// run conv2d operator
status = b2b_conv2d_op();
CUTLASS_CHECK(status);
}
cudaEventRecord(stop);
cudaDeviceSynchronize();
float conv2dTime;
cudaEventElapsedTime(&conv2dTime, start, stop);
std::cout << "Fusion time " << conv2dTime / (float)runs << " ms\n";
tensor_D1_computed.sync_host();
bool passed = false;
cutlass::reference::device::Conv2d<
typename B2bConv2d::ElementA,
typename B2bConv2d::LayoutA,
typename B2bConv2d::ElementB,
typename B2bConv2d::LayoutB,
ElementAccumulator,
typename B2bConv2d::LayoutC,
ElementAccumulator,
ElementAccumulator
>(
kConvolutionalOperator,
problem_size_0,
tensor_A0.device_ref(),
tensor_B0.device_ref(),
tensor_Z0_reference.device_ref(),
tensor_Z0_reference.device_ref(),
ElementAccumulator(1), // intermediate alpha = 1
ElementAccumulator(0) // beta = 0
);
cutlass::reference::device::TensorScaleBiasConv2d<
ElementAccumulator,
typename B2bConv2d::ElementC,
typename B2bConv2d::LayoutC,
ElementCompute,
typename B2bConv2d::LayoutScaleBias
>(
problem_size_0,
tensor_Z0_reference.device_ref(),
tensor_D0_reference.device_ref(),
alpha0,
tensor_Scale0.device_ref(),
tensor_Bias0.device_ref()
);
if(relu) {
cutlass::reference::device::TensorReLu(tensor_D0_reference.device_view());
}
cutlass::reference::device::Conv2d<
typename B2bConv2d::ElementA,
typename B2bConv2d::LayoutA,
typename B2bConv2d::ElementB,
typename B2bConv2d::LayoutB,
typename B2bConv2d::ElementC,
typename B2bConv2d::LayoutC,
ElementCompute,
ElementAccumulator
>(
kConvolutionalOperator,
problem_size_1,
tensor_D0_reference.device_ref(),
tensor_B1.device_ref(),
{tensor_Bias1.device_data(), typename B2bConv2d::LayoutC::Stride(0)},
tensor_D1_reference.device_ref(),
alpha1,
beta1);
if(relu) {
cutlass::reference::device::TensorReLu(tensor_D1_reference.device_view());
}
cudaError_t result = cudaDeviceSynchronize();
CHECK_TRUE(result == cudaSuccess);
// sync host (copy device data to host) for dumping error output in case of mismatches
tensor_D0_reference.sync_host();
tensor_D1_reference.sync_host();
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_reference.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_computed.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_reference.host_view()), 0);
passed = cutlass::reference::host::TensorEquals(
tensor_D1_computed.host_view(),
tensor_D1_reference.host_view());
CHECK_TRUE(passed);
if (!passed) {
std::stringstream fname;
fname << "error_B2bImplicitGemm_device_fused.txt";
std::cerr << "Dumping results in " << fname.str() << "\n";
std::ofstream results(fname.str());
results << problem_size_0 << std::endl;
results << problem_size_1 << std::endl;
results
<< "\nA0:\n" << tensor_A0.host_view() << "\n"
<< "\nB0:\n" << tensor_B0.host_view() << "\n"
<< "\nC0:\n" << tensor_C0.host_view() << "\n"
<< "\nScale0:\n" << tensor_Scale0.host_view() << "\n"
<< "\nBias0:\n" << tensor_Bias0.host_view() << "\n"
<< "\nB1:\n" << tensor_B1.host_view() << "\n"
<< "\nC1:\n" << tensor_C1.host_view() << "\n"
<< "\nBias1:\n" << tensor_Bias1.host_view() << "\n"
<< "\nD1 reference:\n" << tensor_D1_reference.host_view() << "\n"
<< "\nD1 computed:\n" << tensor_D1_computed.host_view();
}
return passed;
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////


@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
@ -38,6 +44,7 @@
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/device/tensor_relu.h"
#include "reference/device/tensor_scale_bias.h"
#include "helper.h"
#define CHECK_GT(val1, val2) \
@ -62,6 +69,7 @@ struct B2bNonFusedGemmRun
cutlass::Distribution::Kind init_A;
cutlass::Distribution::Kind init_B;
cutlass::Distribution::Kind init_C;
cutlass::Distribution::Kind init_Bias;
uint64_t seed;
//
@ -72,9 +80,10 @@ struct B2bNonFusedGemmRun
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_Bias_ = cutlass::Distribution::Uniform,
uint64_t seed_ = 2080
):
init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { }
init_A(init_A_), init_B(init_B_), init_C(init_C_), init_Bias(init_Bias_), seed(seed_) { }
/// Helper to initialize a tensor view
template <typename Element, typename Layout>
@ -91,7 +100,7 @@ struct B2bNonFusedGemmRun
else if (dist_kind == cutlass::Distribution::Identity) {
cutlass::reference::host::TensorFillIdentity(view);
}
}
else if (dist_kind == cutlass::Distribution::Gaussian) {
cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
@ -100,9 +109,14 @@ struct B2bNonFusedGemmRun
cutlass::reference::host::BlockFillSequential(
view.data(), view.capacity());
}
}
else if (dist_kind == cutlass::Distribution::AllZeros) {
cutlass::reference::host::TensorFill(view, Element(0));
}
else if (dist_kind == cutlass::Distribution::AllOnes) {
cutlass::reference::host::TensorFill(view, Element(1));
}
else {
// TODO: Implement the rest
std::cerr << "Not implemented\n";
return false;
}
@ -121,7 +135,9 @@ struct B2bNonFusedGemmRun
ElementCompute beta0 = ElementCompute(0),
ElementCompute alpha1 = ElementCompute(1),
ElementCompute beta1 = ElementCompute(0),
bool relu = true) {
bool relu = true,
int warm_ups = 1,
int runs = 100) {
//
// Allocate the GEMM workspace
@ -139,6 +155,10 @@ struct B2bNonFusedGemmRun
typename Gemm0::ElementC,
typename Gemm0::LayoutC> tensor_C0(problem_size_0.mn());
cutlass::HostTensor<
ElementCompute,
typename Gemm0::LayoutC> tensor_Bias0({1, problem_size_0.n()});
cutlass::HostTensor<
typename Gemm0::ElementC,
typename Gemm0::LayoutC> tensor_D0(problem_size_0.mn());
@ -155,6 +175,10 @@ struct B2bNonFusedGemmRun
typename Gemm1::ElementC,
typename Gemm1::LayoutC> tensor_C1(problem_size_1.mn());
cutlass::HostTensor<
ElementCompute,
typename Gemm1::LayoutC> tensor_Bias1({1, problem_size_1.n()});
cutlass::HostTensor<
typename Gemm1::ElementC,
typename Gemm1::LayoutC> tensor_D1(problem_size_1.mn());
@ -167,8 +191,10 @@ struct B2bNonFusedGemmRun
CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019));
CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018));
CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017));
CHECK_TRUE(initialize_tensor(tensor_Bias0.host_view(), init_Bias, seed + 2014));
CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2016));
CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015));
CHECK_TRUE(initialize_tensor(tensor_Bias1.host_view(), init_Bias, seed + 2013));
cutlass::reference::host::TensorFill(
tensor_D0.host_view());
@ -182,9 +208,11 @@ struct B2bNonFusedGemmRun
tensor_A0.sync_device();
tensor_B0.sync_device();
tensor_C0.sync_device();
tensor_Bias0.sync_device();
tensor_D0.sync_device();
tensor_B1.sync_device();
tensor_C1.sync_device();
tensor_Bias1.sync_device();
tensor_D1.sync_device();
reference_D0.sync_device();
reference_D1.sync_device();
@ -197,7 +225,7 @@ struct B2bNonFusedGemmRun
problem_size_0,
tensor_A0.device_ref(),
tensor_B0.device_ref(),
tensor_C0.device_ref(),
{tensor_Bias0.device_data(), typename Gemm0::LayoutC::Stride(0)},
tensor_D0.device_ref(),
{alpha0, beta0}
};
@ -206,7 +234,7 @@ struct B2bNonFusedGemmRun
problem_size_1,
tensor_D0.device_ref(),
tensor_B1.device_ref(),
tensor_C1.device_ref(),
{tensor_Bias1.device_data(), typename Gemm1::LayoutC::Stride(0)},
tensor_D1.device_ref(),
{alpha1, beta1}
};
@ -222,10 +250,17 @@ struct B2bNonFusedGemmRun
status = gemm_op_1.initialize(arguments_1);
CUTLASS_CHECK(status);
for(int i = 0; i < warm_ups; i++) {
status = gemm_op_0();
CUTLASS_CHECK(status);
status = gemm_op_1();
CUTLASS_CHECK(status);
}
//
// Run the GEMM
//
cudaEvent_t start, stop1, stop2;
cudaEventCreate(&start);
cudaEventCreate(&stop1);
@ -233,14 +268,13 @@ struct B2bNonFusedGemmRun
cudaEventRecord(start);
for(int i = 0; i < 100; i++) {
for(int i = 0; i < runs; i++) {
status = gemm_op_0();
CUTLASS_CHECK(status);
}
cudaEventRecord(stop1);
for(int i = 0; i < 100; i++) {
for(int i = 0; i < runs; i++) {
status = gemm_op_1();
CUTLASS_CHECK(status);
@ -252,9 +286,9 @@ struct B2bNonFusedGemmRun
cudaEventElapsedTime(&gemm0Time, start, stop1);
cudaEventElapsedTime(&gemm1Time, stop1, stop2);
cudaEventElapsedTime(&totalTime, start, stop2);
std::cout << "gemm 0 time " << gemm0Time / 100.0 << " ms\n";
std::cout << "gemm 1 time " << gemm1Time / 100.0 << " ms\n";
std::cout << "total time " << totalTime / 100.0 << " ms\n";
std::cout << "gemm 0 time " << gemm0Time / (float)runs << " ms\n";
std::cout << "gemm 1 time " << gemm1Time / (float)runs << " ms\n";
std::cout << "Non-fusion time " << totalTime / (float)runs << " ms\n";
tensor_D0.sync_host();
tensor_D1.sync_host();
@ -282,7 +316,7 @@ struct B2bNonFusedGemmRun
tensor_A0.device_ref(),
tensor_B0.device_ref(),
beta0,
tensor_C0.device_ref(),
{tensor_Bias0.device_data(), typename Gemm0::LayoutC::Stride(0)},
reference_D0.device_ref()
);
@ -296,7 +330,7 @@ struct B2bNonFusedGemmRun
reference_D0.device_ref(),
tensor_B1.device_ref(),
beta1,
tensor_C1.device_ref(),
{tensor_Bias1.device_data(), typename Gemm1::LayoutC::Stride(0)},
reference_D1.device_ref()
);
@ -309,7 +343,6 @@ struct B2bNonFusedGemmRun
reference_D0.sync_host();
reference_D1.sync_host();
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0);
@ -333,13 +366,14 @@ struct B2bNonFusedGemmRun
<< "A0 =\n" << tensor_A0.host_view()
<< "\nB0 =\n" << tensor_B0.host_view()
<< "\nC0 =\n" << tensor_C0.host_view()
<< "\nBias0:\n" << tensor_Bias0.host_view() << "\n"
<< "\nD0 =\n" << tensor_D0.host_view()
<< "\nB1 =\n" << tensor_B1.host_view()
<< "\nC1 =\n" << tensor_C1.host_view()
<< "\nBias1:\n" << tensor_Bias1.host_view() << "\n"
<< "\n\nReference =\n" << reference_D1.host_view()
<< "\nComputed =\n" << tensor_D1.host_view();
}
return passed;
}
};
@ -356,6 +390,8 @@ struct B2bFusedGemmRun
cutlass::Distribution::Kind init_A;
cutlass::Distribution::Kind init_B;
cutlass::Distribution::Kind init_C;
cutlass::Distribution::Kind init_Scale;
cutlass::Distribution::Kind init_Bias;
uint64_t seed;
//
@ -366,9 +402,12 @@ struct B2bFusedGemmRun
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_Scale_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_Bias_ = cutlass::Distribution::Uniform,
uint64_t seed_ = 2080
):
init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { }
init_A(init_A_), init_B(init_B_), init_C(init_C_),
init_Scale(init_Scale_), init_Bias(init_Bias_), seed(seed_) { }
/// Helper to initialize a tensor view
template <typename Element, typename Layout>
@ -394,9 +433,14 @@ struct B2bFusedGemmRun
cutlass::reference::host::BlockFillSequential(
view.data(), view.capacity());
}
}
else if (dist_kind == cutlass::Distribution::AllZeros) {
cutlass::reference::host::TensorFill(view, Element(0));
}
else if (dist_kind == cutlass::Distribution::AllOnes) {
cutlass::reference::host::TensorFill(view, Element(1));
}
else {
// TODO: Implement the rest
std::cerr << "Not implemented\n";
return false;
}
@ -415,7 +459,9 @@ struct B2bFusedGemmRun
ElementCompute beta0 = ElementCompute(0),
ElementCompute alpha1 = ElementCompute(1),
ElementCompute beta1 = ElementCompute(0),
bool relu = true) {
bool relu = true,
int warm_ups = 1,
int runs = 100) {
//
// Allocate the GEMM workspace
@ -433,9 +479,20 @@ struct B2bFusedGemmRun
typename B2bGemm::ElementC,
typename B2bGemm::LayoutC> tensor_C0(problem_size_0.mn());
// cutlass::HostTensor<
// typename B2bGemm::ElementC,
// typename B2bGemm::LayoutC> tensor_D0(problem_size_0.mn());
cutlass::HostTensor<
typename B2bGemm::ElementScaleBias,
typename B2bGemm::LayoutScaleBias> tensor_Scale0;
if(alpha0 == ElementCompute(0)) //per-channel scale
tensor_Scale0.resize({1, problem_size_0.n()});
cutlass::HostTensor<
typename B2bGemm::ElementScaleBias,
typename B2bGemm::LayoutScaleBias> tensor_Bias0({1, problem_size_0.n()});
cutlass::HostTensor<
ElementAccumulator,
typename B2bGemm::LayoutC> reference_Z0(problem_size_0.mn());
cutlass::HostTensor<
typename B2bGemm::ElementC,
@ -449,6 +506,10 @@ struct B2bFusedGemmRun
typename B2bGemm::ElementC,
typename B2bGemm::LayoutC> tensor_C1(problem_size_1.mn());
cutlass::HostTensor<
ElementCompute,
typename B2bGemm::LayoutScaleBias> tensor_Bias1({1, problem_size_1.n()});
cutlass::HostTensor<
typename B2bGemm::ElementC,
typename B2bGemm::LayoutC> tensor_D1(problem_size_1.mn());
@ -461,21 +522,29 @@ struct B2bFusedGemmRun
CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019));
CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018));
CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017));
if(alpha0 == ElementCompute(0)) //per-channel scale
CHECK_TRUE(initialize_tensor(tensor_Scale0.host_view(), init_Scale, seed + 2014));
CHECK_TRUE(initialize_tensor(tensor_Bias0.host_view(), init_Bias, seed + 2013));
CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2016));
CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015));
CHECK_TRUE(initialize_tensor(tensor_Bias1.host_view(), init_Bias, seed + 2012));
cutlass::reference::host::TensorFill(
tensor_D1.host_view());
cutlass::reference::host::TensorFill(
reference_D0.host_view());
cutlass::reference::host::TensorFill(
reference_D1.host_view());
tensor_A0.sync_device();
tensor_B0.sync_device();
tensor_C0.sync_device();
if(alpha0 == ElementCompute(0)) //per-channel scale
tensor_Scale0.sync_device();
tensor_Bias0.sync_device();
tensor_B1.sync_device();
tensor_C1.sync_device();
tensor_Bias1.sync_device();
tensor_D1.sync_device();
reference_D0.sync_device();
reference_D1.sync_device();
@ -490,8 +559,10 @@ struct B2bFusedGemmRun
tensor_A0.device_ref(),
tensor_B0.device_ref(),
tensor_C0.device_ref(),
tensor_Scale0.device_ref(),
tensor_Bias0.device_ref(),
tensor_B1.device_ref(),
tensor_C1.device_ref(),
{tensor_Bias1.device_data(), typename B2bGemm::LayoutC::Stride(0)},
tensor_D1.device_ref(),
{alpha0, beta0},
{alpha1, beta1},
@ -499,10 +570,26 @@ struct B2bFusedGemmRun
B2bGemm b2b_gemm_op;
cutlass::Status status = b2b_gemm_op.initialize(arguments);
cutlass::Status status = b2b_gemm_op.can_implement(arguments);
if(status != cutlass::Status::kSuccess) {
std::cout << "Problem sizes not supported.\n"
<< "Requirments:\n"
<< " problem_size_0.M = problem_size_1.M\n"
<< " problem_size_0.N = problem_size_1.K\n"
<< " ThreadblockShape0::kN = problem_size_0.N\n"
<< " ThreadblockShape1::kN = problem_size_1.N" << std::endl;
}
status = b2b_gemm_op.initialize(arguments);
CUTLASS_CHECK(status);
for(int i = 0; i < warm_ups; i++) {
status = b2b_gemm_op();
CUTLASS_CHECK(status);
}
//
// Run the GEMM
//
@ -513,7 +600,7 @@ struct B2bFusedGemmRun
cudaEventRecord(start);
for(int i = 0; i < 100; i++) {
for(int i = 0; i < runs; i++) {
status = b2b_gemm_op();
CUTLASS_CHECK(status);
@ -523,29 +610,49 @@ struct B2bFusedGemmRun
cudaDeviceSynchronize();
float gemmTime;
cudaEventElapsedTime(&gemmTime, start, stop);
std::cout << "time " << gemmTime / 100.0 << " ms\n";
std::cout << "Fusion time " << gemmTime / (float)runs << " ms\n";
//tensor_D0.sync_host();
tensor_D1.sync_host();
//
// Verify
//
cutlass::reference::device::Gemm<
typename B2bGemm::ElementA, typename B2bGemm::LayoutA,
typename B2bGemm::ElementB, typename B2bGemm::LayoutB,
ElementAccumulator, typename B2bGemm::LayoutC,
ElementAccumulator, ElementAccumulator>
reference_gemm_0;
cutlass::reference::device::Gemm<
typename B2bGemm::ElementA, typename B2bGemm::LayoutA,
typename B2bGemm::ElementB, typename B2bGemm::LayoutB,
typename B2bGemm::ElementC, typename B2bGemm::LayoutC, ElementCompute,
ElementAccumulator, typename B2bGemm::Operator>
reference_gemm_0, reference_gemm_1;
reference_gemm_1;
reference_gemm_0(
problem_size_0,
alpha0,
ElementAccumulator(1), //intermediate alpha=1
tensor_A0.device_ref(),
tensor_B0.device_ref(),
beta0,
tensor_C0.device_ref(),
reference_D0.device_ref()
ElementAccumulator(0), //beta = 0
reference_Z0.device_ref(),
reference_Z0.device_ref(),
ElementAccumulator(0)
);
cutlass::reference::device::TensorScaleBiasGemm<
ElementAccumulator, typename B2bGemm::ElementC, typename B2bGemm::LayoutC,
ElementCompute, typename B2bGemm::LayoutScaleBias
> (
problem_size_0,
reference_Z0.device_ref(),
reference_D0.device_ref(),
alpha0,
tensor_Scale0.device_ref(),
tensor_Bias0.device_ref()
);
if(relu) {
@ -558,18 +665,15 @@ struct B2bFusedGemmRun
reference_D0.device_ref(),
tensor_B1.device_ref(),
beta1,
tensor_C1.device_ref(),
{tensor_Bias1.device_data(), typename B2bGemm::LayoutC::Stride(0)},
reference_D1.device_ref()
);
if(relu) {
cutlass::reference::device::TensorReLu(reference_D1.device_view());
}
cudaDeviceSynchronize();
reference_D0.sync_host();
reference_D1.sync_host();
CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0);
@ -580,7 +684,8 @@ struct B2bFusedGemmRun
tensor_D1.host_view());
CHECK_TRUE(passed);
if (!passed) {
if (!passed)
{
std::stringstream fname;
@ -593,13 +698,14 @@ struct B2bFusedGemmRun
<< "A0 =\n" << tensor_A0.host_view()
<< "\nB0 =\n" << tensor_B0.host_view()
<< "\nC0 =\n" << tensor_C0.host_view()
// << "\nD0 =\n" << tensor_D0.host_view()
<< "\nScale0:\n" << tensor_Scale0.host_view() << "\n"
<< "\nBias0:\n" << tensor_Bias0.host_view() << "\n"
<< "\nB1 =\n" << tensor_B1.host_view()
<< "\nC1 =\n" << tensor_C1.host_view()
<< "\nBias1:\n" << tensor_Bias1.host_view() << "\n"
<< "\n\nReference =\n" << reference_D1.host_view()
<< "\nComputed =\n" << tensor_D1.host_view();
}
return passed;
}


@ -0,0 +1,749 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#include <iostream>
#include <fstream>
#include <sstream>
#include "cutlass/cutlass.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "cutlass/reduction/device/reduce_split_k.h"
#include "cutlass/reduction/thread/reduction_operators.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/device/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/host_reorder.h"
#include "cutlass/util/reference/host/convolution.h"
#include "cutlass/util/reference/device/convolution.h"
#include "cutlass/util/reference/device/tensor_relu.h"
#include "cutlass/core_io.h"
#include "cutlass/util/tensor_view_io.h"
#include "reference/device/tensor_scale_bias.h"
#include "helper.h"
#define CHECK_GT(val1, val2) \
if((val1) <= (val2)) \
std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_GT failed\n";
#define CHECK_TRUE(val) \
if(!(val)) \
std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_TRUE failed\n";
template <typename Conv2d0_, typename Conv2d1_, int InterleavedK>
class B2bInterleavedNonFusedConv2dRun {
public:
using Conv2d0 = Conv2d0_;
using Conv2d1 = Conv2d1_;
using ElementAccumulator = typename Conv2d0::ElementAccumulator;
using ElementCompute = typename Conv2d0::ElementCompute;
static cutlass::conv::Operator const kConvolutionalOperator = Conv2d0::kConvolutionalOperator;
static_assert(kConvolutionalOperator == Conv2d1::kConvolutionalOperator,
"Fused convolution operators must be the same");
public:
/// Initialization
cutlass::Distribution::Kind init_A;
cutlass::Distribution::Kind init_B;
cutlass::Distribution::Kind init_C;
cutlass::Distribution::Kind init_Bias;
uint64_t seed;
cutlass::HostTensor<typename Conv2d0::ElementA, typename Conv2d0::LayoutA> tensor_A0;
cutlass::HostTensor<typename Conv2d0::ElementB, typename Conv2d0::LayoutB> tensor_B0;
cutlass::HostTensor<typename Conv2d0::ElementB, typename Conv2d0::LayoutB> tensor_B0_reordered;
cutlass::HostTensor<typename Conv2d0::ElementC, typename Conv2d0::LayoutC> tensor_C0;
cutlass::HostTensor<typename Conv2d0::ElementC, typename Conv2d0::LayoutC> tensor_Bias0;
cutlass::HostTensor<typename Conv2d0::ElementC, typename Conv2d0::LayoutC> tensor_D0_computed;
cutlass::HostTensor<typename Conv2d0::ElementC, typename Conv2d0::LayoutC> tensor_D0_reference;
cutlass::HostTensor<typename Conv2d1::ElementB, typename Conv2d1::LayoutB> tensor_B1;
cutlass::HostTensor<typename Conv2d1::ElementB, typename Conv2d1::LayoutB> tensor_B1_reordered;
cutlass::HostTensor<typename Conv2d1::ElementC, typename Conv2d1::LayoutC> tensor_C1;
cutlass::HostTensor<typename Conv2d1::ElementC, typename Conv2d0::LayoutC> tensor_Bias1;
cutlass::HostTensor<typename Conv2d1::ElementC, typename Conv2d1::LayoutC> tensor_D1_computed;
cutlass::HostTensor<typename Conv2d1::ElementC, typename Conv2d1::LayoutC> tensor_D1_reference;
public:
B2bInterleavedNonFusedConv2dRun(
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_Bias_ = cutlass::Distribution::Uniform,
uint64_t seed_ = 2080
):
init_A(init_A_), init_B(init_B_), init_C(init_C_), init_Bias(init_Bias_), seed(seed_) {
}
/// Helper to initialize a tensor view
template <typename Element, typename Layout>
void initialize_tensor(
cutlass::TensorView<Element, Layout> view,
cutlass::Distribution::Kind dist_kind,
uint64_t seed) {
if (dist_kind == cutlass::Distribution::Uniform) {
int scope;
int bits = cutlass::sizeof_bits<Element>::value;
if (bits <= 16) {
scope = 2;
}
else {
scope = 8;
}
cutlass::reference::host::TensorFillRandomUniform(
view, seed, scope, -scope, 0);
}
else if (dist_kind == cutlass::Distribution::Identity) {
cutlass::reference::host::TensorFillIdentity(view);
}
else if (dist_kind == cutlass::Distribution::Gaussian) {
cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
}
else if (dist_kind == cutlass::Distribution::Sequential) {
cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
}
else if (dist_kind == cutlass::Distribution::AllZeros) {
cutlass::reference::host::TensorFill(view, Element(0));
}
else if (dist_kind == cutlass::Distribution::AllOnes) {
cutlass::reference::host::TensorFill(view, Element(1));
}
else {
// No fill is performed for other distribution kinds.
}
}
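/// Sizes every tensor from the implicit-GEMM extents of the two problem sizes, fills them
/// on the host, reorders B0/B1 into the interleaved filter layout the kernels consume,
/// and copies all operands to the device.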
void initialize(
cutlass::conv::Conv2dProblemSize const &problem_size_0,
cutlass::conv::Conv2dProblemSize const &problem_size_1, uint64_t seed = 2019) {
tensor_A0.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size_0));
tensor_B0.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0));
tensor_B0_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0));
tensor_C0.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_Bias0.resize({1, 1, 1, problem_size_0.K});
tensor_D0_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_D0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_B1.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1));
tensor_B1_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1));
tensor_C1.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
tensor_Bias1.resize({1, 1, 1, problem_size_1.K});
tensor_D1_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
tensor_D1_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
initialize_tensor(tensor_A0.host_view(), init_A, seed);
initialize_tensor(tensor_B0.host_view(), init_B, seed * 17);
initialize_tensor(tensor_C0.host_view(), init_C, seed * 39);
initialize_tensor(tensor_Bias0.host_view(), init_Bias, seed * 83);
initialize_tensor(tensor_B1.host_view(), init_B, seed * 18);
initialize_tensor(tensor_C1.host_view(), init_C, seed * 40);
//Reorder B0 and B1
cutlass::reorder_convK<InterleavedK, InterleavedK>(
tensor_B0_reordered.host_ref(), tensor_B0.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size_0));
cutlass::reorder_convK<InterleavedK, InterleavedK>(
tensor_B1_reordered.host_ref(), tensor_B1.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size_1));
tensor_A0.sync_device();
tensor_B0.sync_device();
tensor_B0_reordered.sync_device();
tensor_C0.sync_device();
tensor_Bias0.sync_device();
tensor_D0_computed.sync_device();
tensor_D0_reference.sync_device();
tensor_B1.sync_device();
tensor_B1_reordered.sync_device();
tensor_C1.sync_device();
tensor_Bias1.sync_device();
tensor_D1_computed.sync_device();
tensor_D1_reference.sync_device();
}
/// Executes one test
bool run(
cutlass::conv::Conv2dProblemSize const &problem_size_0,
cutlass::conv::Conv2dProblemSize const &problem_size_1,
cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial,
ElementCompute alpha0 = ElementCompute(1),
ElementCompute beta0 = ElementCompute(0),
ElementCompute alpha1 = ElementCompute(1),
ElementCompute beta1 = ElementCompute(0),
bool relu = true,
int warm_ups = 1,
int runs = 100) {
initialize(problem_size_0, problem_size_1);
// configure the operator
Conv2d0 conv2d_op_0;
Conv2d1 conv2d_op_1;
typename Conv2d0::Arguments conv2d_args_0(
problem_size_0,
tensor_A0.device_ref(),
tensor_B0_reordered.device_ref(),
tensor_C0.device_ref(),
tensor_D0_computed.device_ref(),
{alpha0, beta0},
split_k_mode
);
typename Conv2d1::Arguments conv2d_args_1(
problem_size_1,
tensor_D0_computed.device_ref(),
tensor_B1_reordered.device_ref(),
tensor_C1.device_ref(),
tensor_D1_computed.device_ref(),
{alpha1, beta1},
split_k_mode
);
cutlass::Status status = conv2d_op_0.initialize(conv2d_args_0);
CUTLASS_CHECK(status);
status = conv2d_op_1.initialize(conv2d_args_1);
CUTLASS_CHECK(status);
for(int i = 0; i < warm_ups; i++) {
status = conv2d_op_0();
CUTLASS_CHECK(status);
status = conv2d_op_1();
CUTLASS_CHECK(status);
}
//
// Run Conv2d
//
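// The two convolutions are timed back to back with three CUDA events so that the
// per-kernel times and the total non-fused time can each be reported as an average.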
cudaEvent_t start, stop1, stop2;
cudaEventCreate(&start);
cudaEventCreate(&stop1);
cudaEventCreate(&stop2);
cudaEventRecord(start);
for(int i = 0; i < runs; i++) {
// run conv2d operator
status = conv2d_op_0();
CUTLASS_CHECK(status);
}
cudaEventRecord(stop1);
for(int i = 0; i < runs; i++) {
// run conv2d operator
status = conv2d_op_1();
CUTLASS_CHECK(status);
}
cudaEventRecord(stop2);
cudaDeviceSynchronize();
float conv2d0Time, conv2d1Time, totalTime;
cudaEventElapsedTime(&conv2d0Time, start, stop1);
cudaEventElapsedTime(&conv2d1Time, stop1, stop2);
cudaEventElapsedTime(&totalTime, start, stop2);
std::cout << "conv2d 0 time " << conv2d0Time / (float)runs << " ms\n";
std::cout << "conv2d 1 time " << conv2d1Time / (float)runs << " ms\n";
std::cout << "Non-fusion time " << totalTime / (float)runs << " ms\n";
tensor_D0_computed.sync_host();
tensor_D1_computed.sync_host();
bool passed = false;
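// Reference check: both convolutions are re-run through the device reference using a
// clamping output converter (NumericConverterClamp), ReLU is applied after each stage
// when requested, and the final D1 tensors are compared element-wise.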
cutlass::reference::device::Conv2d<
typename Conv2d0::ElementA,
typename Conv2d0::LayoutA,
typename Conv2d0::ElementB,
typename Conv2d0::LayoutB,
typename Conv2d0::ElementC,
typename Conv2d0::LayoutC,
ElementCompute,
ElementAccumulator,
cutlass::NumericConverterClamp<typename Conv2d0::ElementC, ElementCompute>
>(
kConvolutionalOperator,
problem_size_0,
tensor_A0.device_ref(),
tensor_B0.device_ref(),
tensor_C0.device_ref(),
tensor_D0_reference.device_ref(),
alpha0,
beta0);
if(relu) {
cutlass::reference::device::TensorReLu(tensor_D0_reference.device_view());
}
cutlass::reference::device::Conv2d<
typename Conv2d1::ElementA,
typename Conv2d1::LayoutA,
typename Conv2d1::ElementB,
typename Conv2d1::LayoutB,
typename Conv2d1::ElementC,
typename Conv2d1::LayoutC,
ElementCompute,
ElementAccumulator,
cutlass::NumericConverterClamp<typename Conv2d1::ElementC, ElementCompute>
>(
kConvolutionalOperator,
problem_size_1,
tensor_D0_reference.device_ref(),
tensor_B1.device_ref(),
tensor_C1.device_ref(),
tensor_D1_reference.device_ref(),
alpha1,
beta1);
if(relu) {
cutlass::reference::device::TensorReLu(tensor_D1_reference.device_view());
}
cudaError_t result = cudaDeviceSynchronize();
CHECK_TRUE(result == cudaSuccess);
// sync host (copy device data to host) for dumping error output in case of mismatches
tensor_D0_reference.sync_host();
tensor_D1_reference.sync_host();
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_computed.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_reference.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_computed.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_reference.host_view()), 0);
passed = cutlass::reference::host::TensorEquals(
tensor_D1_computed.host_view(),
tensor_D1_reference.host_view());
CHECK_TRUE(passed);
if (!passed) {
std::stringstream fname;
fname << "error_B2bImplicitGemm_device_interleaved_nonfused.txt";
std::cerr << "Dumping results in " << fname.str() << "\n";
std::ofstream results(fname.str());
results << problem_size_0 << std::endl;
results << problem_size_1 << std::endl;
results
<< "\nA0:\n" << tensor_A0.host_view() << "\n"
<< "\nB0:\n" << tensor_B0.host_view() << "\n"
<< "\nB0_reordered:\n" << tensor_B0_reordered.host_view() << "\n"
<< "\nC0:\n" << tensor_C0.host_view() << "\n"
<< "\nBias0:\n" << tensor_Bias0.host_view() << "\n"
<< "\nD0 reference:\n" << tensor_D0_reference.host_view() << "\n"
<< "\nD0 computed:\n" << tensor_D0_computed.host_view() << "\n"
<< "\nB1:\n" << tensor_B1.host_view() << "\n"
<< "\nB1_reordered:\n" << tensor_B1_reordered.host_view() << "\n"
<< "\nC1:\n" << tensor_C1.host_view() << "\n"
<< "\nBias1:\n" << tensor_Bias1.host_view() << "\n"
<< "\nD1 reference:\n" << tensor_D1_reference.host_view() << "\n"
<< "\nD1 computed:\n" << tensor_D1_computed.host_view();
}
return passed;
}
};
template <typename B2bConv2d_, int InterleavedK>
class B2bInterleavedFusedConv2dRun {
public:
using B2bConv2d = B2bConv2d_;
using ElementAccumulator = typename B2bConv2d::ElementAccumulator;
using ElementCompute = typename B2bConv2d::ElementCompute;
static cutlass::conv::Operator const kConvolutionalOperator = B2bConv2d::kConvolutionalOperator;
public:
/// Initialization
cutlass::Distribution::Kind init_A;
cutlass::Distribution::Kind init_B;
cutlass::Distribution::Kind init_C;
cutlass::Distribution::Kind init_Scale;
cutlass::Distribution::Kind init_Bias;
uint64_t seed;
cutlass::HostTensor<typename B2bConv2d::ElementA, typename B2bConv2d::LayoutA> tensor_A0;
cutlass::HostTensor<typename B2bConv2d::ElementB, typename B2bConv2d::LayoutB> tensor_B0;
cutlass::HostTensor<typename B2bConv2d::ElementB, typename B2bConv2d::LayoutB> tensor_B0_reordered;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_C0;
cutlass::HostTensor<typename B2bConv2d::ElementScaleBias, typename B2bConv2d::LayoutScaleBias> tensor_Scale0;
cutlass::HostTensor<typename B2bConv2d::ElementScaleBias, typename B2bConv2d::LayoutScaleBias> tensor_Bias0;
cutlass::HostTensor<ElementAccumulator, typename B2bConv2d::LayoutC> tensor_Z0_reference;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_D0_reference;
cutlass::HostTensor<typename B2bConv2d::ElementB, typename B2bConv2d::LayoutB> tensor_B1;
cutlass::HostTensor<typename B2bConv2d::ElementB, typename B2bConv2d::LayoutB> tensor_B1_reordered;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_C1;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_Bias1;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_D1_computed;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_D1_reference;
public:
B2bInterleavedFusedConv2dRun(
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_Scale_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_Bias_ = cutlass::Distribution::Uniform,
uint64_t seed_ = 2080
):
init_A(init_A_), init_B(init_B_), init_C(init_C_),
init_Scale(init_Scale_), init_Bias(init_Bias_), seed(seed_) {
}
/// Helper to initialize a tensor view
template <typename Element, typename Layout>
void initialize_tensor(
cutlass::TensorView<Element, Layout> view,
cutlass::Distribution::Kind dist_kind,
uint64_t seed) {
if (dist_kind == cutlass::Distribution::Uniform) {
int scope;
int bits = cutlass::sizeof_bits<Element>::value;
if (bits <= 16) {
scope = 2;
}
else {
scope = 8;
}
cutlass::reference::host::TensorFillRandomUniform(
view, seed, scope, -scope, 0);
}
else if (dist_kind == cutlass::Distribution::Identity) {
cutlass::reference::host::TensorFillIdentity(view);
}
else if (dist_kind == cutlass::Distribution::Gaussian) {
cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
}
else if (dist_kind == cutlass::Distribution::Sequential) {
cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
}
else if (dist_kind == cutlass::Distribution::AllZeros) {
cutlass::reference::host::TensorFill(view, Element(0));
}
else if (dist_kind == cutlass::Distribution::AllOnes) {
cutlass::reference::host::TensorFill(view, Element(1));
}
else {
// No fill is performed for other distribution kinds.
}
}
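/// Same flow as the non-fused runner's initialize(), plus the per-channel Scale0/Bias0
/// vectors passed to the fused kernel (Scale0 is only allocated when alpha0 == 0, which
/// selects the per-channel scale path).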
void initialize(
cutlass::conv::Conv2dProblemSize const &problem_size_0,
cutlass::conv::Conv2dProblemSize const &problem_size_1,
ElementCompute alpha0,
ElementCompute alpha1,
uint64_t seed = 2019) {
tensor_A0.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size_0));
tensor_B0.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0));
tensor_B0_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0));
tensor_C0.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
if(alpha0 == ElementCompute(0)) //per-channel scale
tensor_Scale0.resize({1, problem_size_0.K});
tensor_Bias0.resize({1, problem_size_0.K});
tensor_Z0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_D0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_B1.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1));
tensor_B1_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1));
tensor_C1.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
tensor_Bias1.resize({1, 1, 1, problem_size_1.K});
tensor_D1_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
tensor_D1_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
initialize_tensor(tensor_A0.host_view(), init_A, seed);
initialize_tensor(tensor_B0.host_view(), init_B, seed * 17);
initialize_tensor(tensor_C0.host_view(), init_C, seed * 39);
if(alpha0 == ElementCompute(0)) //per-channel scale
initialize_tensor(tensor_Scale0.host_view(), init_Scale, seed * 61);
initialize_tensor(tensor_Bias0.host_view(), init_Bias, seed * 83);
initialize_tensor(tensor_B1.host_view(), init_B, seed * 18);
initialize_tensor(tensor_C1.host_view(), init_C, seed * 40);
initialize_tensor(tensor_Bias1.host_view(), init_Bias, seed * 84);
//Reorder B0 and B1
cutlass::reorder_convK<16, InterleavedK>(
tensor_B0_reordered.host_ref(), tensor_B0.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size_0));
cutlass::reorder_convK<InterleavedK, InterleavedK>(
tensor_B1_reordered.host_ref(), tensor_B1.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size_1));
tensor_A0.sync_device();
tensor_B0.sync_device();
tensor_B0_reordered.sync_device();
tensor_C0.sync_device();
if(alpha0 == ElementCompute(0)) //per-channel scale
tensor_Scale0.sync_device();
tensor_Bias0.sync_device();
tensor_D0_reference.sync_device();
tensor_B1.sync_device();
tensor_B1_reordered.sync_device();
tensor_C1.sync_device();
tensor_Bias1.sync_device();
tensor_D1_computed.sync_device();
tensor_D1_reference.sync_device();
}
/// Executes one test
bool run(
cutlass::conv::Conv2dProblemSize const &problem_size_0,
cutlass::conv::Conv2dProblemSize const &problem_size_1,
cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial,
ElementCompute alpha0 = ElementCompute(1),
ElementCompute beta0 = ElementCompute(0),
ElementCompute alpha1 = ElementCompute(1),
ElementCompute beta1 = ElementCompute(0),
bool relu = true,
int warm_ups = 1,
int runs = 100) {
initialize(problem_size_0, problem_size_1, alpha0, alpha1);
// configure the operator
B2bConv2d b2b_conv2d_op;
typename B2bConv2d::Arguments b2b_conv2d_args(
problem_size_0,
problem_size_1,
tensor_A0.device_ref(),
tensor_B0_reordered.device_ref(),
tensor_C0.device_ref(),
tensor_Scale0.device_ref(),
tensor_Bias0.device_ref(),
tensor_B1_reordered.device_ref(),
tensor_C1.device_ref(),
tensor_D1_computed.device_ref(),
{alpha0, beta0},
{alpha1, beta1},
split_k_mode
);
cutlass::Status status = b2b_conv2d_op.can_implement(b2b_conv2d_args);
if(status != cutlass::Status::kSuccess) {
std::cout << "Problem sizes not supported.\n"
<< "Requirments:\n"
<< " problem_size_0.N*P*Q = problem_size_1.N*P*Q\n"
<< " problem_size_0.K = problem_size_1.C\n"
<< " problem_size_1.R = problem_size_1.S = 1\n"
<< " ThreadblockShape0::kN = problem_size_0.K\n"
<< " ThreadblockShape1::kN = problem_size_1.K" << std::endl;
}
CUTLASS_CHECK(status);
status = b2b_conv2d_op.initialize(b2b_conv2d_args);
CUTLASS_CHECK(status);
for(int i = 0; i < warm_ups; i++) {
status = b2b_conv2d_op();
CUTLASS_CHECK(status);
}
//
// Run the Conv2d
//
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
for(int i = 0; i < runs; i++) {
// run conv2d operator
status = b2b_conv2d_op();
CUTLASS_CHECK(status);
}
cudaEventRecord(stop);
cudaDeviceSynchronize();
float conv2dTime;
cudaEventElapsedTime(&conv2dTime, start, stop);
std::cout << "Fusion time " << conv2dTime / (float)runs << " ms\n";
tensor_D1_computed.sync_host();
bool passed = false;
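// Same staged reference as the fused runner above: conv0 into Z0 accumulators, clamped
// per-channel scale/bias into D0, optional ReLU, then conv1 (also with a clamping
// converter) into D1, optional ReLU, followed by an element-wise compare of D1.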
cutlass::reference::device::Conv2d<
typename B2bConv2d::ElementA,
typename B2bConv2d::LayoutA,
typename B2bConv2d::ElementB,
typename B2bConv2d::LayoutB,
ElementAccumulator,
typename B2bConv2d::LayoutC,
ElementAccumulator,
ElementAccumulator
>(
kConvolutionalOperator,
problem_size_0,
tensor_A0.device_ref(),
tensor_B0.device_ref(),
tensor_Z0_reference.device_ref(),
tensor_Z0_reference.device_ref(),
ElementAccumulator(1), // intermediate alpha = 1
ElementAccumulator(0) // beta = 0
);
cutlass::reference::device::TensorScaleBiasConv2d<
ElementAccumulator,
typename B2bConv2d::ElementC,
typename B2bConv2d::LayoutC,
ElementCompute,
typename B2bConv2d::LayoutScaleBias,
cutlass::NumericConverterClamp<typename B2bConv2d::ElementC, ElementCompute>
>(
problem_size_0,
tensor_Z0_reference.device_ref(),
tensor_D0_reference.device_ref(),
alpha0,
tensor_Scale0.device_ref(),
tensor_Bias0.device_ref()
);
if(relu) {
cutlass::reference::device::TensorReLu(tensor_D0_reference.device_view());
}
cutlass::reference::device::Conv2d<
typename B2bConv2d::ElementA,
typename B2bConv2d::LayoutA,
typename B2bConv2d::ElementB,
typename B2bConv2d::LayoutB,
typename B2bConv2d::ElementC,
typename B2bConv2d::LayoutC,
ElementCompute,
ElementAccumulator,
cutlass::NumericConverterClamp<typename B2bConv2d::ElementC, ElementCompute>
>(
kConvolutionalOperator,
problem_size_1,
tensor_D0_reference.device_ref(),
tensor_B1.device_ref(),
tensor_C1.device_ref(),
tensor_D1_reference.device_ref(),
alpha1,
beta1);
if(relu) {
cutlass::reference::device::TensorReLu(tensor_D1_reference.device_view());
}
cudaError_t result = cudaDeviceSynchronize();
CHECK_TRUE(result == cudaSuccess);
// sync host (copy device data to host) for dumping error output in case of mismatches
tensor_D0_reference.sync_host();
tensor_D1_reference.sync_host();
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_reference.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_computed.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_reference.host_view()), 0);
passed = cutlass::reference::host::TensorEquals(
tensor_D1_computed.host_view(),
tensor_D1_reference.host_view());
CHECK_TRUE(passed);
if (!passed) {
std::stringstream fname;
fname << "error_B2bImplicitGemm_device_interleaved_fused.txt";
std::cerr << "Dumping results in " << fname.str() << "\n";
std::ofstream results(fname.str());
results << problem_size_0 << std::endl;
results << problem_size_1 << std::endl;
results
<< "\nA0:\n" << tensor_A0.host_view() << "\n"
<< "\nB0:\n" << tensor_B0.host_view() << "\n"
<< "\nB0_reordered:\n" << tensor_B0_reordered.host_view() << "\n"
<< "\nC0:\n" << tensor_C0.host_view() << "\n"
<< "\nScale0:\n" << tensor_Scale0.host_view() << "\n"
<< "\nBias0:\n" << tensor_Bias0.host_view() << "\n"
<< "\nB1:\n" << tensor_B1.host_view() << "\n"
<< "\nB1_reordered:\n" << tensor_B1_reordered.host_view() << "\n"
<< "\nC1:\n" << tensor_C1.host_view() << "\n"
<< "\nBias1:\n" << tensor_Bias1.host_view() << "\n"
<< "\nD1 reference:\n" << tensor_D1_reference.host_view() << "\n"
<< "\nD1 computed:\n" << tensor_D1_computed.host_view();
}
return passed;
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
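For orientation, a minimal sketch of how a runner such as B2bInterleavedFusedConv2dRun is typically driven. The kernel type B2bConv2dFprop, the interleave factor, the helper function name, and the problem sizes below are illustrative placeholders rather than part of this change; the sketch only assumes a fused back-to-back Conv2d kernel composed elsewhere whose second convolution is 1x1 with its C equal to the first convolution's K, matching the can_implement() requirements printed above.

// Sketch only: B2bConv2dFprop stands in for a fused back-to-back convolution kernel
// type composed elsewhere; it is not defined in this diff.
bool run_fused_interleaved_conv2d_example() {
  B2bInterleavedFusedConv2dRun<B2bConv2dFprop, 32 /*InterleavedK*/> runner;

  cutlass::conv::Conv2dProblemSize problem_size_0(
      {1, 56, 56, 64},     // input  (N, H, W, C)
      {64, 3, 3, 64},      // filter (K, R, S, C)
      {1, 1, 1, 1},        // padding
      {1, 1},              // stride
      {1, 1});             // dilation

  cutlass::conv::Conv2dProblemSize problem_size_1(
      {1, 56, 56, 64},     // consumes the first conv's output (C1 == K0)
      {128, 1, 1, 64},     // 1x1 filter, as the fused kernel requires
      {0, 0, 0, 0},        // no padding
      {1, 1},              // stride
      {1, 1});             // dilation

  // alpha0 == 0 selects the per-channel Scale0/Bias0 path exercised above.
  return runner.run(
      problem_size_0, problem_size_1,
      cutlass::conv::SplitKMode::kSerial,
      0.0f /*alpha0*/, 0.0f /*beta0*/,
      1.0f /*alpha1*/, 0.0f /*beta1*/);
}

Passing a non-zero alpha0 would skip tensor_Scale0 entirely; the alpha0 == ElementCompute(0) guards above show that it is neither allocated nor copied to the device in that case.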


@ -1,28 +1,33 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#include <iostream>
@ -38,6 +43,9 @@
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/host_reorder.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/device/tensor_relu.h"
#include "reference/device/tensor_scale_bias.h"
#include "helper.h"
#define CHECK_GT(val1, val2) \
@ -60,6 +68,7 @@ struct B2bInterleavedNonFusedGemmRun
cutlass::Distribution::Kind init_A;
cutlass::Distribution::Kind init_B;
cutlass::Distribution::Kind init_C;
cutlass::Distribution::Kind init_Bias;
uint64_t seed;
//
@ -70,9 +79,10 @@ struct B2bInterleavedNonFusedGemmRun
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_Bias_ = cutlass::Distribution::Uniform,
uint64_t seed_ = 2080
):
init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { }
init_A(init_A_), init_B(init_B_), init_C(init_C_), init_Bias(init_Bias_), seed(seed_) { }
/// Helper to initialize a tensor view
template <typename Element, typename Layout>
@ -89,14 +99,23 @@ struct B2bInterleavedNonFusedGemmRun
else if (dist_kind == cutlass::Distribution::Identity) {
cutlass::reference::host::TensorFillIdentity(view);
}
}
else if (dist_kind == cutlass::Distribution::Gaussian) {
cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
}
else if (dist_kind == cutlass::Distribution::Sequential) {
cutlass::reference::host::BlockFillSequential(
view.data(), view.capacity());
}
}
else if (dist_kind == cutlass::Distribution::AllZeros) {
cutlass::reference::host::TensorFill(view, Element(0));
}
else if (dist_kind == cutlass::Distribution::AllOnes) {
cutlass::reference::host::TensorFill(view, Element(1));
}
else {
// TODO: Implement the rest
std::cerr << "Not implemented\n";
return false;
}
@ -115,7 +134,9 @@ struct B2bInterleavedNonFusedGemmRun
ElementCompute beta0 = ElementCompute(0),
ElementCompute alpha1 = ElementCompute(1),
ElementCompute beta1 = ElementCompute(0),
bool relu = true) {
bool relu = true,
int warm_ups = 1,
int runs = 100) {
//
// Allocate the GEMM workspace
@ -137,6 +158,10 @@ struct B2bInterleavedNonFusedGemmRun
typename Gemm0::ElementC,
typename Gemm0::LayoutC> tensor_C0(problem_size_0.mn());
cutlass::HostTensor<
typename Gemm0::ElementC,
typename Gemm0::LayoutC> tensor_Bias0({1, problem_size_0.n()});
cutlass::HostTensor<
typename Gemm0::ElementC,
typename Gemm0::LayoutC> tensor_D0(problem_size_0.mn());
@ -157,6 +182,10 @@ struct B2bInterleavedNonFusedGemmRun
typename Gemm1::ElementC,
typename Gemm1::LayoutC> tensor_C1(problem_size_1.mn());
cutlass::HostTensor<
typename Gemm0::ElementC,
typename Gemm1::LayoutC> tensor_Bias1({1, problem_size_1.n()});
cutlass::HostTensor<
typename Gemm1::ElementC,
typename Gemm1::LayoutC> tensor_D1(problem_size_1.mn());
@ -169,8 +198,10 @@ struct B2bInterleavedNonFusedGemmRun
CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019));
CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018));
CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017));
CHECK_TRUE(initialize_tensor(tensor_Bias0.host_view(), init_Bias, seed + 2014));
CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2016));
CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015));
CHECK_TRUE(initialize_tensor(tensor_Bias1.host_view(), init_Bias, seed + 2013));
//Reorder B0 and B1
cutlass::reorder_column<InterleavedK_>(
@ -191,10 +222,12 @@ struct B2bInterleavedNonFusedGemmRun
tensor_B0.sync_device();
tensor_B0_reordered.sync_device();
tensor_C0.sync_device();
tensor_Bias0.sync_device();
tensor_D0.sync_device();
tensor_B1.sync_device();
tensor_B1_reordered.sync_device();
tensor_C1.sync_device();
tensor_Bias1.sync_device();
tensor_D1.sync_device();
reference_D0.sync_device();
reference_D1.sync_device();
@ -207,7 +240,7 @@ struct B2bInterleavedNonFusedGemmRun
problem_size_0,
tensor_A0.device_ref(),
tensor_B0_reordered.device_ref(),
tensor_C0.device_ref(),
{tensor_Bias0.device_data(), typename Gemm0::LayoutC::Stride(0)},
tensor_D0.device_ref(),
{alpha0, beta0}
};
@ -216,7 +249,7 @@ struct B2bInterleavedNonFusedGemmRun
problem_size_1,
tensor_D0.device_ref(),
tensor_B1_reordered.device_ref(),
tensor_C1.device_ref(),
{tensor_Bias1.device_data(), typename Gemm1::LayoutC::Stride(0)},
tensor_D1.device_ref(),
{alpha1, beta1}
};
@ -232,6 +265,14 @@ struct B2bInterleavedNonFusedGemmRun
status = gemm_op_1.initialize(arguments_1);
CUTLASS_CHECK(status);
for(int i = 0; i < warm_ups; i++) {
status = gemm_op_0();
CUTLASS_CHECK(status);
status = gemm_op_1();
CUTLASS_CHECK(status);
}
//
// Run the GEMM
//
@ -242,14 +283,13 @@ struct B2bInterleavedNonFusedGemmRun
cudaEventRecord(start);
for(int i = 0; i < 100; i++) {
for(int i = 0; i < runs; i++) {
status = gemm_op_0();
CUTLASS_CHECK(status);
}
cudaEventRecord(stop1);
for(int i = 0; i < 100; i++) {
cudaEventRecord(stop1);
for(int i = 0; i < runs; i++) {
status = gemm_op_1();
CUTLASS_CHECK(status);
@ -261,9 +301,9 @@ struct B2bInterleavedNonFusedGemmRun
cudaEventElapsedTime(&gemm0Time, start, stop1);
cudaEventElapsedTime(&gemm1Time, stop1, stop2);
cudaEventElapsedTime(&totalTime, start, stop2);
std::cout << "gemm 0 time " << gemm0Time / 100.0 << " ms\n";
std::cout << "gemm 1 time " << gemm1Time / 100.0 << " ms\n";
std::cout << "total time " << totalTime / 100.0 << " ms\n";
std::cout << "gemm 0 time " << gemm0Time / (float)runs << " ms\n";
std::cout << "gemm 1 time " << gemm1Time / (float)runs << " ms\n";
std::cout << "Non-fusion time " << totalTime / (float)runs << " ms\n";
tensor_D0.sync_host();
tensor_D1.sync_host();
@ -291,7 +331,7 @@ struct B2bInterleavedNonFusedGemmRun
tensor_A0.device_ref(),
tensor_B0.device_ref(),
beta0,
tensor_C0.device_ref(),
{tensor_Bias0.device_data(), typename Gemm0::LayoutC::Stride(0)},
reference_D0.device_ref()
);
@ -302,10 +342,10 @@ struct B2bInterleavedNonFusedGemmRun
reference_gemm_1(
problem_size_1,
alpha1,
tensor_D0.device_ref(),
reference_D0.device_ref(),
tensor_B1.device_ref(),
beta1,
tensor_C1.device_ref(),
beta1,
{tensor_Bias1.device_data(), typename Gemm1::LayoutC::Stride(0)},
reference_D1.device_ref()
);
@ -313,6 +353,7 @@ struct B2bInterleavedNonFusedGemmRun
cutlass::reference::device::TensorReLu(reference_D1.device_view());
}
// Wait for kernels to finish
cudaDeviceSynchronize();
reference_D0.sync_host();
reference_D1.sync_host();
@ -341,14 +382,15 @@ struct B2bInterleavedNonFusedGemmRun
<< "\nB0 =\n" << tensor_B0.host_view()
<< "\nB0_reordered =\n" << tensor_B0_reordered.host_view()
<< "\nC0 =\n" << tensor_C0.host_view()
<< "\nBias0:\n" << tensor_Bias0.host_view() << "\n"
<< "\nD0 =\n" << tensor_D0.host_view()
<< "\nB1 =\n" << tensor_B1.host_view()
<< "\nB1_reordered =\n" << tensor_B1_reordered.host_view()
<< "\nC1 =\n" << tensor_C1.host_view()
<< "\nBias1:\n" << tensor_Bias1.host_view() << "\n"
<< "\n\nReference =\n" << reference_D1.host_view()
<< "\nComputed =\n" << tensor_D1.host_view();
}
return passed;
}
};
@ -365,6 +407,8 @@ struct B2bInterleavedFusedGemmRun
cutlass::Distribution::Kind init_A;
cutlass::Distribution::Kind init_B;
cutlass::Distribution::Kind init_C;
cutlass::Distribution::Kind init_Scale;
cutlass::Distribution::Kind init_Bias;
uint64_t seed;
//
@ -375,9 +419,12 @@ struct B2bInterleavedFusedGemmRun
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_Scale_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_Bias_ = cutlass::Distribution::Uniform,
uint64_t seed_ = 2080
):
init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { }
init_A(init_A_), init_B(init_B_), init_C(init_C_),
init_Scale(init_Scale_), init_Bias(init_Bias_), seed(seed_) { }
/// Helper to initialize a tensor view
template <typename Element, typename Layout>
@ -395,13 +442,22 @@ struct B2bInterleavedFusedGemmRun
cutlass::reference::host::TensorFillIdentity(view);
}
else if (dist_kind == cutlass::Distribution::Gaussian) {
cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
}
else if (dist_kind == cutlass::Distribution::Sequential) {
cutlass::reference::host::BlockFillSequential(
view.data(), view.capacity());
}
}
else if (dist_kind == cutlass::Distribution::AllZeros) {
cutlass::reference::host::TensorFill(view, Element(0));
}
else if (dist_kind == cutlass::Distribution::AllOnes) {
cutlass::reference::host::TensorFill(view, Element(1));
}
else {
// TODO: Implement the rest
std::cerr << "Not implemented\n";
return false;
}
@ -419,8 +475,10 @@ struct B2bInterleavedFusedGemmRun
ElementCompute alpha0 = ElementCompute(1),
ElementCompute beta0 = ElementCompute(0),
ElementCompute alpha1 = ElementCompute(1),
ElementCompute beta1 = ElementCompute(0),
bool relu = true) {
ElementCompute beta1 = ElementCompute(0),
bool relu = true,
int warm_ups = 1,
int runs = 100) {
//
// Allocate the GEMM workspace
@ -442,9 +500,20 @@ struct B2bInterleavedFusedGemmRun
typename B2bGemm::ElementC,
typename B2bGemm::LayoutC> tensor_C0(problem_size_0.mn());
// cutlass::HostTensor<
// typename B2bGemm::ElementC,
// typename B2bGemm::LayoutC> tensor_D0(problem_size_0.mn());
cutlass::HostTensor<
typename B2bGemm::ElementScaleBias,
typename B2bGemm::LayoutScaleBias> tensor_Scale0;
if(alpha0 == ElementCompute(0)) //per-channel scale
tensor_Scale0.resize({1, problem_size_0.n()});
cutlass::HostTensor<
typename B2bGemm::ElementScaleBias,
typename B2bGemm::LayoutScaleBias> tensor_Bias0({1, problem_size_0.n()});
cutlass::HostTensor<
ElementAccumulator,
typename B2bGemm::LayoutC> reference_Z0(problem_size_0.mn());
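// reference_Z0 holds the raw (unscaled) accumulator output of the first GEMM, used by the reference scale/bias step in the verification path below.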
cutlass::HostTensor<
typename B2bGemm::ElementC,
@ -462,6 +531,10 @@ struct B2bInterleavedFusedGemmRun
typename B2bGemm::ElementC,
typename B2bGemm::LayoutC> tensor_C1(problem_size_1.mn());
cutlass::HostTensor<
typename B2bGemm::ElementC,
typename B2bGemm::LayoutScaleBias> tensor_Bias1({1, problem_size_1.n()});
cutlass::HostTensor<
typename B2bGemm::ElementC,
typename B2bGemm::LayoutC> tensor_D1(problem_size_1.mn());
@ -474,11 +547,15 @@ struct B2bInterleavedFusedGemmRun
CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019));
CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018));
CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017));
if(alpha0 == ElementCompute(0)) //per-channel scale
CHECK_TRUE(initialize_tensor(tensor_Scale0.host_view(), init_Scale, seed + 2014));
CHECK_TRUE(initialize_tensor(tensor_Bias0.host_view(), init_Bias, seed + 2013));
CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2016));
CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015));
CHECK_TRUE(initialize_tensor(tensor_Bias1.host_view(), init_Bias, seed + 2012));
//Reorder B0
cutlass::reorder_column<B2bGemm::InstructionShape::kK>(
cutlass::reorder_column<16>(
tensor_B0_reordered.host_ref(), tensor_B0.host_ref(), problem_size_0);
cutlass::reorder_column<InterleavedK_>(
tensor_B1_reordered.host_ref(), tensor_B1.host_ref(), problem_size_1);
@ -494,10 +571,13 @@ struct B2bInterleavedFusedGemmRun
tensor_B0.sync_device();
tensor_B0_reordered.sync_device();
tensor_C0.sync_device();
//tensor_D0.sync_device();
if(alpha0 == ElementCompute(0)) //per-channel scale
tensor_Scale0.sync_device();
tensor_Bias0.sync_device();
tensor_B1.sync_device();
tensor_B1_reordered.sync_device();
tensor_C1.sync_device();
tensor_Bias1.sync_device();
tensor_D1.sync_device();
reference_D0.sync_device();
reference_D1.sync_device();
@ -512,20 +592,37 @@ struct B2bInterleavedFusedGemmRun
tensor_A0.device_ref(),
tensor_B0_reordered.device_ref(),
tensor_C0.device_ref(),
tensor_Scale0.device_ref(),
tensor_Bias0.device_ref(),
tensor_B1_reordered.device_ref(),
tensor_C1.device_ref(),
{tensor_Bias1.device_data(), typename B2bGemm::LayoutC::Stride(0)},
tensor_D1.device_ref(),
{alpha0, beta0},
{alpha1, beta1},
1, /*threadblock_swizzle_k_tile*/
};
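// Note: Bias1 above is bound as the C1 operand with a zero stride, so a single 1 x N bias row is broadcast across every row of the output.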
B2bGemm b2b_gemm_op;
cutlass::Status status = b2b_gemm_op.initialize(arguments);
cutlass::Status status = b2b_gemm_op.can_implement(arguments);
if(status != cutlass::Status::kSuccess) {
std::cout << "Problem sizes not supported.\n"
<< "Requirments:\n"
<< " problem_size_0.M = problem_size_1.M\n"
<< " problem_size_0.N = problem_size_1.K\n"
<< " ThreadblockShape0::kN = problem_size_0.N\n"
<< " ThreadblockShape1::kN = problem_size_1.N" << std::endl;
}
status = b2b_gemm_op.initialize(arguments);
CUTLASS_CHECK(status);
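// Warm up the fused B2B kernel before the timed loop.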
for(int i = 0; i < warm_ups; i++) {
status = b2b_gemm_op();
CUTLASS_CHECK(status);
}
//
// Run the GEMM
//
@ -536,7 +633,7 @@ struct B2bInterleavedFusedGemmRun
cudaEventRecord(start);
for(int i = 0; i < 100; i++) {
for(int i = 0; i < runs; i++) {
status = b2b_gemm_op();
CUTLASS_CHECK(status);
@ -546,29 +643,49 @@ struct B2bInterleavedFusedGemmRun
cudaDeviceSynchronize();
float gemmTime;
cudaEventElapsedTime(&gemmTime, start, stop);
std::cout << "time " << gemmTime / 100.0 << " ms\n";
std::cout << "Fusion time " << gemmTime / (float)runs << " ms\n";
//tensor_D0.sync_host();
tensor_D1.sync_host();
//
// Verify
//
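// Reference path: compute Z0 = A0 * B0 with unit alpha and zero beta, then apply the per-channel scale (Scale0 when alpha0 == 0, otherwise alpha0) and Bias0 to produce D0, mirroring the fused kernel's first-stage epilogue.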
cutlass::reference::device::Gemm<
typename B2bGemm::ElementA, typename B2bGemm::LayoutA,
typename B2bGemm::ElementB, typename B2bGemm::LayoutB,
ElementAccumulator, typename B2bGemm::LayoutC,
ElementAccumulator, ElementAccumulator>
reference_gemm_0;
cutlass::reference::device::Gemm<
typename B2bGemm::ElementA, typename B2bGemm::LayoutA,
typename B2bGemm::ElementB, typename B2bGemm::LayoutB,
typename B2bGemm::ElementC, typename B2bGemm::LayoutC, ElementCompute,
ElementAccumulator, typename B2bGemm::Operator>
reference_gemm_0, reference_gemm_1;
reference_gemm_1;
reference_gemm_0(
problem_size_0,
alpha0,
ElementAccumulator(1), //intermediate alpha=1
tensor_A0.device_ref(),
tensor_B0.device_ref(),
beta0,
tensor_C0.device_ref(),
reference_D0.device_ref()
ElementAccumulator(0), //beta = 0
reference_Z0.device_ref(),
reference_Z0.device_ref(),
ElementAccumulator(0)
);
cutlass::reference::device::TensorScaleBiasGemm<
ElementAccumulator, typename B2bGemm::ElementC, typename B2bGemm::LayoutC,
ElementCompute, typename B2bGemm::LayoutScaleBias
> (
problem_size_0,
reference_Z0.device_ref(),
reference_D0.device_ref(),
alpha0,
tensor_Scale0.device_ref(),
tensor_Bias0.device_ref()
);
if(relu) {
@ -581,18 +698,15 @@ struct B2bInterleavedFusedGemmRun
reference_D0.device_ref(),
tensor_B1.device_ref(),
beta1,
tensor_C1.device_ref(),
{tensor_Bias1.device_data(), typename B2bGemm::LayoutC::Stride(0)},
reference_D1.device_ref()
);
if(relu) {
cutlass::reference::device::TensorReLu(reference_D1.device_view());
}
cudaDeviceSynchronize();
reference_D0.sync_host();
reference_D1.sync_host();
reference_D0.sync_host();
reference_D1.sync_host();
CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0);
@ -603,7 +717,8 @@ struct B2bInterleavedFusedGemmRun
tensor_D1.host_view());
CHECK_TRUE(passed);
if (!passed) {
if (!passed)
{
std::stringstream fname;
@ -617,14 +732,15 @@ struct B2bInterleavedFusedGemmRun
<< "\nB0 =\n" << tensor_B0.host_view()
<< "\nB0_reordered =\n" << tensor_B0_reordered.host_view()
<< "\nC0 =\n" << tensor_C0.host_view()
// << "\nD0 =\n" << tensor_D0.host_view()
<< "\nScale0:\n" << tensor_Scale0.host_view() << "\n"
<< "\nBias0:\n" << tensor_Bias0.host_view() << "\n"
<< "\nB1 =\n" << tensor_B1.host_view()
<< "\nB1_reordered =\n" << tensor_B1_reordered.host_view()
<< "\nC1 =\n" << tensor_C1.host_view()
<< "\nBias1:\n" << tensor_Bias1.host_view() << "\n"
<< "\n\nReference =\n" << reference_D1.host_view()
<< "\nComputed =\n" << tensor_D1.host_view();
}
return passed;
}

View File

@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
@ -40,6 +46,7 @@
#include "kernel/b2b_gemm.h"
#include "kernel/default_b2b_gemm.h"
#include "kernel/default_b2b_gemm_smem_accumulator.h"
////////////////////////////////////////////////////////////////////////////////
@ -102,6 +109,8 @@ template <
int Stages =
DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
ElementC_, ElementAccumulator_>::kStages,
/// Stage accumulator in shared memory
bool SmemAccumulator = false,
/// Access granularity of A matrix in units of elements
int AlignmentA =
DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
@ -115,9 +124,7 @@ template <
/// Operation performed by GEMM
typename Operator_ = typename DefaultGemmConfiguration<
OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
ElementAccumulator_>::Operator,
/// Whether Beta is zero or not
bool IsBetaZero = false>
ElementAccumulator_>::Operator>
class B2bGemm {
public:
@ -148,10 +155,13 @@ class B2bGemm {
static int const kAlignmentB = AlignmentB;
static int const kAlignmentC = EpilogueOutputOp1::kCount;
static bool const kSplitKSerial = SplitKSerial;
static bool const kIsBetaZero = IsBetaZero;
static ComplexTransform const kTransformA = ComplexTransform::kNone;
static ComplexTransform const kTransformB = ComplexTransform::kNone;
/// Derived types
using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
using LayoutScaleBias = layout::RowMajor;
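// The scale/bias vectors consumed by the first epilogue use the epilogue's compute type in a row-major 1 x N layout.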
/// Define the kernel
using B2bGemmKernel = typename kernel::DefaultB2bGemm<
ElementA,
@ -176,7 +186,7 @@ class B2bGemm {
kStages,
kSplitKSerial,
Operator,
kIsBetaZero
SmemAccumulator
>::B2bGemmKernel;
/// Argument structure
@ -191,6 +201,8 @@ class B2bGemm {
TensorRef<ElementA const, LayoutA> ref_A0;
TensorRef<ElementB const, LayoutB> ref_B0;
TensorRef<ElementC const, LayoutC> ref_C0;
TensorRef<ElementScaleBias const, LayoutScaleBias> ref_Scale0;
TensorRef<ElementScaleBias const, LayoutScaleBias> ref_Bias0;
TensorRef<ElementB const, LayoutB> ref_B1;
TensorRef<ElementC const, LayoutC> ref_C1;
TensorRef<ElementC, LayoutC> ref_D1;
@ -216,6 +228,8 @@ class B2bGemm {
TensorRef<ElementA const, LayoutA> ref_A0_,
TensorRef<ElementB const, LayoutB> ref_B0_,
TensorRef<ElementC const, LayoutC> ref_C0_,
TensorRef<ElementScaleBias const, LayoutScaleBias> ref_Scale0_,
TensorRef<ElementScaleBias const, LayoutScaleBias> ref_Bias0_,
TensorRef<ElementB const, LayoutB> ref_B1_,
TensorRef<ElementC const, LayoutC> ref_C1_,
TensorRef<ElementC, LayoutC> ref_D1_,
@ -230,6 +244,8 @@ class B2bGemm {
ref_A0(ref_A0_),
ref_B0(ref_B0_),
ref_C0(ref_C0_),
ref_Scale0(ref_Scale0_),
ref_Bias0(ref_Bias0_),
ref_B1(ref_B1_),
ref_C1(ref_C1_),
ref_D1(ref_D1_),
@ -342,6 +358,8 @@ public:
args.ref_A0.non_const_ref(),
args.ref_B0.non_const_ref(),
args.ref_C0.non_const_ref(),
args.ref_Scale0.non_const_ref(),
args.ref_Bias0.non_const_ref(),
args.ref_B1.non_const_ref(),
args.ref_C1.non_const_ref(),
args.ref_D1,
@ -362,12 +380,14 @@ public:
}
}
params_.ref_A0.reset(args.ref_A.non_const_ref().data());
params_.ref_B0.reset(args.ref_B.non_const_ref().data());
params_.ref_C0.reset(args.ref_C.non_const_ref().data());
params_.ref_B1.reset(args.ref_B.non_const_ref().data());
params_.ref_C1.reset(args.ref_C.non_const_ref().data());
params_.ref_D1.reset(args.ref_D.data());
params_.ref_A0.reset(args.ref_A0.non_const_ref().data());
params_.ref_B0.reset(args.ref_B0.non_const_ref().data());
params_.ref_C0.reset(args.ref_C0.non_const_ref().data());
params_.ref_Scale0.reset(args.ref_Scale0.non_const_ref().data());
params_.ref_Bias0.reset(args.ref_Bias0.non_const_ref().data());
params_.ref_B1.reset(args.ref_B1.non_const_ref().data());
params_.ref_C1.reset(args.ref_C1.non_const_ref().data());
params_.ref_D1.reset(args.ref_D1.data());
params_.output_op_0 = args.epilogue0;
params_.output_op_1 = args.epilogue1;
params_.semaphore = static_cast<int *>(workspace);
@ -394,14 +414,6 @@ public:
if (result != cudaSuccess) {
return Status::kErrorInternal;
}
result = cudaFuncSetAttribute(
Kernel<B2bGemmKernel>,
cudaFuncAttributePreferredSharedMemoryCarveout, 100);
if (result != cudaSuccess) {
return Status::kErrorInternal;
}
}
cutlass::Kernel<B2bGemmKernel><<<grid, block, smem_size, stream>>>(params_);
@ -422,7 +434,7 @@ public:
void *workspace = nullptr,
cudaStream_t stream = nullptr) {
Status status = initialize(args, workspace);
Status status = initialize(args, workspace, stream);
if (status == Status::kSuccess) {
status = run(stream);

View File

@ -0,0 +1,300 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Template for a device-level back-to-back (B2B) fused Implicit GEMM Convolution
*/
#pragma once
#include <limits>
#include "cutlass/cutlass.h"
#include "cutlass/device_kernel.h"
#include "cutlass/conv/convolution.h"
#include "kernel/b2b_implicit_gemm_convolution.h"
#include "kernel/default_b2b_conv2d_fprop.h"
#include "kernel/default_b2b_conv2d_fprop_sm75.h"
#include "kernel/default_b2b_conv2d_fprop_sm80.h"
#include "kernel/default_b2b_conv2d_fprop_smem_accumulator_sm75.h"
#include "kernel/default_b2b_conv2d_fprop_smem_accumulator_sm80.h"
namespace cutlass {
namespace conv {
namespace device {
template<typename B2bImplicitGemmKernel_>
class B2bImplicitGemmConvolution {
public:
using B2bImplicitGemmKernel = B2bImplicitGemmKernel_;
using ElementA = typename B2bImplicitGemmKernel::ElementA;
using LayoutA = typename B2bImplicitGemmKernel::LayoutA;
using ElementB = typename B2bImplicitGemmKernel::ElementB;
using LayoutB = typename B2bImplicitGemmKernel::LayoutB;
using ElementC = typename B2bImplicitGemmKernel::ElementC;
using LayoutC = typename B2bImplicitGemmKernel::LayoutC;
using ElementAccumulator = typename B2bImplicitGemmKernel::ElementAccumulator;
using ElementCompute = typename B2bImplicitGemmKernel::ElementCompute;
using ElementScaleBias = typename B2bImplicitGemmKernel::ElementScaleBias;
using LayoutScaleBias = typename B2bImplicitGemmKernel::LayoutScaleBias;
using OperatorClass = typename B2bImplicitGemmKernel::OperatorClass;
using ArchTag = typename B2bImplicitGemmKernel::ArchTag;
using ThreadblockShape0 = typename B2bImplicitGemmKernel::ThreadblockShape0;
using ThreadblockShape1 = typename B2bImplicitGemmKernel::ThreadblockShape1;
using WarpShape0 = typename B2bImplicitGemmKernel::WarpShape0;
using WarpShape1 = typename B2bImplicitGemmKernel::WarpShape1;
using InstructionShape = typename B2bImplicitGemmKernel::InstructionShape;
using ThreadblockSwizzle = typename B2bImplicitGemmKernel::ThreadblockSwizzle;
using EpilogueOutputOp0 = typename B2bImplicitGemmKernel::EpilogueOutputOp0;
using EpilogueOutputOp1 = typename B2bImplicitGemmKernel::EpilogueOutputOp1;
static int const kStages = B2bImplicitGemmKernel::kStages;
static int const kConvDim = B2bImplicitGemmKernel::kConvDim;
using WarpMmaOperator0 = typename B2bImplicitGemmKernel::WarpMmaOperator0;
using WarpMmaOperator1 = typename B2bImplicitGemmKernel::WarpMmaOperator1;
using ArchMmaOperator = typename B2bImplicitGemmKernel::ArchMmaOperator;
using MathOperator = typename B2bImplicitGemmKernel::MathOperator;
static cutlass::conv::Operator const kConvolutionalOperator = B2bImplicitGemmKernel::kConvolutionalOperator;
static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = B2bImplicitGemmKernel::kIteratorAlgorithm;
static int const kWarpCount =
(ThreadblockShape0::kM / WarpShape0::kM) *
(ThreadblockShape0::kN / WarpShape0::kN);
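// Warps per threadblock, derived from the first stage's threadblock and warp tile shapes.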
/// Argument structure
using Arguments = typename B2bImplicitGemmKernel::Arguments;
private:
/// Kernel parameters object
typename B2bImplicitGemmKernel::Params params_;
public:
/// Constructs Implicit GEMM
B2bImplicitGemmConvolution() { }
/// Determines whether the Implicit GEMM can execute the given problem.
static Status can_implement(Arguments const &args) {
// dispatch to iterators
Status status = B2bImplicitGemmKernel::B2bMma::IteratorA0::can_implement(args.problem_size_0);
if (Status::kSuccess != status) {
return status;
}
status = B2bImplicitGemmKernel::B2bMma::IteratorB0::can_implement(args.problem_size_0);
if (Status::kSuccess != status) {
return status;
}
status = B2bImplicitGemmKernel::B2bMma::IteratorB1::can_implement(args.problem_size_1);
if (Status::kSuccess != status) {
return status;
}
// Determine grid shape
ThreadblockSwizzle threadblock_swizzle;
dim3 grid = threadblock_swizzle.get_grid_shape(
threadblock_swizzle.get_tiled_shape(
cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_0),
{ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK},
args.problem_size_0.split_k_slices));
if (!(grid.y <= std::numeric_limits<uint16_t>::max() &&
grid.z <= std::numeric_limits<uint16_t>::max())) {
return Status::kErrorInvalidProblem;
}
// Determine if fusion sizes are valid
cutlass::gemm::GemmCoord problem_size_0 = implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_0);
cutlass::gemm::GemmCoord problem_size_1 = implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_1);
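// B2B fusion constraints: both stages share the GEMM M extent, N0 must equal K1, the second convolution must use a 1x1 filter, and each stage's N extent must fit within a single threadblock tile.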
if(problem_size_0.m() != problem_size_1.m())
return Status::kErrorInvalidProblem;
if(problem_size_0.n() != problem_size_1.k())
return Status::kErrorInvalidProblem;
if(args.problem_size_1.R != 1 || args.problem_size_1.S != 1)
return Status::kErrorInvalidProblem;
if(problem_size_0.n() > ThreadblockShape0::kN)
return Status::kErrorInvalidProblem;
if(problem_size_1.n() > ThreadblockShape1::kN)
return Status::kErrorInvalidProblem;
return Status::kSuccess;
}
/// Gets the workspace size
static size_t get_workspace_size(Arguments const &args) {
size_t workspace_bytes = 0;
// Determine grid shape
ThreadblockSwizzle threadblock_swizzle;
cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_0),
{ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK},
args.problem_size_0.split_k_slices);
if(args.split_k_mode == SplitKMode::kParallel) {
// Split-K parallel: CTAs in k-dimension write the partial results in a temporary workspace.
// The user needs to call a reduction operator to obtain the final output tensor
workspace_bytes =
sizeof(ElementAccumulator) *
size_t(cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, args.problem_size_0)) *
size_t(grid_tiled_shape.k());
}
else if(args.split_k_mode == SplitKMode::kSerial && args.problem_size_0.split_k_slices > 1) {
// Split-K serial: The user workspace is used to store the semaphore and serialize writing the
// final reduced output to the user's output tensor
workspace_bytes = sizeof(int) * size_t(grid_tiled_shape.m()) * size_t(grid_tiled_shape.n());
}
return workspace_bytes;
}
/// Initializes GEMM state from arguments.
Status initialize(
Arguments const &args,
void *workspace = nullptr,
cudaStream_t stream = nullptr) {
if (args.problem_size_0.split_k_slices > 1) {
if (!workspace) {
return Status::kErrorWorkspaceNull;
}
cudaError_t status = cudaMemsetAsync(workspace, 0, get_workspace_size(args), stream);
if (status != cudaSuccess) {
return Status::kErrorInternal;
}
}
// initialize the params structure from the arguments
params_ = typename B2bImplicitGemmKernel::Params(
args,
static_cast<int *>(workspace)
);
int smem_size = int(sizeof(typename B2bImplicitGemmKernel::SharedStorage));
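// Kernels requiring more than 48 KB of dynamic shared memory must opt in via cudaFuncSetAttribute.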
if (smem_size >= (48 << 10)) {
cudaError_t result = cudaFuncSetAttribute(cutlass::Kernel<B2bImplicitGemmKernel>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
smem_size);
if (result != cudaSuccess) {
return Status::kErrorInternal;
}
}
return Status::kSuccess;
}
/// Initializes GEMM state from arguments.
Status update(Arguments const &args, void *workspace = nullptr) {
// update the params structure from the arguments
params_.ptr_A0 = args.ref_A0.data();
params_.ptr_B0 = args.ref_B0.data();
params_.ptr_C0 = args.ref_C0.data();
params_.ptr_Scale0 = args.ref_Scale0.data();
params_.ptr_Bias0 = args.ref_Bias0.data();
params_.ptr_B1 = args.ref_B1.data();
params_.ptr_C1 = args.ref_C1.data();
params_.ptr_D1 = args.ref_D1.data();
params_.output_op_0 = args.output_op_0;
params_.output_op_1 = args.output_op_1;
params_.semaphore = static_cast<int *>(workspace);
return Status::kSuccess;
}
/// Runs the kernel using initialized state.
Status run(cudaStream_t stream = nullptr) {
ThreadblockSwizzle threadblock_swizzle;
dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
dim3 block(32 * kWarpCount, 1, 1);
int smem_size = int(sizeof(typename B2bImplicitGemmKernel::SharedStorage));
cutlass::Kernel<B2bImplicitGemmKernel><<<grid, block, smem_size, stream>>>(params_);
cudaError_t result = cudaGetLastError();
return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
}
/// Runs the kernel using initialized state.
Status operator()(cudaStream_t stream = nullptr) {
return run(stream);
}
/// Runs the kernel using initialized state.
Status operator()(
Arguments const &args,
void *workspace = nullptr,
cudaStream_t stream = nullptr) {
Status status = initialize(args, workspace, stream);
if (status == Status::kSuccess) {
status = run(stream);
}
return status;
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace device
} // namespace conv
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,234 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "device/b2b_implicit_gemm_convolution.h"
#include "b2b_conv2d_run.h"
#include "test_run.h"
////////////////////////////////////////////////////////////////////////////////
cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_0 (
{32, 56, 56, 64}, // input size (NHWC)
{64, 3, 3, 64}, // filter size (KRSC)
{1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{32, 56, 56, 64} // output size (NPQK)
);
cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_1 (
{32, 56, 56, 64}, // input size (NHWC)
{128, 1, 1, 64}, // filter size (KRSC)
{0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{32, 56, 56, 128} // output size (NPQK)
);
bool run_nonfused_conv2d_fprop_optimized_f16_sm75() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(1); //beta=1 for bias
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
B2bNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1> nonFusedConv2d;
std::cout << "Running Non-fused back-to-back FP16 Optimized Convolution Fprops...\n";
bool pass = nonFusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
bool run_fused_conv2d_fprop_optimized_f16_sm75_rf_res() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
//Fused kernel has built-in bias, setting beta=0
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //use beta for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<16, 64, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<16, 128, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
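// The first epilogue's vector width (kM * kN / 32) is the number of accumulator elements each thread of a warp holds for one Tensor Core MMA tile, which lets the intermediate result stay in registers.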
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>;
const bool SmemAccumulator = false;
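// false: keep the first stage's accumulator in the register file (RF residency) instead of staging it through shared memory.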
using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized,
SmemAccumulator
>::Kernel;
using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
B2bFusedConv2dRun<B2bConv2dFprop> fusedConv2d;
std::cout << "Running Fused back-to-back FP16 Optimized Convolution Fprops with RF Residency...\n";
bool pass = fusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
int main() {
std::vector<bool (*)()>funcs = {
&run_nonfused_conv2d_fprop_optimized_f16_sm75,
&run_fused_conv2d_fprop_optimized_f16_sm75_rf_res
};
return testRun(75, funcs, "conv f16 RF residency");
}
////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,234 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "device/b2b_implicit_gemm_convolution.h"
#include "b2b_conv2d_run.h"
#include "test_run.h"
////////////////////////////////////////////////////////////////////////////////
cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_0 (
{32, 56, 56, 64}, // input size (NHWC)
{64, 3, 3, 64}, // filter size (KRSC)
{1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{32, 56, 56, 64} // output size (NPQK)
);
cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_1 (
{32, 56, 56, 64}, // input size (NHWC)
{256, 1, 1, 64}, // filter size (KRSC)
{0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{32, 56, 56, 256} // output size (NPQK)
);
bool run_nonfused_conv2d_fprop_optimized_f16_sm75() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(1); //beta=1 for bias
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
B2bNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1> nonFusedConv2d;
std::cout << "Running Non-fused back-to-back FP16 Optimized Convolution Fprops...\n";
bool pass = nonFusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
bool run_fused_conv2d_fprop_optimized_f16_sm75_shmem() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
//Fused kernel has built-in bias, setting beta=0
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>;
const bool SmemAccumulator = true;
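// true: stage the first stage's accumulator through shared memory between the two fused convolutions.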
using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized,
SmemAccumulator
>::Kernel;
using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
B2bFusedConv2dRun<B2bConv2dFprop> fusedConv2d;
std::cout << "Running Fused back-to-back FP16 Optimized Convolution Fprops with shared memory staging...\n";
bool pass = fusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
int main() {
std::vector<bool (*)()>funcs = {
&run_nonfused_conv2d_fprop_optimized_f16_sm75,
&run_fused_conv2d_fprop_optimized_f16_sm75_shmem
};
return testRun(75, funcs, "conv f16 shmem staging");
}
////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,233 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "device/b2b_implicit_gemm_convolution.h"
#include "b2b_conv2d_run.h"
#include "test_run.h"
////////////////////////////////////////////////////////////////////////////////
cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_0 (
{32, 56, 56, 64}, // input size (NHWC)
{64, 3, 3, 64}, // filter size (KRSC)
{1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{32, 56, 56, 64} // output size (NPQK)
);
cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_1 (
{32, 56, 56, 64}, // input size (NHWC)
{128, 1, 1, 64}, // filter size (KRSC)
{0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{32, 56, 56, 128} // output size (NPQK)
);
bool run_nonfused_conv2d_fprop_optimized_f16_sm80() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(1); //beta=1 for bias
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
B2bNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1> nonFusedConv2d;
std::cout << "Running Non-fused back-to-back FP16 Optimized Convolution Fprops...\n";
bool pass = nonFusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
bool run_fused_conv2d_fprop_optimized_f16_sm80_rf_res() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
//Fused kernel has built-in bias, setting beta=0
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 128, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>;
using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
B2bFusedConv2dRun<B2bConv2dFprop> fusedConv2d;
std::cout << "Running Fused back-to-back FP16 Optimized Convolution Fprops with RF Residency...\n";
bool pass = fusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
int main() {
std::vector<bool (*)()>funcs = {
&run_nonfused_conv2d_fprop_optimized_f16_sm80,
&run_fused_conv2d_fprop_optimized_f16_sm80_rf_res
};
return testRun(80, funcs, "conv f16 RF residency");
}
////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,236 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "device/b2b_implicit_gemm_convolution.h"
#include "b2b_conv2d_run.h"
#include "test_run.h"
////////////////////////////////////////////////////////////////////////////////
cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_0 (
{32, 56, 56, 64}, // input size (NHWC)
{64, 3, 3, 64}, // filter size (KRSC)
{1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{32, 56, 56, 64} // output size (NPQK)
);
cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_1 (
{32, 56, 56, 64}, // input size (NHWC)
{256, 1, 1, 64}, // filter size (KRSC)
{0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{32, 56, 56, 256} // output size (NPQK)
);
bool run_nonfused_conv2d_fprop_optimized_f16_sm80() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(1); //beta=1 for bias
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
B2bNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1> nonFusedConv2d;
std::cout << "Running Non-fused back-to-back FP16 Optimized Convolution Fprops...\n";
bool pass = nonFusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
bool run_fused_conv2d_fprop_optimized_f16_sm80_shmem() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
//Fused kernel has built-in bias, setting beta=0
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>;
const bool SmemAccumulator = true;
using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized,
SmemAccumulator
>::Kernel;
using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
B2bFusedConv2dRun<B2bConv2dFprop> fusedConv2d;
std::cout << "Running Fused back-to-back FP16 Optimized Convolution Fprops with shared memory staging...\n";
bool pass = fusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
int main() {
std::vector<bool (*)()>funcs = {
&run_nonfused_conv2d_fprop_optimized_f16_sm80,
&run_fused_conv2d_fprop_optimized_f16_sm80_shmem
};
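// The first argument to testRun is the compute capability these kernels target (SM80).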
return testRun(80, funcs, "conv f16 shmem staging");
}
////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,238 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "device/b2b_implicit_gemm_convolution.h"
#include "b2b_interleaved_conv2d_run.h"
#include "test_run.h"
////////////////////////////////////////////////////////////////////////////////
cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_0 (
{32, 56, 56, 64}, // input size (NHWC)
{64, 3, 3, 64}, // filter size (KRSC)
{1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{32, 56, 56, 64} // output size (NPQK)
);
cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_1 (
{32, 56, 56, 64}, // input size (NHWC)
{128, 1, 1, 64}, // filter size (KRSC)
{0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{32, 56, 56, 128} // output size (NPQK)
);
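// The INT8 kernels below use the channel-interleaved layouts TensorNCxHWx<32> and
// TensorCxRSKx<32>, which pack 32 channels contiguously per interleave group.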
bool run_nonfused_conv2d_fprop_optimized_s8_sm75() {
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(1); //beta=1 for bias
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
B2bInterleavedNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1, 32> nonFusedConv2d;
std::cout << "Running Non-fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n";
bool pass = nonFusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
bool run_fused_conv2d_fprop_optimized_s8_sm75_rf_res() {
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
// The fused kernel adds the bias internally, so beta0 is set to 0 here.
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<16, 64, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<16, 128, 32>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>;
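// RF-residency variant: keep the first convolution's accumulators in registers
// (no shared-memory staging) while they are consumed by the second convolution.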
const bool SmemAccumulator = false;
using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kOptimized,
SmemAccumulator
>::Kernel;
using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
B2bInterleavedFusedConv2dRun<B2bConv2dFprop, 32> fusedConv2d;
std::cout << "Running Fused back-to-back INT8 interleaved Optimized Convolution Fprops with RF residency...\n";
bool pass = fusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
int main() {
std::vector<bool (*)()>funcs = {
&run_nonfused_conv2d_fprop_optimized_s8_sm75,
&run_fused_conv2d_fprop_optimized_s8_sm75_rf_res
};
return testRun(75, funcs, "conv int8 RF residency");
}
////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,238 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "device/b2b_implicit_gemm_convolution.h"
#include "b2b_interleaved_conv2d_run.h"
#include "test_run.h"
////////////////////////////////////////////////////////////////////////////////
cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_0 (
{32, 56, 56, 64}, // input size (NHWC)
{64, 3, 3, 64}, // filter size (KRSC)
{1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{32, 56, 56, 64} // output size (NPQK)
);
cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_1 (
{32, 56, 56, 64}, // input size (NHWC)
{256, 1, 1, 64}, // filter size (KRSC)
{0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{32, 56, 56, 256} // output size (NPQK)
);
bool run_nonfused_conv2d_fprop_optimized_s8_sm75() {
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(1); //beta=1 for bias
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
B2bInterleavedNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1, 32> nonFusedConv2d;
std::cout << "Running Non-fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n";
bool pass = nonFusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
bool run_fused_conv2d_fprop_optimized_s8_sm75_shmem() {
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
// The fused kernel adds the bias internally, so beta0 is set to 0 here.
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>;
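// Shared-memory staging variant: the first convolution's accumulators are written to
// shared memory and re-read as the activation (A) operand of the second convolution.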
const bool SmemAccumulator = true;
using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kOptimized,
SmemAccumulator
>::Kernel;
using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
B2bInterleavedFusedConv2dRun<B2bConv2dFprop, 32> fusedConv2d;
std::cout << "Running Fused back-to-back INT8 interleaved Optimized Convolution Fprops with shared memory staging...\n";
bool pass = fusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
int main() {
std::vector<bool (*)()>funcs = {
&run_nonfused_conv2d_fprop_optimized_s8_sm75,
&run_fused_conv2d_fprop_optimized_s8_sm75_shmem
};
return testRun(75, funcs, "conv int8 shmem staging");
}
////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,236 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "device/b2b_implicit_gemm_convolution.h"
#include "b2b_interleaved_conv2d_run.h"
#include "test_run.h"
////////////////////////////////////////////////////////////////////////////////
cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_0 (
{32, 56, 56, 64}, // input size (NHWC)
{64, 3, 3, 64}, // filter size (KRSC)
{1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{32, 56, 56, 64} // output size (NPQK)
);
cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_1 (
{32, 56, 56, 64}, // input size (NHWC)
{128, 1, 1, 64}, // filter size (KRSC)
{0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{32, 56, 56, 128} // output size (NPQK)
);
bool run_nonfused_conv2d_fprop_optimized_s8_sm80() {
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(1); //beta=1 for bias
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
B2bInterleavedNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1, 32> nonFusedConv2d;
std::cout << "Running Non-fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n";
bool pass = nonFusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
bool run_fused_conv2d_fprop_optimized_s8_sm80_rf_res() {
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
// The fused kernel adds the bias internally, so beta0 is set to 0 here.
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<16, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<16, 128, 64>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
8 * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>;
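// SmemAccumulator is omitted here, so the kernel uses its default register-file
// resident path, matching the "RF residency" message printed below.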
using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
B2bInterleavedFusedConv2dRun<B2bConv2dFprop, 32> fusedConv2d;
std::cout << "Running Fused back-to-back INT8 interleaved Optimized Convolution Fprops with RF residency...\n";
bool pass = fusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
int main() {
std::vector<bool (*)()>funcs = {
&run_nonfused_conv2d_fprop_optimized_s8_sm80,
&run_fused_conv2d_fprop_optimized_s8_sm80_rf_res
};
return testRun(80, funcs, "conv int8 RF residency");
}
////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,237 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "device/b2b_implicit_gemm_convolution.h"
#include "b2b_interleaved_conv2d_run.h"
#include "test_run.h"
////////////////////////////////////////////////////////////////////////////////
cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_0 (
{32, 56, 56, 64}, // input size (NHWC)
{64, 3, 3, 64}, // filter size (KRSC)
{1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{32, 56, 56, 64} // output size (NPQK)
);
cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_1 (
{32, 56, 56, 64}, // input size (NHWC)
{256, 1, 1, 64}, // filter size (KRSC)
{0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{32, 56, 56, 256} // output size (NPQK)
);
bool run_nonfused_conv2d_fprop_optimized_s8_sm80() {
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(1); //beta=1 for bias
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
B2bInterleavedNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1, 32> nonFusedConv2d;
std::cout << "Running Non-fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n";
bool pass = nonFusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
bool run_fused_conv2d_fprop_optimized_s8_sm80_shmem() {
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
// The fused kernel adds the bias internally, so beta0 is set to 0 here.
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
8 * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>;
const bool SmemAccumulator = true;
using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kOptimized,
SmemAccumulator
>::Kernel;
using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
B2bInterleavedFusedConv2dRun<B2bConv2dFprop, 32> fusedConv2d;
std::cout << "Running Fused back-to-back INT8 interleaved Optimized Convolution Fprops with shared memory staging...\n";
bool pass = fusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
int main() {
std::vector<bool (*)()>funcs = {
&run_nonfused_conv2d_fprop_optimized_s8_sm80,
&run_fused_conv2d_fprop_optimized_s8_sm80_shmem
};
return testRun(80, funcs, "conv int8 shmem staging");
}
////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,210 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/gemm.h"
#include "device/b2b_gemm.h"
#include "b2b_gemm_run.h"
#include "test_run.h"
////////////////////////////////////////////////////////////////////////////////
cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_0(128*640, 64, 576);
cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_1(128*640, 128, 64);
bool run_nonfused_gemm_f16() {
using ElementOutput = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(1); //beta = 1 for bias
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta = 1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
using Gemm0 = cutlass::gemm::device::Gemm<
cutlass::half_t,
cutlass::layout::RowMajor,
cutlass::half_t,
cutlass::layout::ColumnMajor,
ElementOutput,
cutlass::layout::RowMajor,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
128 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2
>;
using Gemm1 = cutlass::gemm::device::Gemm<
cutlass::half_t,
cutlass::layout::RowMajor,
cutlass::half_t,
cutlass::layout::ColumnMajor,
ElementOutput,
cutlass::layout::RowMajor,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
128 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2
>;
B2bNonFusedGemmRun<Gemm0, Gemm1> nonFusedGemm;
std::cout << "Running Non-fused back-to-back FP16 TN GEMMs...\n";
bool pass = nonFusedGemm.run(gemm_f16_sm75_problem_size_0, gemm_f16_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
bool run_fused_gemm_f16_rf_res() {
using ElementOutput = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
// The fused kernel adds the bias internally, so beta0 is set to 0 here.
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 128, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
128 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>;
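// B2bGemm runs both GEMMs inside one kernel; with no shared-memory staging requested,
// the first GEMM's output remains in the register file ("RF Residency" below).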
using B2bGemm = cutlass::gemm::device::B2bGemm<
cutlass::half_t,
cutlass::layout::RowMajor,
cutlass::half_t,
cutlass::layout::ColumnMajor,
ElementOutput,
cutlass::layout::RowMajor,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2
>;
B2bFusedGemmRun<B2bGemm> fusedGemm;
std::cout << "Running Fused back-to-back FP16 TN GEMMs with RF Residency...\n";
bool passed = fusedGemm.run(gemm_f16_sm75_problem_size_0, gemm_f16_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
if(passed)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return passed;
}
int main() {
std::vector<bool (*)()>funcs = {
&run_nonfused_gemm_f16,
&run_fused_gemm_f16_rf_res
};
return testRun(75, funcs, "gemm f16 RF residency");
}
///////////////////////////////////////////////////////////////////////////////

View File

@ -1,29 +1,33 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#include <iostream>
#include "cutlass/cutlass.h"
@ -38,26 +42,26 @@
#include "device/b2b_gemm.h"
#include "b2b_gemm_run.h"
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
#include "test_run.h"
////////////////////////////////////////////////////////////////////////////////
void run_nonfused_gemm_f16() {
cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_0(128*640, 64, 576);
cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_1(128*640, 256, 64);
bool run_nonfused_gemm_f16() {
using ElementOutput = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64);
ElementCompute alpha0 = ElementCompute(2);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(2);
ElementCompute beta1 = ElementCompute(1);
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(1); //beta = 1 for bias
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta = 1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
@ -79,7 +83,8 @@ void run_nonfused_gemm_f16() {
ElementOutput,
128 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2
@ -101,7 +106,8 @@ void run_nonfused_gemm_f16() {
ElementOutput,
128 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2
@ -110,30 +116,31 @@ void run_nonfused_gemm_f16() {
B2bNonFusedGemmRun<Gemm0, Gemm1> nonFusedGemm;
std::cout << "Running Non-fused back-to-back FP16 TN GEMMs...\n";
bool pass = nonFusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1);
bool pass = nonFusedGemm.run(gemm_f16_sm75_problem_size_0, gemm_f16_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
void run_fused_gemm_f16() {
bool run_fused_gemm_f16_shmem() {
using ElementOutput = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64);
ElementCompute alpha0 = ElementCompute(2);
ElementCompute alpha0 = ElementCompute(1);
// The fused kernel adds the bias internally, so beta0 is set to 0 here.
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(2);
ElementCompute beta1 = ElementCompute(1);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 128, 32>;
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
using EpilogueOutputOp0 =
@ -141,7 +148,8 @@ void run_fused_gemm_f16() {
ElementOutput,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -149,10 +157,12 @@ void run_fused_gemm_f16() {
ElementOutput,
128 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>;
const bool SmemAccumulator = true;
using B2bGemm = cutlass::gemm::device::B2bGemm<
cutlass::half_t,
@ -172,19 +182,33 @@ void run_fused_gemm_f16() {
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2
2,
SmemAccumulator
>;
B2bFusedGemmRun<B2bGemm> fusedGemm;
std::cout << "Running Fused back-to-back FP16 TN GEMMs...\n";
bool passed = fusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1);
std::cout << "Running Fused back-to-back FP16 TN GEMMs with shared memory staging...\n";
bool passed = fusedGemm.run(gemm_f16_sm75_problem_size_0, gemm_f16_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
if(passed)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return passed;
}
////////////////////////////////////////////////////////////////////////////////
#endif //#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
int main() {
std::vector<bool (*)()>funcs = {
&run_nonfused_gemm_f16,
&run_fused_gemm_f16_shmem
};
return testRun(75, funcs, "gemm f16 shmem staging");
}
////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,213 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/gemm.h"
#include "device/b2b_gemm.h"
#include "b2b_gemm_run.h"
#include "test_run.h"
////////////////////////////////////////////////////////////////////////////////
cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_0(128*640, 64, 576);
cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_1(128*640, 128, 64);
bool run_nonfused_gemm_f16_sm80() {
using ElementOutput = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(1); //beta=1 for bias
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
using Gemm0 = cutlass::gemm::device::Gemm<
cutlass::half_t,
cutlass::layout::RowMajor,
cutlass::half_t,
cutlass::layout::ColumnMajor,
ElementOutput,
cutlass::layout::RowMajor,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
128 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3
>;
using Gemm1 = cutlass::gemm::device::Gemm<
cutlass::half_t,
cutlass::layout::RowMajor,
cutlass::half_t,
cutlass::layout::ColumnMajor,
ElementOutput,
cutlass::layout::RowMajor,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
128 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3
>;
B2bNonFusedGemmRun<Gemm0, Gemm1> nonFusedGemm;
std::cout << "Running Non-fused back-to-back FP16 TN GEMMs...\n";
bool pass = nonFusedGemm.run(gemm_f16_sm80_problem_size_0, gemm_f16_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
bool run_fused_gemm_f16_sm80_rf_res() {
using ElementOutput = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
// The fused kernel adds the bias internally, so beta0 is set to 0 here.
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<16, 64, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<16, 128, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
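// Note: each warp tile spans the full threadblock tile in the N dimension (64 and 128),
// so every warp can keep its slice of the intermediate result entirely in its own
// registers for the RF-resident fusion.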
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
128 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>;
using B2bGemm = cutlass::gemm::device::B2bGemm<
cutlass::half_t,
cutlass::layout::RowMajor,
cutlass::half_t,
cutlass::layout::ColumnMajor,
ElementOutput,
cutlass::layout::RowMajor,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3
>;
B2bFusedGemmRun<B2bGemm> fusedGemm;
std::cout << "Running Fused back-to-back FP16 TN GEMMs with RF residency...\n";
bool passed = fusedGemm.run(gemm_f16_sm80_problem_size_0, gemm_f16_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
if(passed)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return passed;
}
int main() {
std::vector<bool (*)()>funcs = {
&run_nonfused_gemm_f16_sm80,
&run_fused_gemm_f16_sm80_rf_res
};
return testRun(80, funcs, "gemm f16 RF residency");
}
////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,217 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/gemm.h"
#include "device/b2b_gemm.h"
#include "b2b_gemm_run.h"
#include "test_run.h"
////////////////////////////////////////////////////////////////////////////////
cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_0(128*640, 64, 576);
cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_1(128*640, 256, 64);
bool run_nonfused_gemm_f16_sm80() {
using ElementOutput = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(1); //beta=1 for bias
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
using Gemm0 = cutlass::gemm::device::Gemm<
cutlass::half_t,
cutlass::layout::RowMajor,
cutlass::half_t,
cutlass::layout::ColumnMajor,
ElementOutput,
cutlass::layout::RowMajor,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
128 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3
>;
using Gemm1 = cutlass::gemm::device::Gemm<
cutlass::half_t,
cutlass::layout::RowMajor,
cutlass::half_t,
cutlass::layout::ColumnMajor,
ElementOutput,
cutlass::layout::RowMajor,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
128 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3
>;
B2bNonFusedGemmRun<Gemm0, Gemm1> nonFusedGemm;
std::cout << "Running Non-fused back-to-back FP16 TN GEMMs...\n";
bool pass = nonFusedGemm.run(gemm_f16_sm80_problem_size_0, gemm_f16_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
bool run_fused_gemm_f16_sm80_shmem() {
using ElementOutput = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
//Fused kernel has built-in bias, setting beta=0
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
128 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>;
const bool SmemAccumulator = true;
using B2bGemm = cutlass::gemm::device::B2bGemm<
cutlass::half_t,
cutlass::layout::RowMajor,
cutlass::half_t,
cutlass::layout::ColumnMajor,
ElementOutput,
cutlass::layout::RowMajor,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
SmemAccumulator
>;
B2bFusedGemmRun<B2bGemm> fusedGemm;
std::cout << "Running Fused back-to-back FP16 TN GEMMs with shared memory staging...\n";
bool passed = fusedGemm.run(gemm_f16_sm80_problem_size_0, gemm_f16_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
if(passed)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return passed;
}
int main() {
std::vector<bool (*)()>funcs = {
&run_nonfused_gemm_f16_sm80,
&run_fused_gemm_f16_sm80_shmem
};
return testRun(80, funcs, "gemm f16 shmem staging");
}
////////////////////////////////////////////////////////////////////////////////
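Comparing the two fused FP16 variants above: the RF-residency version gives each warp of the first GEMM the full N extent of its threadblock tile, while this shared-memory version uses a smaller warp tile and passes the extra `SmemAccumulator` template argument to stage the intermediate tile through shared memory instead. A small self-contained check of that shape relationship, using the values from these two files (an observation about this example pair, not a constraint stated anywhere in the diff):

```
#include "cutlass/gemm/device/gemm.h"   // brings in cutlass::gemm::GemmShape, as the examples above do

// Shapes copied from the two fused FP16 SM80 variants above.
using RfWarpShape0          = cutlass::gemm::GemmShape<16, 64, 32>;
using RfThreadblockShape0   = cutlass::gemm::GemmShape<64, 64, 32>;
using SmemWarpShape0        = cutlass::gemm::GemmShape<32, 32, 32>;
using SmemThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;

// The RF-resident variant keeps GEMM0's output in registers, so each warp's tile
// spans the full N of the threadblock tile; the shmem variant relaxes that and
// stages the intermediate through shared memory (SmemAccumulator = true).
static_assert(RfWarpShape0::kN == RfThreadblockShape0::kN,
              "RF residency: warp tile spans the full N of GEMM0");
static_assert(SmemWarpShape0::kN < SmemThreadblockShape0::kN,
              "Shmem staging: warp tile covers only part of N");
```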

View File

@@ -1,29 +1,33 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#include <iostream>
#include "cutlass/cutlass.h"
@@ -38,28 +42,28 @@
#include "device/b2b_gemm.h"
#include "b2b_interleaved_gemm_run.h"
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
#include "test_run.h"
////////////////////////////////////////////////////////////////////////////////
void run_nonfused_gemm_s8() {
cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_0(128*640, 64, 576);
cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_1(128*640, 128, 64);
bool run_nonfused_gemm_s8() {
using ElementOutput = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64);
ElementCompute alpha0 = ElementCompute(2);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(2);
ElementCompute beta1 = ElementCompute(1);
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(1); //beta = 1 for bias
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta = 1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 32, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
using Gemm0 = cutlass::gemm::device::Gemm<
@@ -79,7 +83,8 @@ void run_nonfused_gemm_s8() {
ElementOutput,
64 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2
@@ -101,7 +106,8 @@ void run_nonfused_gemm_s8() {
ElementOutput,
64 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2
@@ -110,30 +116,32 @@
B2bInterleavedNonFusedGemmRun<Gemm0, Gemm1, 32> nonFusedGemm;
std::cout << "Running Non-fused back-to-back INT8 NT interleaved GEMMs...\n";
bool pass = nonFusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1);
bool pass = nonFusedGemm.run(gemm_s8_sm75_problem_size_0, gemm_s8_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
void run_fused_gemm_s8() {
bool run_fused_gemm_s8_rf_res() {
using ElementOutput = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64);
ElementCompute alpha0 = ElementCompute(2);
ElementCompute alpha0 = ElementCompute(1);
//Fused kernel has built-in bias, setting beta=0
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(2);
ElementCompute beta1 = ElementCompute(1);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 128, 64>;
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 128, 32>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
using EpilogueOutputOp0 =
@@ -141,7 +149,8 @@ void run_fused_gemm_s8() {
ElementOutput,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@@ -149,11 +158,10 @@ void run_fused_gemm_s8() {
ElementOutput,
64 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>;
using B2bGemm = cutlass::gemm::device::B2bGemm<
int8_t,
cutlass::layout::ColumnMajorInterleaved<32>,
@@ -177,14 +185,28 @@
B2bInterleavedFusedGemmRun<B2bGemm, 32> fusedGemm;
std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs...\n";
bool passed = fusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1);
std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs with RF Residency...\n";
bool passed = fusedGemm.run(gemm_s8_sm75_problem_size_0, gemm_s8_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
if(passed)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
}
////////////////////////////////////////////////////////////////////////////////
return passed;
#endif // #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
}
int main() {
std::vector<bool (*)()>funcs = {
&run_nonfused_gemm_s8,
&run_fused_gemm_s8_rf_res
};
return testRun(75, funcs, "gemm int8 RF residency");
}
////////////////////////////////////////////////////////////////////////////////
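The functional change in this diff is the explicit `ScaleType` argument on `LinearCombinationRelu`. As a reading of those scale types (worth confirming against the `cutlass/epilogue/thread` headers rather than taking this sketch as the implementation), the per-element math of the two modes used here reduces to:

```
#include <algorithm>

// Rough per-element model of the two epilogue modes used above; a sketch of the
// intended semantics, not the library code itself.
float epilogue_only_alpha_scaling(float alpha, float accum) {
  // OnlyAlphaScaling: no source/beta term at all. The fused kernel applies its
  // per-channel bias inside the mainloop, which is why the runners set beta0 = 0.
  return std::max(0.0f, alpha * accum);
}

float epilogue_no_beta_scaling(float alpha, float accum, float source) {
  // NoBetaScaling: the source operand (the bias tensor in these examples) is added
  // unscaled, i.e. beta is effectively 1 regardless of the value passed in.
  return std::max(0.0f, alpha * accum + source);
}
```

This appears to match the alpha/beta choices in the runners: the non-fused path feeds the bias through the source tensor with `NoBetaScaling`, while the fused path uses `OnlyAlphaScaling` for GEMM0 because the bias is applied in the fused mainloop.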

View File

@@ -0,0 +1,214 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/gemm.h"
#include "device/b2b_gemm.h"
#include "b2b_interleaved_gemm_run.h"
#include "test_run.h"
////////////////////////////////////////////////////////////////////////////////
cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_0(128*640, 64, 576);
cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_1(128*640, 256, 64);
bool run_nonfused_gemm_s8() {
using ElementOutput = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(1); //beta = 1 for bias
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta = 1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
using Gemm0 = cutlass::gemm::device::Gemm<
int8_t,
cutlass::layout::ColumnMajorInterleaved<32>,
int8_t,
cutlass::layout::RowMajorInterleaved<32>,
ElementOutput,
cutlass::layout::ColumnMajorInterleaved<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
64 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2
>;
using Gemm1 = cutlass::gemm::device::Gemm<
int8_t,
cutlass::layout::ColumnMajorInterleaved<32>,
int8_t,
cutlass::layout::RowMajorInterleaved<32>,
ElementOutput,
cutlass::layout::ColumnMajorInterleaved<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
64 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2
>;
B2bInterleavedNonFusedGemmRun<Gemm0, Gemm1, 32> nonFusedGemm;
std::cout << "Running Non-fused back-to-back INT8 NT interleaved GEMMs...\n";
bool pass = nonFusedGemm.run(gemm_s8_sm75_problem_size_0, gemm_s8_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
bool run_fused_gemm_s8_shmem() {
using ElementOutput = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
//Fused kernel has built-in bias, setting beta=0
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
64 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>;
const bool SmemAccumulator = true;
using B2bGemm = cutlass::gemm::device::B2bGemm<
int8_t,
cutlass::layout::ColumnMajorInterleaved<32>,
int8_t,
cutlass::layout::RowMajorInterleaved<32>,
ElementOutput,
cutlass::layout::ColumnMajorInterleaved<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
SmemAccumulator
>;
B2bInterleavedFusedGemmRun<B2bGemm, 32> fusedGemm;
std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs with shared memory staging...\n";
bool passed = fusedGemm.run(gemm_s8_sm75_problem_size_0, gemm_s8_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
if(passed)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return passed;
}
int main() {
std::vector<bool (*)()>funcs = {
&run_nonfused_gemm_s8,
&run_fused_gemm_s8_shmem
};
return testRun(75, funcs, "gemm int8 shmem staing");
}
////////////////////////////////////////////////////////////////////////////////
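The epilogue element counts in these int8 examples are derived from the shapes rather than chosen by hand; spelled out for the values used above (plain arithmetic on numbers already shown, with 32 being the threads per warp):

```
// Elements handled per epilogue access in the int8 SM75 examples above.
constexpr int kOutputBits   = 8;                 // sizeof_bits<int8_t>::value
constexpr int kVectorLen1   = 64 / kOutputBits;  // = 8 int8 outputs per 64-bit access (EpilogueOutputOp1)
constexpr int kFragmentLen0 = 8 * 8 / 32;        // InstructionShape 8x8 tile over 32 threads = 2 (EpilogueOutputOp0)
static_assert(kVectorLen1 == 8 && kFragmentLen0 == 2, "values implied by the shapes above");
```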

View File

@@ -0,0 +1,227 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/gemm.h"
#include "device/b2b_gemm.h"
#include "b2b_interleaved_gemm_run.h"
#include "test_run.h"
////////////////////////////////////////////////////////////////////////////////
cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_0(128*640, 64, 576);
cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_1(128*640, 128, 64);
bool run_nonfused_gemm_s8_sm80() {
using ElementOutput = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(1); //beta=1 for bias
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
using Gemm0 = cutlass::gemm::device::Gemm<
int8_t,
cutlass::layout::ColumnMajorInterleaved<32>,
int8_t,
cutlass::layout::RowMajorInterleaved<32>,
ElementOutput,
cutlass::layout::ColumnMajorInterleaved<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
64 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
3,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate
>;
using Gemm1 = cutlass::gemm::device::Gemm<
int8_t,
cutlass::layout::ColumnMajorInterleaved<32>,
int8_t,
cutlass::layout::RowMajorInterleaved<32>,
ElementOutput,
cutlass::layout::ColumnMajorInterleaved<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
64 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
3,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate
>;
B2bInterleavedNonFusedGemmRun<Gemm0, Gemm1, 32> nonFusedGemm;
std::cout << "Running Non-fused back-to-back INT8 NT interleaved GEMMs...\n";
bool pass = nonFusedGemm.run(gemm_s8_sm80_problem_size_0, gemm_s8_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
bool run_fused_gemm_s8_sm80_rf_res() {
using ElementOutput = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
//Fused kernel has built-in bias, setting beta=0
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<16, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<16, 128, 64>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
8 * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
64 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>;
const bool SmemAccumulator = false;
using B2bGemm = cutlass::gemm::device::B2bGemm<
int8_t,
cutlass::layout::ColumnMajorInterleaved<32>,
int8_t,
cutlass::layout::RowMajorInterleaved<32>,
ElementOutput,
cutlass::layout::ColumnMajorInterleaved<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
3,
SmemAccumulator,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate
>;
B2bInterleavedFusedGemmRun<B2bGemm, 32> fusedGemm;
std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs with RF residency...\n";
bool passed = fusedGemm.run(gemm_s8_sm80_problem_size_0, gemm_s8_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
if(passed)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return passed;
}
int main() {
std::vector<bool (*)()>funcs = {
&run_nonfused_gemm_s8_sm80,
&run_fused_gemm_s8_sm80_rf_res
};
return testRun(80, funcs, "gemm int8 RF residency");
}
////////////////////////////////////////////////////////////////////////////////
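The SM80 instantiations of `cutlass::gemm::device::Gemm` above spell out several trailing arguments beyond the stage count that the SM75 versions leave defaulted. Named for readability below, following the CUTLASS 2.x parameter order as understood here (re-check `gemm/device/gemm.h` before relying on this annotation):

```
#include "cutlass/gemm/device/gemm.h"

// Trailing template arguments of the SM80 Gemm0 / Gemm1 instantiations above.
constexpr int  kStages       = 3;      // cp.async software pipeline depth on SM80
constexpr int  kAlignmentA   = 16;     // elements per access: 16 x int8 = 128-bit loads
constexpr int  kAlignmentB   = 16;
constexpr bool kSplitKSerial = false;  // serial split-K reduction disabled
using MathOperatorTag = cutlass::arch::OpMultiplyAddSaturate;  // int8 tensor-op MMA with saturating output conversion
```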

View File

@@ -0,0 +1,226 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/gemm.h"
#include "device/b2b_gemm.h"
#include "b2b_interleaved_gemm_run.h"
#include "test_run.h"
////////////////////////////////////////////////////////////////////////////////
cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_0(128*640, 64, 576);
cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_1(128*640, 256, 64);
bool run_nonfused_gemm_s8_sm80() {
using ElementOutput = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(1); //beta=1 for bias
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
using Gemm0 = cutlass::gemm::device::Gemm<
int8_t,
cutlass::layout::ColumnMajorInterleaved<32>,
int8_t,
cutlass::layout::RowMajorInterleaved<32>,
ElementOutput,
cutlass::layout::ColumnMajorInterleaved<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
64 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
3,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate
>;
using Gemm1 = cutlass::gemm::device::Gemm<
int8_t,
cutlass::layout::ColumnMajorInterleaved<32>,
int8_t,
cutlass::layout::RowMajorInterleaved<32>,
ElementOutput,
cutlass::layout::ColumnMajorInterleaved<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
64 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
3,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate
>;
B2bInterleavedNonFusedGemmRun<Gemm0, Gemm1, 32> nonFusedGemm;
std::cout << "Running Non-fused back-to-back INT8 NT interleaved GEMMs...\n";
bool pass = nonFusedGemm.run(gemm_s8_sm80_problem_size_0, gemm_s8_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
bool run_fused_gemm_s8_sm80_shmem() {
using ElementOutput = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
//Fused kernel has built-in bias, setting beta=0
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(1); //beta=1 for bias
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
8 * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
64 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute,
cutlass::epilogue::thread::ScaleType::NoBetaScaling
>;
const bool SmemAccumulator = true;
using B2bGemm = cutlass::gemm::device::B2bGemm<
int8_t,
cutlass::layout::ColumnMajorInterleaved<32>,
int8_t,
cutlass::layout::RowMajorInterleaved<32>,
ElementOutput,
cutlass::layout::ColumnMajorInterleaved<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
3,
SmemAccumulator,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate
>;
B2bInterleavedFusedGemmRun<B2bGemm, 32> fusedGemm;
std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs with shared memory staging...\n";
bool passed = fusedGemm.run(gemm_s8_sm80_problem_size_0, gemm_s8_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
if(passed)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return passed;
}
int main() {
std::vector<bool (*)()>funcs = {
&run_nonfused_gemm_s8_sm80,
&run_fused_gemm_s8_sm80_shmem
};
return testRun(80, funcs, "gemm int8 shmem staging");
}
////////////////////////////////////////////////////////////////////////////////

View File

@@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
@@ -66,12 +72,15 @@ struct B2bGemm {
cutlass::gemm::GemmCoord problem_size_0;
cutlass::gemm::GemmCoord problem_size_1;
cutlass::gemm::GemmCoord grid_tiled_shape;
int swizzle_log_tile;
typename B2bMma::IteratorA0::Params params_A0;
typename B2bMma::IteratorA0::TensorRef ref_A0;
typename B2bMma::IteratorB0::Params params_B0;
typename B2bMma::IteratorB0::TensorRef ref_B0;
typename Epilogue::OutputTileIterator::Params params_C0;
typename Epilogue::OutputTileIterator::TensorRef ref_C0;
typename B2bMma::IteratorAccumulatorScaleBias::TensorRef ref_Scale0;
typename B2bMma::IteratorAccumulatorScaleBias::TensorRef ref_Bias0;
typename B2bMma::IteratorB1::Params params_B1;
typename B2bMma::IteratorB1::TensorRef ref_B1;
typename Epilogue::OutputTileIterator::Params params_C1;
@@ -91,7 +100,7 @@ struct B2bGemm {
//
CUTLASS_HOST_DEVICE
Params(): semaphore(0), gemm_k_iterations_0(0), gemm_k_size_0(0),
Params(): swizzle_log_tile(0), semaphore(0), gemm_k_iterations_0(0), gemm_k_size_0(0),
gemm_k_iterations_1(0), gemm_k_size_1(0) { }
CUTLASS_HOST_DEVICE
@@ -102,6 +111,8 @@ struct B2bGemm {
typename B2bMma::IteratorA0::TensorRef ref_A0,
typename B2bMma::IteratorB0::TensorRef ref_B0,
typename Epilogue::OutputTileIterator::TensorRef ref_C0,
typename B2bMma::IteratorAccumulatorScaleBias::TensorRef ref_Scale0,
typename B2bMma::IteratorAccumulatorScaleBias::TensorRef ref_Bias0,
typename B2bMma::IteratorB1::TensorRef ref_B1,
typename Epilogue::OutputTileIterator::TensorRef ref_C1,
typename Epilogue::OutputTileIterator::TensorRef ref_D1,
@@ -112,12 +123,15 @@ struct B2bGemm {
problem_size_0(problem_size_0),
problem_size_1(problem_size_1),
grid_tiled_shape(grid_tiled_shape),
swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
params_A0(ref_A0.layout()),
ref_A0(ref_A0),
params_B0(ref_B0.layout()),
ref_B0(ref_B0),
params_C0(ref_C0.layout()),
ref_C0(ref_C0),
ref_Scale0(ref_Scale0),
ref_Bias0(ref_Bias0),
params_B1(ref_B1.layout()),
ref_B1(ref_B1),
params_C1(ref_C1.layout()),
@@ -200,6 +214,19 @@ struct B2bGemm {
return Status::kErrorMisalignedOperand;
}
// Determine if fusion sizes are valid
if(problem_size_0.m() != problem_size_1.m())
return Status::kErrorInvalidProblem;
if(problem_size_0.n() != problem_size_1.k())
return Status::kErrorInvalidProblem;
if(problem_size_0.n() > B2bMma::Shape0::kN)
return Status::kErrorInvalidProblem;
if(problem_size_1.n() > B2bMma::Shape1::kN)
return Status::kErrorInvalidProblem;
return Status::kSuccess;
}
@@ -210,7 +237,8 @@ struct B2bGemm {
// Compute threadblock location
ThreadblockSwizzle threadblock_swizzle;
cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset();
cutlass::gemm::GemmCoord threadblock_tile_offset =
threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
// Early exit if CTA is out of range
if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
@@ -283,6 +311,29 @@ struct B2bGemm {
int warp_idx = __shfl_sync(0x1f, threadIdx.x / 32, 0);
int lane_idx = threadIdx.x % 32;
// Construct iterators to accumulator scale/bias vector
typename B2bMma::IteratorAccumulatorScaleBias iterator_Scale0(
params.ref_Scale0.data(),
{1, params.problem_size_0.n()},
thread_idx,
warp_idx,
MatrixCoord(
0, threadblock_tile_offset.n() * B2bMma::Shape0::kN
)
);
typename B2bMma::IteratorAccumulatorScaleBias iterator_Bias0(
params.ref_Bias0.data(),
{1, params.problem_size_0.n()},
thread_idx,
warp_idx,
MatrixCoord(
0, threadblock_tile_offset.n() * B2bMma::Shape0::kN
)
);
//
// Main loop
//
@@ -290,7 +341,7 @@ struct B2bGemm {
OutputOp0 output_op_0(params.output_op_0);
// Construct thread-scoped matrix multiply
B2bMma b2bMma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
B2bMma b2bMma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx, params.problem_size_0.n());
typename B2bMma::FragmentC0 src_accum;
typename B2bMma::FragmentC1 accumulators;
@@ -300,7 +351,8 @@ struct B2bGemm {
if (!kSplitKSerial || gemm_k_iterations_0 > 0) {
// Compute threadblock-scoped matrix multiply-add
b2bMma(gemm_k_iterations_0, accumulators, iterator_A0, iterator_B0, iterator_B1, src_accum, output_op_0);
b2bMma(gemm_k_iterations_0, accumulators, iterator_A0, iterator_B0,
iterator_Scale0, iterator_Bias0, iterator_B1, src_accum, output_op_0);
}
//
@@ -313,7 +365,8 @@ struct B2bGemm {
// Masked tile iterators constructed from members
//
threadblock_tile_offset = threadblock_swizzle.get_tile_offset();
threadblock_tile_offset =
threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
//assume identity swizzle
MatrixCoord threadblock_offset(
@@ -333,7 +386,7 @@ struct B2bGemm {
semaphore.fetch();
// Indicate which position in a serial reduction the output operator is currently updating
output_op_1.set_k_partition(threadblock_tile_offset.k());
output_op_1.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
}
// Tile iterator loading from source tensor.
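The size checks added above, the ones returning `Status::kErrorInvalidProblem`, spell out the fusion contract: both GEMMs share M, GEMM0's N becomes GEMM1's K, and each N must fit inside a single threadblock tile so the intermediate never leaves the CTA. A host-side restatement using the int8 SM75 example sizes from this compare (a sketch that mirrors those conditions, not the kernel's own code):

```
#include <cstdio>
#include "cutlass/gemm/device/gemm.h"   // cutlass::gemm::GemmCoord, as in the examples above

// Mirrors the validity checks shown above for back-to-back fusion.
bool b2b_sizes_are_fusable(cutlass::gemm::GemmCoord const &p0,
                           cutlass::gemm::GemmCoord const &p1,
                           int threadblock_n0, int threadblock_n1) {
  return p0.m() == p1.m()           // both GEMMs cover the same rows
      && p0.n() == p1.k()           // GEMM0's output feeds GEMM1's K dimension
      && p0.n() <= threadblock_n0   // all of N0 must fit in one GEMM0 threadblock tile
      && p1.n() <= threadblock_n1;  // all of N1 must fit in one GEMM1 threadblock tile
}

int main() {
  cutlass::gemm::GemmCoord p0(128 * 640, 64, 576);
  cutlass::gemm::GemmCoord p1(128 * 640, 128, 64);
  // Threadblock N tiles in the fused SM75 RF variant above are 64 and 128.
  std::printf("fusable: %s\n", b2b_sizes_are_fusable(p0, p1, 64, 128) ? "yes" : "no");
  return 0;
}
```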

View File

@@ -0,0 +1,521 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Template for a pipelined Implicit GEMM kernel.
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/aligned_buffer.h"
#include "cutlass/array.h"
#include "cutlass/numeric_types.h"
#include "cutlass/matrix_shape.h"
#include "cutlass/semaphore.h"
#include "cutlass/tensor_ref.h"
#include "cutlass/layout/tensor.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/conv/convolution.h"
#include "cutlass/conv/conv2d_problem_size.h"
#include "cutlass/conv/conv3d_problem_size.h"
#include "cutlass/epilogue/threadblock/output_iterator_parameter.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace conv {
namespace kernel {
/////////////////////////////////////////////////////////////////////////////////////////////////
template <
typename B2bMma_, ///! Threadblock-scoped matrix multiply-accumulate
typename Epilogue_, ///! Epilogue
typename ThreadblockSwizzle_, ///! Threadblock swizzling function
conv::Operator ConvOperator, ///! Convolutional operator (Fprop, Dgrad, Wgrad)
typename ConvProblemSize_ = Conv2dProblemSize ///! Convolutional operator on 2D or 3D problem
>
struct B2bImplicitGemmConvolution {
using B2bMma = B2bMma_;
using Epilogue = Epilogue_;
using EpilogueOutputOp0 = typename B2bMma::OutputOp;
using EpilogueOutputOp1 = typename Epilogue::OutputOp;
using ThreadblockSwizzle = ThreadblockSwizzle_;
static Operator const kConvolutionalOperator = ConvOperator;
using ElementA = typename B2bMma::IteratorA0::Element;
using LayoutA = typename B2bMma::IteratorA0::Layout;
using ElementB = typename B2bMma::IteratorB0::Element;
using LayoutB = typename B2bMma::IteratorB0::Layout;
using ElementC = typename EpilogueOutputOp1::ElementOutput;
/// Set output tensor C layout
using LayoutC = LayoutA;
using ElementAccumulator = typename EpilogueOutputOp0::ElementAccumulator;
using ElementCompute = typename EpilogueOutputOp0::ElementCompute;
/// Scale and Bias
using ElementScaleBias = typename B2bMma::IteratorAccumulatorScaleBias::Element;
using LayoutScaleBias = typename B2bMma::IteratorAccumulatorScaleBias::Layout;
using WarpMmaOperator0 = typename B2bMma::Policy0::Operator;
using WarpMmaOperator1 = typename B2bMma::Policy1::Operator;
using ArchMmaOperator = typename WarpMmaOperator0::ArchMmaOperator;
using MathOperator = typename ArchMmaOperator::Operator;
using OperatorClass = typename WarpMmaOperator0::OperatorClass;
using ArchTag = typename WarpMmaOperator0::ArchTag;
using ThreadblockShape0 = typename B2bMma::Shape0;
using ThreadblockShape1 = typename B2bMma::Shape1;
using WarpShape0 = typename WarpMmaOperator0::Shape;
using WarpShape1 = typename WarpMmaOperator1::Shape;
using InstructionShape = typename ArchMmaOperator::Shape;
static int const kStages = B2bMma::kStages;
static IteratorAlgorithm const kIteratorAlgorithm = B2bMma::IteratorA0::kIteratorAlgorithm;
/// Warp count (concept: GemmShape)
using WarpCount0 = typename B2bMma::WarpCount0;
static int const kThreadCount = 32 * WarpCount0::kCount;
using TensorRefA0 = typename B2bMma::IteratorA0::TensorRef;
using TensorRefB0 = typename B2bMma::IteratorB0::TensorRef;
using TensorRefScaleBias0 = typename B2bMma::IteratorAccumulatorScaleBias::TensorRef;
using TensorRefB1 = typename B2bMma::IteratorB1::TensorRef;
using TensorRefC = cutlass::TensorRef<ElementC, LayoutC>;
/// Check iterator A and B convolution dimension are the same and
// set device::B2bImplicitGemmConvolution::kConvDim
static_assert(B2bMma::IteratorA0::kConvDim == B2bMma::IteratorB0::kConvDim,
"Convolution on different dimensions is not supported");
static int const kConvDim = B2bMma::IteratorA0::kConvDim;
/// Conv dimension and problem size structure (Conv2d or Conv3d)
using ConvProblemSize = ConvProblemSize_;
/// Wgrad C stride idx for implicit gemm algorithm
// Conv2d row-major matrix C (KxRSC)
// Conv3d row-major matrix C (KxTRSC)
static int const kWgradCStrideIdx =
cutlass::platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value ? 2 : 3;
/// This chooses the appropriate stride element of the C tensor.
static int const kTensorCStrideIdx =
(kConvolutionalOperator == conv::Operator::kWgrad ? kWgradCStrideIdx : 0);
//
//
//
using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter<
LayoutC,
typename Epilogue::OutputTileIterator::Layout,
TensorRefC,
ConvOperator,
ConvProblemSize
>;
/// Argument structure
struct Arguments {
//
// Data members
//
ConvProblemSize problem_size_0;
ConvProblemSize problem_size_1;
TensorRefA0 ref_A0;
TensorRefB0 ref_B0;
TensorRefC ref_C0;
TensorRefScaleBias0 ref_Scale0;
TensorRefScaleBias0 ref_Bias0;
TensorRefB1 ref_B1;
TensorRefC ref_C1;
TensorRefC ref_D1;
typename EpilogueOutputOp0::Params output_op_0;
typename EpilogueOutputOp1::Params output_op_1;
SplitKMode split_k_mode;
//
// Methods
//
/// Default ctor
CUTLASS_HOST_DEVICE
Arguments() { }
CUTLASS_HOST_DEVICE
Arguments(
ConvProblemSize const & problem_size_0,
ConvProblemSize const & problem_size_1
):
problem_size_0(problem_size_0),
problem_size_1(problem_size_1) { }
CUTLASS_HOST_DEVICE
Arguments(
ConvProblemSize const & problem_size_0,
ConvProblemSize const & problem_size_1,
TensorRefA0 const & ref_A0,
TensorRefB0 const & ref_B0,
TensorRefC const & ref_C0,
TensorRefScaleBias0 const & ref_Scale0,
TensorRefScaleBias0 const & ref_Bias0,
TensorRefB1 const & ref_B1,
TensorRefC const & ref_C1,
TensorRefC const & ref_D1,
typename EpilogueOutputOp0::Params const & output_op_0,
typename EpilogueOutputOp1::Params const & output_op_1,
SplitKMode const & split_k_mode = SplitKMode::kSerial
):
problem_size_0(problem_size_0),
problem_size_1(problem_size_1),
ref_A0(ref_A0),
ref_B0(ref_B0),
ref_C0(ref_C0),
ref_Scale0(ref_Scale0),
ref_Bias0(ref_Bias0),
ref_B1(ref_B1),
ref_C1(ref_C1),
ref_D1(ref_D1),
output_op_0(output_op_0),
output_op_1(output_op_1),
split_k_mode(split_k_mode)
{
}
};
/// Parameters structure
struct Params {
ConvProblemSize problem_size_0;
ConvProblemSize problem_size_1;
cutlass::gemm::GemmCoord grid_tiled_shape;
gemm::GemmCoord implicit_gemm_problem_size_0;
gemm::GemmCoord implicit_gemm_problem_size_1;
int swizzle_log_tile;
int gemm_k_iterations_0;
int gemm_k_iterations_1;
typename B2bMma::IteratorA0::Params iterator_A0;
typename B2bMma::IteratorA0::Element const *ptr_A0;
typename B2bMma::IteratorB0::Params iterator_B0;
typename B2bMma::IteratorB0::Element const *ptr_B0;
typename Epilogue::OutputTileIterator::Params iterator_C0;
typename Epilogue::OutputTileIterator::Element *ptr_C0;
typename B2bMma::IteratorAccumulatorScaleBias::Element *ptr_Scale0;
typename B2bMma::IteratorAccumulatorScaleBias::Element *ptr_Bias0;
typename B2bMma::IteratorB1::Params iterator_B1;
typename B2bMma::IteratorB1::Element const *ptr_B1;
typename Epilogue::OutputTileIterator::Params iterator_C1;
typename Epilogue::OutputTileIterator::Element *ptr_C1;
typename Epilogue::OutputTileIterator::Params iterator_D1;
typename Epilogue::OutputTileIterator::Element *ptr_D1;
typename EpilogueOutputOp0::Params output_op_0;
typename EpilogueOutputOp1::Params output_op_1;
int *semaphore;
SplitKMode split_k_mode;
//
// Methods
//
CUTLASS_HOST_DEVICE
Params(): swizzle_log_tile(0), gemm_k_iterations_0(0), gemm_k_iterations_1(0) { }
///
CUTLASS_HOST_DEVICE
Params(
Arguments const &args,
int *semaphore = nullptr
):
problem_size_0(args.problem_size_0),
problem_size_1(args.problem_size_1),
implicit_gemm_problem_size_0(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_0)),
implicit_gemm_problem_size_1(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_1)),
iterator_A0(B2bMma::IteratorA0::getParams(args.problem_size_0, args.ref_A0.layout())),
ptr_A0(args.ref_A0.data()),
iterator_B0(args.problem_size_0, args.ref_B0.layout()),
ptr_B0(args.ref_B0.data()),
iterator_C0(ConvOutputIteratorParameter::layout(args.ref_C0)),
ptr_C0(args.ref_C0.data()),
ptr_Scale0(args.ref_Scale0.data()),
ptr_Bias0(args.ref_Bias0.data()),
iterator_B1(args.problem_size_1, args.ref_B1.layout()),
ptr_B1(args.ref_B1.data()),
iterator_C1(ConvOutputIteratorParameter::layout(args.ref_C1)),
ptr_C1(args.ref_C1.data()),
iterator_D1(ConvOutputIteratorParameter::layout(args.ref_D1)),
ptr_D1(args.ref_D1.data()),
output_op_0(args.output_op_0),
output_op_1(args.output_op_1),
semaphore(semaphore),
split_k_mode(args.split_k_mode)
{
gemm_k_iterations_0 = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape0::kK, args.problem_size_0);
gemm_k_iterations_1 = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape1::kK, args.problem_size_1);
ThreadblockSwizzle threadblock_swizzle;
grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
implicit_gemm_problem_size_0,
{ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK},
args.problem_size_0.split_k_slices);
swizzle_log_tile = ThreadblockSwizzle().get_log_tile(grid_tiled_shape);
}
};
/// Shared memory storage structure
union SharedStorage {
typename B2bMma::B2bMmaSharedStorage main_loop;
typename Epilogue::SharedStorage epilogue;
};
//
// Methods
//
CUTLASS_HOST_DEVICE
B2bImplicitGemmConvolution() { }
/// Executes one ImplicitGEMM
CUTLASS_DEVICE
void operator()(Params const &params, SharedStorage &shared_storage) {
// Compute threadblock location
ThreadblockSwizzle threadblock_swizzle;
cutlass::gemm::GemmCoord threadblock_tile_idx =
threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
// Early exit if CTA is out of range
if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() ||
params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) {
return;
}
// Compute position within threadblock
int thread_idx = threadIdx.x;
// Construct iterators to A and B operands
typename B2bMma::IteratorA0 iterator_A0(
params.iterator_A0,
params.problem_size_0,
params.ptr_A0,
thread_idx,
MatrixCoord(
threadblock_tile_idx.m() * B2bMma::Shape0::kM,
threadblock_tile_idx.k() * B2bMma::Shape0::kK
)
);
typename B2bMma::IteratorB0 iterator_B0(
params.iterator_B0,
params.problem_size_0,
params.ptr_B0,
thread_idx,
MatrixCoord(
threadblock_tile_idx.k() * B2bMma::Shape0::kK,
threadblock_tile_idx.n() * B2bMma::Shape0::kN
)
);
typename B2bMma::IteratorB1 iterator_B1(
params.iterator_B1,
params.problem_size_1,
params.ptr_B1,
thread_idx,
MatrixCoord(
threadblock_tile_idx.k() * B2bMma::Shape1::kK,
threadblock_tile_idx.n() * B2bMma::Shape1::kN
)
);
// Broadcast the warp_id computed by lane 0 to ensure dependent code
// is compiled as warp-uniform.
int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
int lane_idx = threadIdx.x % 32;
// Construct iterators to accumulator scale/bias vector
typename B2bMma::IteratorAccumulatorScaleBias iterator_Scale0(
params.ptr_Scale0,
{1, params.problem_size_0.K},
thread_idx,
warp_idx,
MatrixCoord(
0, threadblock_tile_idx.n() * B2bMma::Shape0::kN
)
);
typename B2bMma::IteratorAccumulatorScaleBias iterator_Bias0(
params.ptr_Bias0,
{1, params.problem_size_0.K},
thread_idx,
warp_idx,
MatrixCoord(
0, threadblock_tile_idx.n() * B2bMma::Shape0::kN
)
);
//
// Main loop
//
EpilogueOutputOp0 output_op_0(params.output_op_0);
// Construct thread-scoped matrix multiply
B2bMma b2bMma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
typename B2bMma::FragmentC0 src_accum;
typename B2bMma::FragmentC1 accumulators;
src_accum.clear();
accumulators.clear();
// Compute threadblock-scoped matrix multiply-add
b2bMma(params.gemm_k_iterations_0, accumulators, iterator_A0, iterator_B0,
iterator_Scale0, iterator_Bias0, iterator_B1, src_accum, output_op_0);
//
// Epilogue
//
EpilogueOutputOp1 output_op_1(params.output_op_1);
// Construct the semaphore.
int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m();
Semaphore semaphore(params.semaphore + block_idx, thread_idx);
// Compute logical position within grid
threadblock_tile_idx =
threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
// If performing a reduction via split-K, fetch the initial synchronization
if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
// Fetch the synchronization lock initially but do not block.
semaphore.fetch();
// Indicate which position in a serial reduction the output operator is currently updating
output_op_1.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k());
}
MatrixCoord threadblock_offset(
threadblock_tile_idx.m() * B2bMma::Shape1::kM,
threadblock_tile_idx.n() * B2bMma::Shape1::kN
);
// Tile iterator writing to destination tensor
typename Epilogue::OutputTileIterator iterator_D1(
params.iterator_D1,
params.ptr_D1,
ConvOutputIteratorParameter::extent(params.problem_size_1),
thread_idx,
threadblock_offset
);
// Tile iterator reading from source accumulator tensor
typename Epilogue::OutputTileIterator iterator_C1(
params.iterator_C1,
params.ptr_C1,
ConvOutputIteratorParameter::extent(params.problem_size_1),
thread_idx,
threadblock_offset
);
// Construct the epilogue
Epilogue epilogue(
shared_storage.epilogue,
thread_idx,
warp_idx,
lane_idx);
// Wait on the semaphore - this latency may have been covered by iterator construction
if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
// For subsequent threadblocks, the source matrix is held in the 'D' tensor.
if (threadblock_tile_idx.k()) {
iterator_C1 = iterator_D1;
}
semaphore.wait(threadblock_tile_idx.k());
__threadfence();
}
// Each split-k-slice writes to a unique tensor location
else if (params.split_k_mode == SplitKMode::kParallel) {
iterator_D1.add_pointer_offset(threadblock_tile_idx.k() *
cutlass::conv::implicit_gemm_tensor_c_size(ConvOperator, params.problem_size_1));
}
// Run efficient epilogue
epilogue(output_op_1, iterator_D1, accumulators, iterator_C1);
//
// Release the semaphore
//
if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
int lock = 0;
if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) {
// The final threadblock resets the semaphore for subsequent grids.
lock = 0;
}
else {
// Otherwise, the semaphore is incremented
lock = threadblock_tile_idx.k() + 1;
}
semaphore.release(lock);
}
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace kernel
} // namespace conv
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
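The release logic above encodes the serial split-K hand-off: the threadblock working on partition k of a tile waits until the semaphore reaches k, contributes its partial result, and then passes the lock to partition k + 1, with the final partition resetting the lock to zero for the next launch. Below is a minimal host-side sketch of that lock protocol; it is a simplified model for illustration only, not the CUTLASS Semaphore API.

```
// Simplified model (assumption: host-side illustration, not cutlass::Semaphore)
// of the serial split-K hand-off implemented in the epilogue above.
#include <cstdio>

int next_lock_value(int k_partition, int k_partitions) {
  // Mirrors the release logic in B2bImplicitGemmConvolution::operator():
  // the last partition resets the semaphore to 0; every other partition
  // passes the baton to partition (k + 1).
  return (k_partition + 1 == k_partitions) ? 0 : k_partition + 1;
}

int main() {
  int const k_partitions = 4;
  for (int k = 0; k < k_partitions; ++k) {
    std::printf("partition %d waits for lock == %d, releases %d\n",
                k, k, next_lock_value(k, k_partitions));
  }
  return 0;
}
```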

View File

@@ -0,0 +1,94 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief
Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped
matrix multiply-add with the appropriate threadblock-scoped epilogue.
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv2d.h"
#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h"
#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h"
#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h"
#include "cutlass/transform/threadblock/predicated_vector_access_iterator.h"
#include "cutlass/transform/threadblock/vector_iterator.h"
#include "cutlass/transform/warp/vector_fragment_iterator.h"
#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
#include "kernel/b2b_implicit_gemm_convolution.h"
#include "threadblock/b2b_implicit_gemm_pipelined.h"
#include "threadblock/b2b_implicit_gemm_multistage.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace conv {
namespace kernel {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Defines a kernel for Conv2dFprop
template <
typename ElementA,
typename LayoutA,
typename ElementB,
typename LayoutB,
typename ElementC,
typename LayoutC,
typename ElementAccumulator,
typename OperatorClass,
typename ArchTag,
typename ThreadblockShape0,
typename ThreadblockShape1,
typename WarpShape0,
typename WarpShape1,
typename InstructionShape,
typename EpilogueOutputOp0,
typename EpilogueOutputOp1,
typename ThreadblockSwizzle,
int Stages,
typename MathOperatorTag,
conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic,
bool SmemAccumulator = false
> struct DefaultB2bConv2dFprop;
} // namespace kernel
} // namespace conv
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
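This header only declares the primary DefaultB2bConv2dFprop template; every supported configuration is supplied as a partial specialization in the files that follow (B2bImplicitGemmPipelined for the 2-stage cases, B2bImplicitGemmMultistage for deeper pipelines), so an unsupported combination of layout, stage count, or iterator algorithm simply fails to compile. A minimal sketch of that dispatch pattern, using hypothetical names rather than the CUTLASS parameters above:

```
// Sketch of the declare-then-specialize dispatch pattern (illustrative names only).
#include <iostream>

enum class IteratorAlgorithm { kAnalytic, kOptimized };

// Primary template: declared but never defined, mirroring DefaultB2bConv2dFprop.
template <IteratorAlgorithm Algo, int Stages>
struct DefaultKernel;

// Specialization for the analytic, 2-stage path.
template <>
struct DefaultKernel<IteratorAlgorithm::kAnalytic, 2> {
  static constexpr char const *name = "analytic, 2-stage pipelined";
};

// Partial specialization for the optimized, multistage path.
template <int Stages>
struct DefaultKernel<IteratorAlgorithm::kOptimized, Stages> {
  static constexpr char const *name = "optimized, multistage";
};

int main() {
  std::cout << DefaultKernel<IteratorAlgorithm::kAnalytic, 2>::name << "\n";
  std::cout << DefaultKernel<IteratorAlgorithm::kOptimized, 3>::name << "\n";
  // DefaultKernel<IteratorAlgorithm::kAnalytic, 3> would not compile: no
  // specialization matches and the primary template has no definition.
  return 0;
}
```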

View File

@@ -0,0 +1,749 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief
Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped
matrix multiply-add with the appropriate threadblock-scoped epilogue.
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv2d.h"
#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h"
#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h"
#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h"
#include "cutlass/transform/threadblock/predicated_vector_access_iterator.h"
#include "cutlass/transform/threadblock/vector_iterator.h"
#include "cutlass/transform/warp/vector_fragment_iterator.h"
#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
#include "kernel/default_b2b_conv2d_fprop.h"
#include "kernel/b2b_implicit_gemm_convolution.h"
#include "threadblock/b2b_implicit_gemm_pipelined.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace conv {
namespace kernel {
/////////////////////////////////////////////////////////////////////////////////////////////////
// OpClassTensorOp convolutions
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm
/// and 2 stage pipeline.
template <
typename ElementA,
typename LayoutA,
typename ElementB,
typename LayoutB,
typename ElementC,
typename LayoutC,
typename ElementAccumulator,
typename ArchTag,
typename ThreadblockShape0,
typename ThreadblockShape1,
typename WarpShape0,
typename WarpShape1,
typename InstructionShape,
typename EpilogueOutputOp0,
typename EpilogueOutputOp1,
typename ThreadblockSwizzle,
typename MathOperatorTag
>
struct DefaultB2bConv2dFprop <
ElementA,
LayoutA,
ElementB,
LayoutB,
ElementC,
LayoutC,
ElementAccumulator,
arch::OpClassTensorOp,
ArchTag,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
ThreadblockSwizzle,
2,
MathOperatorTag,
IteratorAlgorithm::kAnalytic
> {
// Define the core components from GEMM
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
2, MathOperatorTag>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
2, MathOperatorTag>;
// Define iterators over tiles from the A operand
using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
using IteratorA0 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
ElementA, LayoutA,
ThreadMapA0
>
>;
using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
// Define iterators over tiles from the B operand
using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
using IteratorB0 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
ElementB, LayoutB,
ThreadMapB0
>
>;
using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
// Use fragment iterator for A operand
using AccumulatorLayout = cutlass::layout::ColumnMajor;
using FragmentIteratorA1 =
cutlass::gemm::warp::MmaTensorOpFragmentIterator<
cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
MmaCore1::Shape::kK, //kBlocksColumn
ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
/// Define iterators over tiles from scale/bias vectors
using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
static int const kElementsPerAccess = 2;
using IteratorAccumulatorScaleBias =
cutlass::transform::threadblock::VectorIterator<
cutlass::transform::threadblock::PredicatedVectorAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
>;
// Warp-level iterators to load scale and bias vectors
using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
LayoutScaleBias, InstructionShape, kElementsPerAccess>;
// Define iterators over tiles from the B operand
using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
using IteratorB1 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
ElementB, LayoutB,
ThreadMapB1
>
>;
using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
// Warp-level GEMM components
using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
using MmaPolicy0 = typename MmaCore0::MmaPolicy;
using MmaPolicy1 = typename MmaCore1::MmaPolicy;
// Define the Mma
using B2bMma = threadblock::B2bImplicitGemmPipelined<
ThreadblockShape0,
IteratorA0,
SmemIteratorA0,
IteratorB0,
SmemIteratorB0,
ThreadblockShape1,
FragmentIteratorA1,
IteratorAccumulatorScaleBias,
FragmentIteratorA1ScaleBias,
IteratorB1,
SmemIteratorB1,
ElementC,
LayoutC,
EpilogueOutputOp0,
MmaPolicy0,
MmaPolicy1
>;
// Define the epilogue
using Epilogue = typename detail::DefaultConvEpilogue<
ArchTag,
ThreadblockShape1,
WarpMmaTensorOp1,
1,
EpilogueOutputOp1
>::Epilogue;
// Define the kernel
using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
B2bMma,
Epilogue,
ThreadblockSwizzle,
conv::Operator::kFprop
>;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and 2 stage
/// pipeline with interleaved layout.
template <
typename ElementA,
typename ElementB,
typename ElementC,
typename LayoutC,
typename ElementAccumulator,
typename ArchTag,
typename ThreadblockShape0,
typename ThreadblockShape1,
typename WarpShape0,
typename WarpShape1,
typename InstructionShape,
typename EpilogueOutputOp0,
typename EpilogueOutputOp1,
typename ThreadblockSwizzle,
typename MathOperatorTag,
int InterleavedK
>
struct DefaultB2bConv2dFprop <
ElementA,
layout::TensorNCxHWx<InterleavedK>,
ElementB,
layout::TensorCxRSKx<InterleavedK>,
ElementC,
LayoutC,
ElementAccumulator,
arch::OpClassTensorOp,
ArchTag,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
ThreadblockSwizzle,
2,
MathOperatorTag,
IteratorAlgorithm::kAnalytic,
false
> {
// Define the core components from GEMM
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
ElementB, layout::RowMajorInterleaved<InterleavedK>,
ElementAccumulator, LayoutC, arch::OpClassTensorOp,
2, MathOperatorTag, true>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
ElementB, layout::RowMajorInterleaved<InterleavedK>,
ElementAccumulator, LayoutC, arch::OpClassTensorOp,
2, MathOperatorTag, true>;
// Define iterators over tiles from the A operand
// Note: the GEMM shared memory threadmap is used here because the conv fprop
// global memory layout maps onto the crosswise layout targeted by the
// interleaved GEMM shared memory threadmap, whereas the interleaved GEMM
// global memory layout corresponds to the congruous layout.
using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
using IteratorA0 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
ElementA, layout::TensorNCxHWx<InterleavedK>,
ThreadMapA0
>
>;
using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
// Define iterators over tiles from the B operand
// Note: the GEMM shared memory threadmap is used here because the conv fprop
// global memory layout maps onto the crosswise layout targeted by the
// interleaved GEMM shared memory threadmap, whereas the interleaved GEMM
// global memory layout corresponds to the congruous layout.
using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
using IteratorB0 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
ElementB, layout::TensorCxRSKx<InterleavedK>,
ThreadMapB0
>
>;
using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
// Use fragment iterator for A operand
using AccumulatorLayout = cutlass::layout::RowMajor;
using FragmentIteratorA1 =
cutlass::gemm::warp::MmaTensorOpFragmentIterator<
cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
MmaCore1::Shape::kK, //kBlocksColumn
ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
/// Define iterators over tiles from scale/bias vectors
using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
static int const kElementsPerAccess = 4;
using IteratorAccumulatorScaleBias =
cutlass::transform::threadblock::VectorIterator<
cutlass::transform::threadblock::PredicatedVectorAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
>;
// Warp-level iterators to load scale and bias vectors
using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
LayoutScaleBias, InstructionShape, kElementsPerAccess>;
// Define iterators over tiles from the B operand
using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
using IteratorB1 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
ElementB, layout::TensorCxRSKx<InterleavedK>,
ThreadMapB1
>
>;
using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
// Warp-level GEMM components
using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
using MmaPolicy0 = typename MmaCore0::MmaPolicy;
using MmaPolicy1 = typename MmaCore1::MmaPolicy;
// Define the Mma
using B2bMma = threadblock::B2bImplicitGemmPipelined<
ThreadblockShape0,
IteratorA0,
SmemIteratorA0,
IteratorB0,
SmemIteratorB0,
ThreadblockShape1,
FragmentIteratorA1,
IteratorAccumulatorScaleBias,
FragmentIteratorA1ScaleBias,
IteratorB1,
SmemIteratorB1,
ElementC,
LayoutC,
EpilogueOutputOp0,
MmaPolicy0,
MmaPolicy1
>;
// Define the epilogue
using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
ThreadblockShape1,
WarpMmaTensorOp1,
1,
EpilogueOutputOp1,
EpilogueOutputOp1::kCount,
InterleavedK
>::Epilogue;
// Define the kernel
using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
B2bMma,
Epilogue,
ThreadblockSwizzle,
conv::Operator::kFprop
>;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm
/// and 2 stage pipeline.
template <
typename ElementA,
typename LayoutA,
typename ElementB,
typename LayoutB,
typename ElementC,
typename LayoutC,
typename ElementAccumulator,
typename ArchTag,
typename ThreadblockShape0,
typename ThreadblockShape1,
typename WarpShape0,
typename WarpShape1,
typename InstructionShape,
typename EpilogueOutputOp0,
typename EpilogueOutputOp1,
typename ThreadblockSwizzle,
typename MathOperatorTag
>
struct DefaultB2bConv2dFprop <
ElementA,
LayoutA,
ElementB,
LayoutB,
ElementC,
LayoutC,
ElementAccumulator,
arch::OpClassTensorOp,
ArchTag,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
ThreadblockSwizzle,
2,
MathOperatorTag,
IteratorAlgorithm::kOptimized
> {
// Define the core components from GEMM
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
2, MathOperatorTag>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
2, MathOperatorTag>;
// Define iterators over tiles from the A operand
using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
using IteratorA0 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
ElementA, LayoutA,
ThreadMapA0
>
>;
using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
// Define iterators over tiles from the B operand
using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
using IteratorB0 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
ElementB, LayoutB,
ThreadMapB0
>
>;
using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
// Use fragment iterator for A operand
using AccumulatorLayout = cutlass::layout::ColumnMajor;
using FragmentIteratorA1 =
cutlass::gemm::warp::MmaTensorOpFragmentIterator<
cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
MmaCore1::Shape::kK, //kBlocksColumn
ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
/// Define iterators over tiles from scale/bias vectors
using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
static int const kElementsPerAccess = 2;
using IteratorAccumulatorScaleBias =
cutlass::transform::threadblock::VectorIterator<
cutlass::transform::threadblock::PredicatedVectorAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
>;
// Warp-level iterators to load scale and bias vectors
using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
LayoutScaleBias, InstructionShape, kElementsPerAccess>;
// Define iterators over tiles from the B operand
using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
using IteratorB1 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
ElementB, LayoutB,
ThreadMapB1
>
>;
using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
// Warp-level GEMM components
using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
using MmaPolicy0 = typename MmaCore0::MmaPolicy;
using MmaPolicy1 = typename MmaCore1::MmaPolicy;
// Define the Mma
using B2bMma = threadblock::B2bImplicitGemmPipelined<
ThreadblockShape0,
IteratorA0,
SmemIteratorA0,
IteratorB0,
SmemIteratorB0,
ThreadblockShape1,
FragmentIteratorA1,
IteratorAccumulatorScaleBias,
FragmentIteratorA1ScaleBias,
IteratorB1,
SmemIteratorB1,
ElementC,
LayoutC,
EpilogueOutputOp0,
MmaPolicy0,
MmaPolicy1
>;
// Define the epilogue
using Epilogue = typename detail::DefaultConvEpilogue<
ArchTag,
ThreadblockShape1,
WarpMmaTensorOp1,
1,
EpilogueOutputOp1
>::Epilogue;
// Define the kernel
using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
B2bMma,
Epilogue,
ThreadblockSwizzle,
conv::Operator::kFprop
>;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and 2 stage
/// pipeline with interleaved layout.
template <
typename ElementA,
typename ElementB,
typename ElementC,
typename LayoutC,
typename ElementAccumulator,
typename ArchTag,
typename ThreadblockShape0,
typename ThreadblockShape1,
typename WarpShape0,
typename WarpShape1,
typename InstructionShape,
typename EpilogueOutputOp0,
typename EpilogueOutputOp1,
typename ThreadblockSwizzle,
typename MathOperatorTag,
int InterleavedK
>
struct DefaultB2bConv2dFprop <
ElementA,
layout::TensorNCxHWx<InterleavedK>,
ElementB,
layout::TensorCxRSKx<InterleavedK>,
ElementC,
LayoutC,
ElementAccumulator,
arch::OpClassTensorOp,
ArchTag,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
ThreadblockSwizzle,
2,
MathOperatorTag,
IteratorAlgorithm::kOptimized
> {
// Define the core components from GEMM
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
ElementB, layout::RowMajorInterleaved<InterleavedK>,
ElementAccumulator, LayoutC, arch::OpClassTensorOp,
2, MathOperatorTag, true>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
ElementB, layout::RowMajorInterleaved<InterleavedK>,
ElementAccumulator, LayoutC, arch::OpClassTensorOp,
2, MathOperatorTag, true>;
// Define iterators over tiles from the A operand
// Note: the GEMM shared memory threadmap is used here because the conv fprop
// global memory layout maps onto the crosswise layout targeted by the
// interleaved GEMM shared memory threadmap, whereas the interleaved GEMM
// global memory layout corresponds to the congruous layout.
using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
using IteratorA0 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
ElementA, layout::TensorNCxHWx<InterleavedK>,
ThreadMapA0
>
>;
using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
// Define iterators over tiles from the B operand
using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
using IteratorB0 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
ElementB, layout::TensorCxRSKx<InterleavedK>,
ThreadMapB0
>
>;
using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
// Use fragment iterator for A operand
using AccumulatorLayout = cutlass::layout::RowMajor;
using FragmentIteratorA1 =
cutlass::gemm::warp::MmaTensorOpFragmentIterator<
cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
MmaCore1::Shape::kK, //kBlocksColumn
ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
/// Define iterators over tiles from scale/bias vectors
using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
static int const kElementsPerAccess = 4;
using IteratorAccumulatorScaleBias =
cutlass::transform::threadblock::VectorIterator<
cutlass::transform::threadblock::PredicatedVectorAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
>;
// Warp-level iterators to load scale and bias vectors
using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
LayoutScaleBias, InstructionShape, kElementsPerAccess>;
using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
using IteratorB1 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
ElementB, layout::TensorCxRSKx<InterleavedK>,
ThreadMapB1
>
>;
using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
// Warp-level GEMM components
using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
using MmaPolicy0 = typename MmaCore0::MmaPolicy;
using MmaPolicy1 = typename MmaCore1::MmaPolicy;
// Define the Mma
using B2bMma = threadblock::B2bImplicitGemmPipelined<
ThreadblockShape0,
IteratorA0,
SmemIteratorA0,
IteratorB0,
SmemIteratorB0,
ThreadblockShape1,
FragmentIteratorA1,
IteratorAccumulatorScaleBias,
FragmentIteratorA1ScaleBias,
IteratorB1,
SmemIteratorB1,
ElementC,
LayoutC,
EpilogueOutputOp0,
MmaPolicy0,
MmaPolicy1
>;
// Define the epilogue
using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
ThreadblockShape1,
WarpMmaTensorOp1,
1,
EpilogueOutputOp1,
EpilogueOutputOp1::kCount,
InterleavedK
>::Epilogue;
// Define the kernel
using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
B2bMma,
Epilogue,
ThreadblockSwizzle,
conv::Operator::kFprop
>;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace kernel
} // namespace conv
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
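Functionally, each specialization above routes the first convolution's warp-level accumulators straight into the second GEMM's A operand through FragmentIteratorA1, with EpilogueOutputOp0 and the scale/bias iterators applying a per-output-channel transform before the second mainloop consumes the fragments (this is the {1, problem_size_0.K} extent used for iterator_Scale0 / iterator_Bias0 in the kernel earlier in this diff). The reference-style sketch below shows the computation being fused, with small illustrative dimensions; the ReLU intermediate activation and identity final epilogue are assumptions, not fixed by the templates above.

```
// Plain C++ reference sketch of the fused back-to-back computation:
// D1 = op1( act( Scale0[n] * (A0 * B0)[m][n] + Bias0[n] ) * B1 ), per row m.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Illustrative sizes: GEMM0 is MxN0xK0, GEMM1 is MxN1 with K1 == N0.
  int const M = 2, N0 = 3, K0 = 4, N1 = 2;
  std::vector<float> A0(M * K0, 1.0f), B0(K0 * N0, 0.5f), B1(N0 * N1, 2.0f);
  std::vector<float> Scale0(N0, 0.25f), Bias0(N0, 1.0f);   // per output channel of GEMM0
  std::vector<float> D1(M * N1, 0.0f);

  for (int m = 0; m < M; ++m) {
    // First GEMM plus fused per-channel scale/bias and (assumed) ReLU,
    // kept "in registers" per row, as FragmentIteratorA1 does per warp tile.
    std::vector<float> act(N0);
    for (int n = 0; n < N0; ++n) {
      float acc = 0.0f;
      for (int k = 0; k < K0; ++k) acc += A0[m * K0 + k] * B0[k * N0 + n];
      act[n] = std::max(0.0f, Scale0[n] * acc + Bias0[n]);  // EpilogueOutputOp0 (assumed form)
    }
    // Second GEMM consumes the activation row directly as its A operand.
    for (int n = 0; n < N1; ++n) {
      float acc = 0.0f;
      for (int k = 0; k < N0; ++k) acc += act[k] * B1[k * N1 + n];
      D1[m * N1 + n] = acc;                                 // EpilogueOutputOp1 (identity here)
    }
  }
  std::printf("D1[0][0] = %f\n", D1[0]);
  return 0;
}
```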

View File

@@ -0,0 +1,740 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief
Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped
matrix multiply-add with the appropriate threadblock-scoped epilogue.
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv2d.h"
#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h"
#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h"
#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h"
#include "cutlass/transform/threadblock/predicated_vector_access_iterator.h"
#include "cutlass/transform/threadblock/vector_iterator.h"
#include "cutlass/transform/warp/vector_fragment_iterator.h"
#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
#include "kernel/default_b2b_conv2d_fprop.h"
#include "kernel/b2b_implicit_gemm_convolution.h"
#include "threadblock/b2b_implicit_gemm_multistage.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace conv {
namespace kernel {
/////////////////////////////////////////////////////////////////////////////////////////////////
// OpClassTensorOp convolutions
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage
/// pipeline.
template <
typename ElementA,
typename LayoutA,
typename ElementB,
typename LayoutB,
typename ElementC,
typename LayoutC,
typename ElementAccumulator,
typename ArchTag,
typename ThreadblockShape0,
typename ThreadblockShape1,
typename WarpShape0,
typename WarpShape1,
typename InstructionShape,
typename EpilogueOutputOp0,
typename EpilogueOutputOp1,
typename ThreadblockSwizzle,
int Stages,
typename MathOperatorTag
>
struct DefaultB2bConv2dFprop <
ElementA,
LayoutA,
ElementB,
LayoutB,
ElementC,
LayoutC,
ElementAccumulator,
arch::OpClassTensorOp,
ArchTag,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
ThreadblockSwizzle,
Stages,
MathOperatorTag,
IteratorAlgorithm::kAnalytic
> {
// Define the core components from GEMM
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
Stages, MathOperatorTag>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
Stages, MathOperatorTag>;
// Define iterators over tiles from the A operand
using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
using IteratorA0 =
cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
ElementA, LayoutA,
ThreadMapA0
>;
using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
// Define iterators over tiles from the B operand
using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
using IteratorB0 =
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
ElementB, LayoutB,
ThreadMapB0
>;
using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
// Use fragment iterator for A operand
using AccumulatorLayout = cutlass::layout::ColumnMajor;
using FragmentIteratorA1 =
cutlass::gemm::warp::MmaTensorOpFragmentIterator<
cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
MmaCore1::Shape::kK, //kBlocksColumn
ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
/// Define iterators over tiles from scale/bias vectors
using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
static int const kElementsPerAccess = 2;
using IteratorAccumulatorScaleBias =
cutlass::transform::threadblock::VectorIterator<
cutlass::transform::threadblock::PredicatedVectorAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
>;
// Warp-level iterators to load scale and bias vectors
using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
LayoutScaleBias, InstructionShape, kElementsPerAccess>;
// Define iterators over tiles from the B operand
using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
using IteratorB1 =
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
ElementB, LayoutB,
ThreadMapB1
>;
using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
// Warp-level GEMM components
using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
using MmaPolicy0 = typename MmaCore0::MmaPolicy;
using MmaPolicy1 = typename MmaCore1::MmaPolicy;
// Define the Mma
using B2bMma = threadblock::B2bImplicitGemmMultistage<
ThreadblockShape0,
IteratorA0,
SmemIteratorA0,
arch::CacheOperation::Always,
IteratorB0,
SmemIteratorB0,
arch::CacheOperation::Global,
ThreadblockShape1,
FragmentIteratorA1,
IteratorAccumulatorScaleBias,
FragmentIteratorA1ScaleBias,
IteratorB1,
SmemIteratorB1,
arch::CacheOperation::Global,
EpilogueOutputOp0,
MmaPolicy0,
MmaPolicy1,
Stages
>;
// Define the epilogue
using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
ThreadblockShape1,
WarpMmaTensorOp1,
1,
EpilogueOutputOp1,
EpilogueOutputOp1::kCount
>::Epilogue;
// Define the kernel
using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
B2bMma,
Epilogue,
ThreadblockSwizzle,
conv::Operator::kFprop
>;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage
/// pipeline with interleaved layout.
template <
typename ElementA,
typename ElementB,
typename ElementC,
typename LayoutC,
typename ElementAccumulator,
typename ArchTag,
typename ThreadblockShape0,
typename ThreadblockShape1,
typename WarpShape0,
typename WarpShape1,
typename InstructionShape,
typename EpilogueOutputOp0,
typename EpilogueOutputOp1,
typename ThreadblockSwizzle,
int Stages,
typename MathOperatorTag,
int InterleavedK
>
struct DefaultB2bConv2dFprop <
ElementA,
layout::TensorNCxHWx<InterleavedK>,
ElementB,
layout::TensorCxRSKx<InterleavedK>,
ElementC,
LayoutC,
ElementAccumulator,
arch::OpClassTensorOp,
ArchTag,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
ThreadblockSwizzle,
Stages,
MathOperatorTag,
IteratorAlgorithm::kAnalytic
> {
// Define the core components from GEMM
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
ElementB, layout::RowMajorInterleaved<InterleavedK>,
ElementAccumulator, LayoutC, arch::OpClassTensorOp,
Stages, MathOperatorTag, true>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
ElementB, layout::RowMajorInterleaved<InterleavedK>,
ElementAccumulator, LayoutC, arch::OpClassTensorOp,
Stages, MathOperatorTag, true>;
// Define iterators over tiles from the A operand
// Note: the GEMM shared memory threadmap is used here because the conv fprop
// global memory layout maps onto the crosswise layout targeted by the
// interleaved GEMM shared memory threadmap, whereas the interleaved GEMM
// global memory layout corresponds to the congruous layout.
using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
using IteratorA0 =
cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
ElementA, layout::TensorNCxHWx<InterleavedK>,
ThreadMapA0
>;
using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
// Define iterators over tiles from the B operand
// Note: the GEMM shared memory threadmap is used here because the conv fprop
// global memory layout maps onto the crosswise layout targeted by the
// interleaved GEMM shared memory threadmap, whereas the interleaved GEMM
// global memory layout corresponds to the congruous layout.
using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
using IteratorB0 =
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
ElementB, layout::TensorCxRSKx<InterleavedK>,
ThreadMapB0
>;
using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
// Use fragment iterator for A operand
using AccumulatorLayout = cutlass::layout::RowMajor;
using FragmentIteratorA1 =
cutlass::gemm::warp::MmaTensorOpFragmentIterator<
cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
MmaCore1::Shape::kK, //kBlocksColumn
ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
/// Define iterators over tiles from scale/bias vectors
using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
static int const kElementsPerAccess = 4;
using IteratorAccumulatorScaleBias =
cutlass::transform::threadblock::VectorIterator<
cutlass::transform::threadblock::PredicatedVectorAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
>;
// Warp-level iterators to load scale and bias vectors
using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
LayoutScaleBias, InstructionShape, kElementsPerAccess>;
using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
using IteratorB1 =
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
ElementB, layout::TensorCxRSKx<InterleavedK>,
ThreadMapB1
>;
using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
// Warp-level GEMM components
using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
using MmaPolicy0 = typename MmaCore0::MmaPolicy;
using MmaPolicy1 = typename MmaCore1::MmaPolicy;
// Define the Mma
using B2bMma = threadblock::B2bImplicitGemmMultistage<
ThreadblockShape0,
IteratorA0,
SmemIteratorA0,
arch::CacheOperation::Always,
IteratorB0,
SmemIteratorB0,
arch::CacheOperation::Global,
ThreadblockShape1,
FragmentIteratorA1,
IteratorAccumulatorScaleBias,
FragmentIteratorA1ScaleBias,
IteratorB1,
SmemIteratorB1,
arch::CacheOperation::Global,
EpilogueOutputOp0,
MmaPolicy0,
MmaPolicy1,
Stages
>;
// Define the epilogue
using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
ThreadblockShape1,
WarpMmaTensorOp1,
1,
EpilogueOutputOp1,
EpilogueOutputOp1::kCount,
InterleavedK
>::Epilogue;
// Define the kernel
using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
B2bMma,
Epilogue,
ThreadblockSwizzle,
conv::Operator::kFprop
>;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and
/// multistage pipeline.
template <
typename ElementA,
typename LayoutA,
typename ElementB,
typename LayoutB,
typename ElementC,
typename LayoutC,
typename ElementAccumulator,
typename ArchTag,
typename ThreadblockShape0,
typename ThreadblockShape1,
typename WarpShape0,
typename WarpShape1,
typename InstructionShape,
typename EpilogueOutputOp0,
typename EpilogueOutputOp1,
typename ThreadblockSwizzle,
int Stages,
typename MathOperatorTag
>
struct DefaultB2bConv2dFprop <
ElementA,
LayoutA,
ElementB,
LayoutB,
ElementC,
LayoutC,
ElementAccumulator,
arch::OpClassTensorOp,
ArchTag,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
ThreadblockSwizzle,
Stages,
MathOperatorTag,
IteratorAlgorithm::kOptimized
> {
// Define the core components from GEMM
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
Stages, MathOperatorTag>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
Stages, MathOperatorTag>;
// Define iterators over tiles from the A operand
using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
using IteratorA0 =
cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
ElementA, LayoutA,
ThreadMapA0
>;
using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
// Define iterators over tiles from the B operand
using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
using IteratorB0 =
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
ElementB, LayoutB,
ThreadMapB0
>;
using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
// Use fragment iterator for A operand
using AccumulatorLayout = cutlass::layout::ColumnMajor;
using FragmentIteratorA1 =
cutlass::gemm::warp::MmaTensorOpFragmentIterator<
cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
MmaCore1::Shape::kK, //kBlocksColumn
ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
/// Define iterators over tiles from scale/bias vectors
using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
static int const kElementsPerAccess = 2;
using IteratorAccumulatorScaleBias =
cutlass::transform::threadblock::VectorIterator<
cutlass::transform::threadblock::PredicatedVectorAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
>;
// Warp-level iterators to load scale and bias vectors
using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
LayoutScaleBias, InstructionShape, kElementsPerAccess>;
// Define iterators over tiles from the B operand
using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
using IteratorB1 =
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
ElementB, LayoutB,
ThreadMapB1
>;
using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
// Warp-level GEMM components
using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
using MmaPolicy0 = typename MmaCore0::MmaPolicy;
using MmaPolicy1 = typename MmaCore1::MmaPolicy;
// Define the Mma
using B2bMma = threadblock::B2bImplicitGemmMultistage<
ThreadblockShape0,
IteratorA0,
SmemIteratorA0,
arch::CacheOperation::Always,
IteratorB0,
SmemIteratorB0,
arch::CacheOperation::Global,
ThreadblockShape1,
FragmentIteratorA1,
IteratorAccumulatorScaleBias,
FragmentIteratorA1ScaleBias,
IteratorB1,
SmemIteratorB1,
arch::CacheOperation::Global,
EpilogueOutputOp0,
MmaPolicy0,
MmaPolicy1,
Stages
>;
// Define the epilogue
using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
ThreadblockShape1,
WarpMmaTensorOp1,
1,
EpilogueOutputOp1,
EpilogueOutputOp1::kCount
>::Epilogue;
// Define the kernel
using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
B2bMma,
Epilogue,
ThreadblockSwizzle,
conv::Operator::kFprop
>;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and
/// multistage pipeline with interleaved layout.
template <
typename ElementA,
typename ElementB,
typename ElementC,
typename LayoutC,
typename ElementAccumulator,
typename ArchTag,
typename ThreadblockShape0,
typename ThreadblockShape1,
typename WarpShape0,
typename WarpShape1,
typename InstructionShape,
typename EpilogueOutputOp0,
typename EpilogueOutputOp1,
typename ThreadblockSwizzle,
int Stages,
typename MathOperatorTag,
int InterleavedK
>
struct DefaultB2bConv2dFprop <
ElementA,
layout::TensorNCxHWx<InterleavedK>,
ElementB,
layout::TensorCxRSKx<InterleavedK>,
ElementC,
LayoutC,
ElementAccumulator,
arch::OpClassTensorOp,
ArchTag,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
ThreadblockSwizzle,
Stages,
MathOperatorTag,
IteratorAlgorithm::kOptimized
> {
// Define the core components from GEMM
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
ElementB, layout::RowMajorInterleaved<InterleavedK>,
ElementAccumulator, LayoutC, arch::OpClassTensorOp,
Stages, MathOperatorTag, true>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
ElementB, layout::RowMajorInterleaved<InterleavedK>,
ElementAccumulator, LayoutC, arch::OpClassTensorOp,
Stages, MathOperatorTag, true>;
// Define iterators over tiles from the A operand
// Note: the GEMM shared memory threadmap is used here because the conv fprop
// global memory layout maps onto the crosswise layout targeted by the
// interleaved GEMM shared memory threadmap, whereas the interleaved GEMM
// global memory layout corresponds to the congruous layout.
using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
using IteratorA0 =
cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
ElementA, layout::TensorNCxHWx<InterleavedK>,
ThreadMapA0
>;
using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
// Define iterators over tiles from the B operand
// Note: the GEMM shared memory threadmap is used here because the conv fprop
// global memory layout maps onto the crosswise layout targeted by the
// interleaved GEMM shared memory threadmap, whereas the interleaved GEMM
// global memory layout corresponds to the congruous layout.
using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
using IteratorB0 =
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
ElementB, layout::TensorCxRSKx<InterleavedK>,
ThreadMapB0
>;
using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
// Use fragment iterator for A operand
using AccumulatorLayout = cutlass::layout::RowMajor;
using FragmentIteratorA1 =
cutlass::gemm::warp::MmaTensorOpFragmentIterator<
cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
MmaCore1::Shape::kK, //kBlocksColumn
ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
/// Define iterators over tiles from scale/bias vectors
using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
static int const kElementsPerAccess = 4;
using IteratorAccumulatorScaleBias =
cutlass::transform::threadblock::VectorIterator<
cutlass::transform::threadblock::PredicatedVectorAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
>;
// Warp-level iterators to load scale and bias vectors
using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
LayoutScaleBias, InstructionShape, kElementsPerAccess>;
using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
using IteratorB1 =
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
ElementB, layout::TensorCxRSKx<InterleavedK>,
ThreadMapB1
>;
using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
// Warp-level GEMM components
using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
using MmaPolicy0 = typename MmaCore0::MmaPolicy;
using MmaPolicy1 = typename MmaCore1::MmaPolicy;
// Define the Mma
using B2bMma = threadblock::B2bImplicitGemmMultistage<
ThreadblockShape0,
IteratorA0,
SmemIteratorA0,
arch::CacheOperation::Always,
IteratorB0,
SmemIteratorB0,
arch::CacheOperation::Global,
ThreadblockShape1,
FragmentIteratorA1,
IteratorAccumulatorScaleBias,
FragmentIteratorA1ScaleBias,
IteratorB1,
SmemIteratorB1,
arch::CacheOperation::Global,
EpilogueOutputOp0,
MmaPolicy0,
MmaPolicy1,
Stages
>;
// Define the epilogue
using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
ThreadblockShape1,
WarpMmaTensorOp1,
1,
EpilogueOutputOp1,
EpilogueOutputOp1::kCount,
InterleavedK
>::Epilogue;
// Define the kernel
using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
B2bMma,
Epilogue,
ThreadblockSwizzle,
conv::Operator::kFprop
>;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace kernel
} // namespace conv
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
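At this kernel level the multistage specializations differ from the 2-stage pipelined ones mainly in the free Stages parameter and the per-operand arch::CacheOperation hints passed to B2bImplicitGemmMultistage; each additional stage buffers another tile of the first GEMM's A0/B0 operands in shared memory. A back-of-the-envelope sketch of how that staging footprint grows with Stages, assuming a hypothetical 64x64x32 threadblock tile of half-precision data and ignoring CUTLASS padding and swizzling:

```
// Rough shared-memory staging estimate for the multistage mainloop
// (assumption: hypothetical tile sizes, no padding/swizzle overhead).
#include <cstdio>

int main() {
  int const kM = 64, kN = 64, kK = 32;   // hypothetical ThreadblockShape0
  int const bytes_per_element = 2;       // e.g. half_t
  for (int stages = 2; stages <= 4; ++stages) {
    int bytes = stages * (kM * kK + kK * kN) * bytes_per_element;
    std::printf("Stages = %d -> ~%d KiB staged for A0/B0 tiles\n", stages, bytes / 1024);
  }
  return 0;
}
```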

View File

@@ -0,0 +1,817 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief
Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped
matrix multiply-add with the appropriate threadblock-scoped epilogue.
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv2d.h"
#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h"
#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h"
#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h"
#include "cutlass/transform/threadblock/predicated_vector_access_iterator.h"
#include "cutlass/transform/threadblock/vector_iterator.h"
#include "cutlass/transform/warp/vector_fragment_iterator.h"
#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
#include "kernel/default_b2b_conv2d_fprop.h"
#include "kernel/b2b_implicit_gemm_convolution.h"
#include "threadblock/b2b_implicit_gemm_pipelined_smem_accumulator.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace conv {
namespace kernel {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm
/// and 2 stage pipeline.
/// Accumulator will be staged in shared memory.
template <
typename ElementA,
typename LayoutA,
typename ElementB,
typename LayoutB,
typename ElementC,
typename LayoutC,
typename ElementAccumulator,
typename ArchTag,
typename ThreadblockShape0,
typename ThreadblockShape1,
typename WarpShape0,
typename WarpShape1,
typename InstructionShape,
typename EpilogueOutputOp0,
typename EpilogueOutputOp1,
typename ThreadblockSwizzle,
typename MathOperatorTag
>
struct DefaultB2bConv2dFprop <
ElementA,
LayoutA,
ElementB,
LayoutB,
ElementC,
LayoutC,
ElementAccumulator,
arch::OpClassTensorOp,
ArchTag,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
ThreadblockSwizzle,
2,
MathOperatorTag,
IteratorAlgorithm::kAnalytic,
true
> {
// Define the core components from GEMM
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
2, MathOperatorTag>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
2, MathOperatorTag>;
// Define iterators over tiles from the A operand
using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
using IteratorA0 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
ElementA, LayoutA,
ThreadMapA0
>
>;
using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
// Define iterators over tiles from the B operand
using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
using IteratorB0 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
ElementB, LayoutB,
ThreadMapB0
>
>;
using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
/// Define iterators over tiles from scale/bias vectors
using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
static int const kElementsPerAccess = 2;
using IteratorAccumulatorScaleBias =
cutlass::transform::threadblock::VectorIterator<
cutlass::transform::threadblock::PredicatedVectorAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>,
ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
>;
// Define iterators over tiles from the B operand
using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
using IteratorB1 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
ElementB, LayoutB,
ThreadMapB1
>
>;
using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
// Warp-level GEMM components
using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
using MmaPolicy0 = typename MmaCore0::MmaPolicy;
using MmaPolicy1 = typename MmaCore1::MmaPolicy;
// Use fragment iterator for the accumulator
using SmemAccumulatorLayout = cutlass::layout::RowMajor;
using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
WarpShape0, InstructionShape,
ElementAccumulator,
typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
SmemAccumulatorLayout
>;
// Store Accumulator tiles to Shared Memory
using SmemIteratorD0 =
cutlass::epilogue::warp::TileIteratorTensorOp<
WarpShape0,
InstructionShape,
ElementC,
SmemAccumulatorLayout
>;
static int const kThreadCount = 32;
// load warp tile from Shared Memory accumulator
using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator<
MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA,
ElementA, SmemAccumulatorLayout,
MatrixShape<InstructionShape::kM, InstructionShape::kK>,
WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
// Define the Mma
using B2bMma = threadblock::B2bImplicitGemmPipelinedSmemAccumulator<
ThreadblockShape0,
IteratorA0,
SmemIteratorA0,
IteratorB0,
SmemIteratorB0,
IteratorAccumulatorScaleBias,
FragmentIteratorAccumulator,
SmemIteratorD0,
ThreadblockShape1,
WarpIteratorA1,
IteratorB1,
SmemIteratorB1,
ElementC,
LayoutC,
EpilogueOutputOp0,
MmaPolicy0,
MmaPolicy1
>;
// Define the epilogue
using Epilogue = typename detail::DefaultConvEpilogue<
ArchTag,
ThreadblockShape1,
WarpMmaTensorOp1,
1,
EpilogueOutputOp1
>::Epilogue;
// Define the kernel
using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
B2bMma,
Epilogue,
ThreadblockSwizzle,
conv::Operator::kFprop
>;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and 2 stage
/// pipeline with interleaved layout.
/// Accumulator will be staged in shared memory.
template <
typename ElementA,
typename ElementB,
typename ElementC,
typename LayoutC,
typename ElementAccumulator,
typename ArchTag,
typename ThreadblockShape0,
typename ThreadblockShape1,
typename WarpShape0,
typename WarpShape1,
typename InstructionShape,
typename EpilogueOutputOp0,
typename EpilogueOutputOp1,
typename ThreadblockSwizzle,
typename MathOperatorTag,
int InterleavedK
>
struct DefaultB2bConv2dFprop <
ElementA,
layout::TensorNCxHWx<InterleavedK>,
ElementB,
layout::TensorCxRSKx<InterleavedK>,
ElementC,
LayoutC,
ElementAccumulator,
arch::OpClassTensorOp,
ArchTag,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
ThreadblockSwizzle,
2,
MathOperatorTag,
IteratorAlgorithm::kAnalytic,
true
> {
// Define the core components from GEMM
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
ElementB, layout::RowMajorInterleaved<InterleavedK>,
ElementAccumulator, LayoutC, arch::OpClassTensorOp,
2, MathOperatorTag, true>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
ElementB, layout::RowMajorInterleaved<InterleavedK>,
ElementAccumulator, LayoutC, arch::OpClassTensorOp,
2, MathOperatorTag, true>;
// Define iterators over tiles from the A operand
// Note: the GEMM shared memory threadmap is used here because the conv fprop global
// memory layout resembles the crosswise layout used by the interleaved GEMM shared
// memory threadmap, whereas the interleaved GEMM global memory layout resembles the
// congruous layout.
using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
using IteratorA0 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
ElementA, layout::TensorNCxHWx<InterleavedK>,
ThreadMapA0
>
>;
using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
// Define iterators over tiles from the B operand
// Note: the GEMM shared memory threadmap is used here because the conv fprop global
// memory layout resembles the crosswise layout used by the interleaved GEMM shared
// memory threadmap, whereas the interleaved GEMM global memory layout resembles the
// congruous layout.
using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
using IteratorB0 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
ElementB, layout::TensorCxRSKx<InterleavedK>,
ThreadMapB0
>
>;
using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
/// Define iterators over tiles from scale/bias vectors
using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
static int const kElementsPerAccess = 4; //For interleaved layout
using IteratorAccumulatorScaleBias =
cutlass::transform::threadblock::VectorIterator<
cutlass::transform::threadblock::PredicatedVectorAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>,
ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
>;
// Define iterators over tiles from the B operand
using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
using IteratorB1 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
ElementB, layout::TensorCxRSKx<InterleavedK>,
ThreadMapB1
>
>;
using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
// Warp-level GEMM components
using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
using MmaPolicy0 = typename MmaCore0::MmaPolicy;
using MmaPolicy1 = typename MmaCore1::MmaPolicy;
// Use fragment iterator for the accumulator
using SmemAccumulatorLayout = cutlass::layout::ColumnMajorInterleaved<16>;
using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
WarpShape0, InstructionShape,
ElementAccumulator,
typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
SmemAccumulatorLayout
>;
// Store Accumulator tiles to Shared Memory
using SmemIteratorD0 =
cutlass::epilogue::warp::TileIteratorTensorOp<
WarpShape0,
InstructionShape,
ElementC,
SmemAccumulatorLayout
>;
static int const kThreadCount = 32;
// load warp tile from Shared Memory accumulator
using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIteratorCanonical<
MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA,
ElementA, SmemAccumulatorLayout,
MatrixShape<InstructionShape::kM, InstructionShape::kK>,
WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
// Define the Mma
using B2bMma = threadblock::B2bImplicitGemmPipelinedSmemAccumulator<
ThreadblockShape0,
IteratorA0,
SmemIteratorA0,
IteratorB0,
SmemIteratorB0,
IteratorAccumulatorScaleBias,
FragmentIteratorAccumulator,
SmemIteratorD0,
ThreadblockShape1,
WarpIteratorA1,
IteratorB1,
SmemIteratorB1,
ElementC,
LayoutC,
EpilogueOutputOp0,
MmaPolicy0,
MmaPolicy1
>;
// Define the epilogue
using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
ThreadblockShape1,
WarpMmaTensorOp1,
1,
EpilogueOutputOp1,
EpilogueOutputOp1::kCount,
InterleavedK
>::Epilogue;
// Define the kernel
using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
B2bMma,
Epilogue,
ThreadblockSwizzle,
conv::Operator::kFprop
>;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm
/// and 2 stage pipeline.
/// Accumulator will be staged in shared memory.
template <
typename ElementA,
typename LayoutA,
typename ElementB,
typename LayoutB,
typename ElementC,
typename LayoutC,
typename ElementAccumulator,
typename ArchTag,
typename ThreadblockShape0,
typename ThreadblockShape1,
typename WarpShape0,
typename WarpShape1,
typename InstructionShape,
typename EpilogueOutputOp0,
typename EpilogueOutputOp1,
typename ThreadblockSwizzle,
typename MathOperatorTag
>
struct DefaultB2bConv2dFprop <
ElementA,
LayoutA,
ElementB,
LayoutB,
ElementC,
LayoutC,
ElementAccumulator,
arch::OpClassTensorOp,
ArchTag,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
ThreadblockSwizzle,
2,
MathOperatorTag,
IteratorAlgorithm::kOptimized,
true
> {
// Define the core components from GEMM
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
2, MathOperatorTag>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
2, MathOperatorTag>;
// Define iterators over tiles from the A operand
using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
using IteratorA0 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
ElementA, LayoutA,
ThreadMapA0
>
>;
using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
// Define iterators over tiles from the B operand
using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
using IteratorB0 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
ElementB, LayoutB,
ThreadMapB0
>
>;
using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
/// Define iterators over tiles from scale/bias vectors
using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
static int const kElementsPerAccess = 2;
using IteratorAccumulatorScaleBias =
cutlass::transform::threadblock::VectorIterator<
cutlass::transform::threadblock::PredicatedVectorAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>,
ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
>;
// Define iterators over tiles from the B operand
using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
using IteratorB1 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
ElementB, LayoutB,
ThreadMapB1
>
>;
using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
// Warp-level GEMM components
using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
using MmaPolicy0 = typename MmaCore0::MmaPolicy;
using MmaPolicy1 = typename MmaCore1::MmaPolicy;
// Use fragment iterator for the accumulator
using SmemAccumulatorLayout = cutlass::layout::RowMajor;
using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
WarpShape0, InstructionShape,
ElementAccumulator,
typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
SmemAccumulatorLayout
>;
// Store Accumulator tiles to Shared Memory
using SmemIteratorD0 =
cutlass::epilogue::warp::TileIteratorTensorOp<
WarpShape0,
InstructionShape,
ElementC,
SmemAccumulatorLayout
>;
static int const kThreadCount = 32;
// load warp tile from Shared Memory accumulator
using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator<
MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA,
ElementA, SmemAccumulatorLayout,
MatrixShape<InstructionShape::kM, InstructionShape::kK>,
WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
// Define the Mma
using B2bMma = threadblock::B2bImplicitGemmPipelinedSmemAccumulator<
ThreadblockShape0,
IteratorA0,
SmemIteratorA0,
IteratorB0,
SmemIteratorB0,
IteratorAccumulatorScaleBias,
FragmentIteratorAccumulator,
SmemIteratorD0,
ThreadblockShape1,
WarpIteratorA1,
IteratorB1,
SmemIteratorB1,
ElementC,
LayoutC,
EpilogueOutputOp0,
MmaPolicy0,
MmaPolicy1
>;
// Define the epilogue
using Epilogue = typename detail::DefaultConvEpilogue<
ArchTag,
ThreadblockShape1,
WarpMmaTensorOp1,
1,
EpilogueOutputOp1
>::Epilogue;
// Define the kernel
using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
B2bMma,
Epilogue,
ThreadblockSwizzle,
conv::Operator::kFprop
>;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and 2 stage
/// pipeline with interleaved layout.
/// Accumulator will be staged in shared memory.
template <
typename ElementA,
typename ElementB,
typename ElementC,
typename LayoutC,
typename ElementAccumulator,
typename ArchTag,
typename ThreadblockShape0,
typename ThreadblockShape1,
typename WarpShape0,
typename WarpShape1,
typename InstructionShape,
typename EpilogueOutputOp0,
typename EpilogueOutputOp1,
typename ThreadblockSwizzle,
typename MathOperatorTag,
int InterleavedK
>
struct DefaultB2bConv2dFprop <
ElementA,
layout::TensorNCxHWx<InterleavedK>,
ElementB,
layout::TensorCxRSKx<InterleavedK>,
ElementC,
LayoutC,
ElementAccumulator,
arch::OpClassTensorOp,
ArchTag,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
ThreadblockSwizzle,
2,
MathOperatorTag,
IteratorAlgorithm::kOptimized,
true
> {
// Define the core components from GEMM
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
ElementB, layout::RowMajorInterleaved<InterleavedK>,
ElementAccumulator, LayoutC, arch::OpClassTensorOp,
2, MathOperatorTag, true>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
ElementB, layout::RowMajorInterleaved<InterleavedK>,
ElementAccumulator, LayoutC, arch::OpClassTensorOp,
2, MathOperatorTag, true>;
// Define iterators over tiles from the A operand
// Note: the GEMM shared memory threadmap is used here because the conv fprop global
// memory layout resembles the crosswise layout used by the interleaved GEMM shared
// memory threadmap, whereas the interleaved GEMM global memory layout resembles the
// congruous layout.
using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
using IteratorA0 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
ElementA, layout::TensorNCxHWx<InterleavedK>,
ThreadMapA0
>
>;
using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
// Define iterators over tiles from the B operand
// Note: the GEMM shared memory threadmap is used here because the conv fprop global
// memory layout resembles the crosswise layout used by the interleaved GEMM shared
// memory threadmap, whereas the interleaved GEMM global memory layout resembles the
// congruous layout.
using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
using IteratorB0 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
ElementB, layout::TensorCxRSKx<InterleavedK>,
ThreadMapB0
>
>;
using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
/// Define iterators over tiles from scale/bias vectors
using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
static int const kElementsPerAccess = 4; //For interleaved layout
using IteratorAccumulatorScaleBias =
cutlass::transform::threadblock::VectorIterator<
cutlass::transform::threadblock::PredicatedVectorAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>,
ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
>;
using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
using IteratorB1 =
cutlass::conv::threadblock::TileIterator<
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
ElementB, layout::TensorCxRSKx<InterleavedK>,
ThreadMapB1
>
>;
using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
// Warp-level GEMM components
using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
using MmaPolicy0 = typename MmaCore0::MmaPolicy;
using MmaPolicy1 = typename MmaCore1::MmaPolicy;
// Use fragment iterator for the accumulator
using SmemAccumulatorLayout = cutlass::layout::ColumnMajorInterleaved<16>;
using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
WarpShape0, InstructionShape,
ElementAccumulator,
typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
SmemAccumulatorLayout
>;
// Store Accumulator tiles to Shared Memory
using SmemIteratorD0 =
cutlass::epilogue::warp::TileIteratorTensorOp<
WarpShape0,
InstructionShape,
ElementC,
SmemAccumulatorLayout
>;
static int const kThreadCount = 32;
// load warp tile from Shared Memory accumulator
using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIteratorCanonical<
MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA,
ElementA, SmemAccumulatorLayout,
MatrixShape<InstructionShape::kM, InstructionShape::kK>,
WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
// Define the Mma
using B2bMma = threadblock::B2bImplicitGemmPipelinedSmemAccumulator<
ThreadblockShape0,
IteratorA0,
SmemIteratorA0,
IteratorB0,
SmemIteratorB0,
IteratorAccumulatorScaleBias,
FragmentIteratorAccumulator,
SmemIteratorD0,
ThreadblockShape1,
WarpIteratorA1,
IteratorB1,
SmemIteratorB1,
ElementC,
LayoutC,
EpilogueOutputOp0,
MmaPolicy0,
MmaPolicy1
>;
// Define the epilogue
using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
ThreadblockShape1,
WarpMmaTensorOp1,
1,
EpilogueOutputOp1,
EpilogueOutputOp1::kCount,
InterleavedK
>::Epilogue;
// Define the kernel
using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
B2bMma,
Epilogue,
ThreadblockSwizzle,
conv::Operator::kFprop
>;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace kernel
} // namespace conv
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
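A corresponding sketch for the 2-stage, shared-memory-staged specializations in this header: the trailing true selects the accumulator-in-shared-memory variant, and the stage count is fixed at 2 by these partial specializations. Everything concrete below (types, layouts, tile shapes, vector widths) is an illustrative placeholder rather than a required configuration, again assuming the CUTLASS headers and this example's local directories.

// Hypothetical fp16 NHWC instantiation of the 2-stage pipelined, shared-memory
// accumulator specialization above. Concrete values are illustrative only.
using ElementAB          = cutlass::half_t;
using ElementC           = cutlass::half_t;
using ElementAccumulator = float;

// Vector widths are illustrative; the fused pipeline imposes its own constraints.
using EpilogueOutputOp0 = cutlass::epilogue::thread::LinearCombinationRelu<
    ElementC, 128 / cutlass::sizeof_bits<ElementC>::value,
    ElementAccumulator, ElementAccumulator>;
using EpilogueOutputOp1 = EpilogueOutputOp0;

using B2bConv2dFpropSmemKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
    ElementAB, cutlass::layout::TensorNHWC,
    ElementAB, cutlass::layout::TensorNHWC,
    ElementC,  cutlass::layout::TensorNHWC,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm75,
    cutlass::gemm::GemmShape<64, 64, 32>,     // ThreadblockShape0
    cutlass::gemm::GemmShape<64, 128, 32>,    // ThreadblockShape1
    cutlass::gemm::GemmShape<32, 32, 32>,     // WarpShape0
    cutlass::gemm::GemmShape<32, 64, 32>,     // WarpShape1
    cutlass::gemm::GemmShape<16, 8, 8>,       // InstructionShape (Turing fp16 Tensor Core)
    EpilogueOutputOp0,
    EpilogueOutputOp1,
    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
    2,                                        // Stages (fixed at 2 by this specialization)
    cutlass::arch::OpMultiplyAdd,
    cutlass::conv::IteratorAlgorithm::kOptimized,
    true                                      // SmemAccumulator: stage accumulators in shared memory
>::Kernel;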


@ -0,0 +1,804 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief
Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped
matrix multiply-add with the appropriate threadblock-scoped epilogue.
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv2d.h"
#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h"
#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h"
#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h"
#include "cutlass/transform/threadblock/predicated_vector_access_iterator.h"
#include "cutlass/transform/threadblock/vector_iterator.h"
#include "cutlass/transform/warp/vector_fragment_iterator.h"
#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
#include "kernel/default_b2b_conv2d_fprop.h"
#include "kernel/b2b_implicit_gemm_convolution.h"
#include "threadblock/b2b_implicit_gemm_multistage_smem_accumulator.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace conv {
namespace kernel {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage
/// pipeline.
/// Accumulator will be staged in shared memory.
template <
typename ElementA,
typename LayoutA,
typename ElementB,
typename LayoutB,
typename ElementC,
typename LayoutC,
typename ElementAccumulator,
typename ArchTag,
typename ThreadblockShape0,
typename ThreadblockShape1,
typename WarpShape0,
typename WarpShape1,
typename InstructionShape,
typename EpilogueOutputOp0,
typename EpilogueOutputOp1,
typename ThreadblockSwizzle,
int Stages,
typename MathOperatorTag
>
struct DefaultB2bConv2dFprop <
ElementA,
LayoutA,
ElementB,
LayoutB,
ElementC,
LayoutC,
ElementAccumulator,
arch::OpClassTensorOp,
ArchTag,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
ThreadblockSwizzle,
Stages,
MathOperatorTag,
IteratorAlgorithm::kAnalytic,
true
> {
// Define the core components from GEMM
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
Stages, MathOperatorTag>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
Stages, MathOperatorTag>;
// Define iterators over tiles from the A operand
using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
using IteratorA0 =
cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
ElementA, LayoutA,
ThreadMapA0
>;
using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
// Define iterators over tiles from the B operand
using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
using IteratorB0 =
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
ElementB, LayoutB,
ThreadMapB0
>;
using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
/// Define iterators over tiles from scale/bias vectors
using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
static int const kElementsPerAccess = 2;
using IteratorAccumulatorScaleBias =
cutlass::transform::threadblock::VectorIterator<
cutlass::transform::threadblock::PredicatedVectorAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>,
ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
>;
// Define iterators over tiles from the B operand
using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
using IteratorB1 =
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
ElementB, LayoutB,
ThreadMapB1
>;
using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
// Warp-level GEMM components
using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
using MmaPolicy0 = typename MmaCore0::MmaPolicy;
using MmaPolicy1 = typename MmaCore1::MmaPolicy;
// Use fragment iterator for the accumulator
using SmemAccumulatorLayout = cutlass::layout::RowMajor;
using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
WarpShape0, InstructionShape,
ElementAccumulator,
typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
SmemAccumulatorLayout
>;
// Store Accumulator tiles to Shared Memory
using SmemIteratorD0 =
cutlass::epilogue::warp::TileIteratorTensorOp<
WarpShape0,
InstructionShape,
ElementC,
SmemAccumulatorLayout
>;
static int const kThreadCount = 32;
// load warp tile from Shared Memory accumulator
using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator<
MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA,
ElementA, SmemAccumulatorLayout,
MatrixShape<InstructionShape::kM, InstructionShape::kK>,
WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
// Define the Mma
using B2bMma = threadblock::B2bImplicitGemmMultistageSmemAccumulator<
ThreadblockShape0,
IteratorA0,
SmemIteratorA0,
arch::CacheOperation::Always,
IteratorB0,
SmemIteratorB0,
arch::CacheOperation::Global,
IteratorAccumulatorScaleBias,
FragmentIteratorAccumulator,
SmemIteratorD0,
ThreadblockShape1,
WarpIteratorA1,
IteratorB1,
SmemIteratorB1,
arch::CacheOperation::Global,
EpilogueOutputOp0,
MmaPolicy0,
MmaPolicy1,
Stages
>;
// Define the epilogue
using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
ThreadblockShape1,
WarpMmaTensorOp1,
1,
EpilogueOutputOp1,
EpilogueOutputOp1::kCount
>::Epilogue;
// Define the kernel
using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
B2bMma,
Epilogue,
ThreadblockSwizzle,
conv::Operator::kFprop
>;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage
/// pipeline with interleaved layout.
/// Accumulator will be staged in shared memory.
template <
typename ElementA,
typename ElementB,
typename ElementC,
typename LayoutC,
typename ElementAccumulator,
typename ArchTag,
typename ThreadblockShape0,
typename ThreadblockShape1,
typename WarpShape0,
typename WarpShape1,
typename InstructionShape,
typename EpilogueOutputOp0,
typename EpilogueOutputOp1,
typename ThreadblockSwizzle,
int Stages,
typename MathOperatorTag,
int InterleavedK
>
struct DefaultB2bConv2dFprop <
ElementA,
layout::TensorNCxHWx<InterleavedK>,
ElementB,
layout::TensorCxRSKx<InterleavedK>,
ElementC,
LayoutC,
ElementAccumulator,
arch::OpClassTensorOp,
ArchTag,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
ThreadblockSwizzle,
Stages,
MathOperatorTag,
IteratorAlgorithm::kAnalytic,
true
> {
// Define the core components from GEMM
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
ElementB, layout::RowMajorInterleaved<InterleavedK>,
ElementAccumulator, LayoutC, arch::OpClassTensorOp,
Stages, MathOperatorTag, true>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
ElementB, layout::RowMajorInterleaved<InterleavedK>,
ElementAccumulator, LayoutC, arch::OpClassTensorOp,
Stages, MathOperatorTag, true>;
// Define iterators over tiles from the A operand
// Note: the GEMM shared memory threadmap is used here because the conv fprop global
// memory layout resembles the crosswise layout used by the interleaved GEMM shared
// memory threadmap, whereas the interleaved GEMM global memory layout resembles the
// congruous layout.
using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
using IteratorA0 =
cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
ElementA, layout::TensorNCxHWx<InterleavedK>,
ThreadMapA0
>;
using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
// Define iterators over tiles from the B operand
// Note: the GEMM shared memory threadmap is used here because the conv fprop global
// memory layout resembles the crosswise layout used by the interleaved GEMM shared
// memory threadmap, whereas the interleaved GEMM global memory layout resembles the
// congruous layout.
using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
using IteratorB0 =
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
ElementB, layout::TensorCxRSKx<InterleavedK>,
ThreadMapB0
>;
using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
/// Define iterators over tiles from scale/bias vectors
using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
static int const kElementsPerAccess = 4;
using IteratorAccumulatorScaleBias =
cutlass::transform::threadblock::VectorIterator<
cutlass::transform::threadblock::PredicatedVectorAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>,
ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
>;
using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
using IteratorB1 =
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
ElementB, layout::TensorCxRSKx<InterleavedK>,
ThreadMapB1
>;
using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
// Warp-level GEMM components
using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
using MmaPolicy0 = typename MmaCore0::MmaPolicy;
using MmaPolicy1 = typename MmaCore1::MmaPolicy;
// Use fragment iterator for the accumulator
using SmemAccumulatorLayout = cutlass::layout::ColumnMajorInterleaved<16>;
using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
WarpShape0, InstructionShape,
ElementAccumulator,
typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
SmemAccumulatorLayout
>;
// Store Accumulator tiles to Shared Memory
using SmemIteratorD0 =
cutlass::epilogue::warp::TileIteratorTensorOp<
WarpShape0,
InstructionShape,
ElementC,
SmemAccumulatorLayout
>;
static int const kThreadCount = 32;
// load warp tile from Shared Memory accumulator
using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIteratorCanonical<
MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA,
ElementA, SmemAccumulatorLayout,
MatrixShape<InstructionShape::kM, InstructionShape::kK>,
WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
// Define the Mma
using B2bMma = threadblock::B2bImplicitGemmMultistageSmemAccumulator<
ThreadblockShape0,
IteratorA0,
SmemIteratorA0,
arch::CacheOperation::Always,
IteratorB0,
SmemIteratorB0,
arch::CacheOperation::Global,
IteratorAccumulatorScaleBias,
FragmentIteratorAccumulator,
SmemIteratorD0,
ThreadblockShape1,
WarpIteratorA1,
IteratorB1,
SmemIteratorB1,
arch::CacheOperation::Global,
EpilogueOutputOp0,
MmaPolicy0,
MmaPolicy1,
Stages
>;
// Define the epilogue
using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
ThreadblockShape1,
WarpMmaTensorOp1,
1,
EpilogueOutputOp1,
EpilogueOutputOp1::kCount,
InterleavedK
>::Epilogue;
// Define the kernel
using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
B2bMma,
Epilogue,
ThreadblockSwizzle,
conv::Operator::kFprop
>;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and
/// multistage pipeline.
/// Accumulator will be staged in shared memory.
template <
typename ElementA,
typename LayoutA,
typename ElementB,
typename LayoutB,
typename ElementC,
typename LayoutC,
typename ElementAccumulator,
typename ArchTag,
typename ThreadblockShape0,
typename ThreadblockShape1,
typename WarpShape0,
typename WarpShape1,
typename InstructionShape,
typename EpilogueOutputOp0,
typename EpilogueOutputOp1,
typename ThreadblockSwizzle,
int Stages,
typename MathOperatorTag
>
struct DefaultB2bConv2dFprop <
ElementA,
LayoutA,
ElementB,
LayoutB,
ElementC,
LayoutC,
ElementAccumulator,
arch::OpClassTensorOp,
ArchTag,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
ThreadblockSwizzle,
Stages,
MathOperatorTag,
IteratorAlgorithm::kOptimized,
true
> {
// Define the core components from GEMM
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
Stages, MathOperatorTag>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
Stages, MathOperatorTag>;
// Define iterators over tiles from the A operand
using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
using IteratorA0 =
cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
ElementA, LayoutA,
ThreadMapA0
>;
using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
// Define iterators over tiles from the B operand
using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
using IteratorB0 =
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
ElementB, LayoutB,
ThreadMapB0
>;
using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
/// Define iterators over tiles from scale/bias vectors
using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
static int const kElementsPerAccess = 2;
using IteratorAccumulatorScaleBias =
cutlass::transform::threadblock::VectorIterator<
cutlass::transform::threadblock::PredicatedVectorAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>,
ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
>;
// Define iterators over tiles from the B operand
using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
using IteratorB1 =
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
ElementB, LayoutB,
ThreadMapB1
>;
using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
// Warp-level GEMM components
using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
using MmaPolicy0 = typename MmaCore0::MmaPolicy;
using MmaPolicy1 = typename MmaCore1::MmaPolicy;
// Use fragment iterator for the accumulator
using SmemAccumulatorLayout = cutlass::layout::RowMajor;
using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
WarpShape0, InstructionShape,
ElementAccumulator,
typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
SmemAccumulatorLayout
>;
// Store Accumulator tiles to Shared Memory
using SmemIteratorD0 =
cutlass::epilogue::warp::TileIteratorTensorOp<
WarpShape0,
InstructionShape,
ElementC,
SmemAccumulatorLayout
>;
static int const kThreadCount = 32;
// load warp tile from Shared Memory accumulator
using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator<
MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA,
ElementA, SmemAccumulatorLayout,
MatrixShape<InstructionShape::kM, InstructionShape::kK>,
WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
// Define the Mma
using B2bMma = threadblock::B2bImplicitGemmMultistageSmemAccumulator<
ThreadblockShape0,
IteratorA0,
SmemIteratorA0,
arch::CacheOperation::Always,
IteratorB0,
SmemIteratorB0,
arch::CacheOperation::Global,
IteratorAccumulatorScaleBias,
FragmentIteratorAccumulator,
SmemIteratorD0,
ThreadblockShape1,
WarpIteratorA1,
IteratorB1,
SmemIteratorB1,
arch::CacheOperation::Global,
EpilogueOutputOp0,
MmaPolicy0,
MmaPolicy1,
Stages
>;
// Define the epilogue
using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
ThreadblockShape1,
WarpMmaTensorOp1,
1,
EpilogueOutputOp1,
EpilogueOutputOp1::kCount
>::Epilogue;
// Define the kernel
using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
B2bMma,
Epilogue,
ThreadblockSwizzle,
conv::Operator::kFprop
>;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and
/// multistage pipeline with interleaved layout.
/// Accumulator will be staged in shared memory.
template <
typename ElementA,
typename ElementB,
typename ElementC,
typename LayoutC,
typename ElementAccumulator,
typename ArchTag,
typename ThreadblockShape0,
typename ThreadblockShape1,
typename WarpShape0,
typename WarpShape1,
typename InstructionShape,
typename EpilogueOutputOp0,
typename EpilogueOutputOp1,
typename ThreadblockSwizzle,
int Stages,
typename MathOperatorTag,
int InterleavedK
>
struct DefaultB2bConv2dFprop <
ElementA,
layout::TensorNCxHWx<InterleavedK>,
ElementB,
layout::TensorCxRSKx<InterleavedK>,
ElementC,
LayoutC,
ElementAccumulator,
arch::OpClassTensorOp,
ArchTag,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
ThreadblockSwizzle,
Stages,
MathOperatorTag,
IteratorAlgorithm::kOptimized,
true
> {
// Define the core components from GEMM
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
ElementB, layout::RowMajorInterleaved<InterleavedK>,
ElementAccumulator, LayoutC, arch::OpClassTensorOp,
Stages, MathOperatorTag, true>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
ElementB, layout::RowMajorInterleaved<InterleavedK>,
ElementAccumulator, LayoutC, arch::OpClassTensorOp,
Stages, MathOperatorTag, true>;
// Define iterators over tiles from the A operand
// Note: the GEMM shared memory threadmap is used here because the conv fprop global
// memory layout resembles the crosswise layout used by the interleaved GEMM shared
// memory threadmap, whereas the interleaved GEMM global memory layout resembles the
// congruous layout.
using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
using IteratorA0 =
cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
ElementA, layout::TensorNCxHWx<InterleavedK>,
ThreadMapA0
>;
using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
// Define iterators over tiles from the B operand
// Note: the GEMM shared memory threadmap is used here because the conv fprop global
// memory layout resembles the crosswise layout used by the interleaved GEMM shared
// memory threadmap, whereas the interleaved GEMM global memory layout resembles the
// congruous layout.
using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
using IteratorB0 =
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
ElementB, layout::TensorCxRSKx<InterleavedK>,
ThreadMapB0
>;
using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
/// Define iterators over tiles from scale/bias vectors
using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
static int const kElementsPerAccess = 4;
using IteratorAccumulatorScaleBias =
cutlass::transform::threadblock::VectorIterator<
cutlass::transform::threadblock::PredicatedVectorAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>,
ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
>;
using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
using IteratorB1 =
cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
ElementB, layout::TensorCxRSKx<InterleavedK>,
ThreadMapB1
>;
using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
// Warp-level GEMM components
using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
using MmaPolicy0 = typename MmaCore0::MmaPolicy;
using MmaPolicy1 = typename MmaCore1::MmaPolicy;
// Use fragment iterator for the accumulator
using SmemAccumulatorLayout = cutlass::layout::ColumnMajorInterleaved<16>;
using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
WarpShape0, InstructionShape,
ElementAccumulator,
typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
SmemAccumulatorLayout
>;
// Store Accumulator tiles to Shared Memory
using SmemIteratorD0 =
cutlass::epilogue::warp::TileIteratorTensorOp<
WarpShape0,
InstructionShape,
ElementC,
SmemAccumulatorLayout
>;
static int const kThreadCount = 32;
// load warp tile from Shared Memory accumulator
using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIteratorCanonical<
MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA,
ElementA, SmemAccumulatorLayout,
MatrixShape<InstructionShape::kM, InstructionShape::kK>,
WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
// Define the Mma
using B2bMma = threadblock::B2bImplicitGemmMultistageSmemAccumulator<
ThreadblockShape0,
IteratorA0,
SmemIteratorA0,
arch::CacheOperation::Always,
IteratorB0,
SmemIteratorB0,
arch::CacheOperation::Global,
IteratorAccumulatorScaleBias,
FragmentIteratorAccumulator,
SmemIteratorD0,
ThreadblockShape1,
WarpIteratorA1,
IteratorB1,
SmemIteratorB1,
arch::CacheOperation::Global,
EpilogueOutputOp0,
MmaPolicy0,
MmaPolicy1,
Stages
>;
// Define the epilogue
using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
ThreadblockShape1,
WarpMmaTensorOp1,
1,
EpilogueOutputOp1,
EpilogueOutputOp1::kCount,
InterleavedK
>::Epilogue;
// Define the kernel
using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
B2bMma,
Epilogue,
ThreadblockSwizzle,
conv::Operator::kFprop
>;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace kernel
} // namespace conv
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
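The multistage shared-memory-accumulator specializations above follow the same recipe as the 2-stage sketch earlier; only the architecture tag, instruction shape, and stage count change. A compressed, equally illustrative sketch:

// Hypothetical Sm80 fp16 instantiation of the multistage shared-memory accumulator
// specialization above; all concrete values remain illustrative.
using EpilogueOutputOpSm80 = cutlass::epilogue::thread::LinearCombinationRelu<
    cutlass::half_t, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, float, float>;

using B2bConv2dFpropSm80SmemKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
    cutlass::half_t, cutlass::layout::TensorNHWC,
    cutlass::half_t, cutlass::layout::TensorNHWC,
    cutlass::half_t, cutlass::layout::TensorNHWC,
    float,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 32>,     // ThreadblockShape0
    cutlass::gemm::GemmShape<64, 128, 32>,    // ThreadblockShape1
    cutlass::gemm::GemmShape<32, 32, 32>,     // WarpShape0
    cutlass::gemm::GemmShape<32, 64, 32>,     // WarpShape1
    cutlass::gemm::GemmShape<16, 8, 16>,      // InstructionShape (Ampere fp16 Tensor Core)
    EpilogueOutputOpSm80, EpilogueOutputOpSm80,
    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
    3,                                        // Stages
    cutlass::arch::OpMultiplyAdd,
    cutlass::conv::IteratorAlgorithm::kOptimized,
    true                                      // SmemAccumulator
>::Kernel;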


@ -1,29 +1,34 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
*modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice,
*this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
*notice, this list of conditions and the following disclaimer in the
*documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its
*contributors may be used to endorse or promote products derived from this
*software without specific prior written permission.
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
*DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT,
*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
*DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
*OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TOR (INCLUDING
*NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
*EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief
Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
@ -113,11 +118,80 @@ template <
bool SplitKSerial,
/// Operation performed by GEMM
typename Operator,
/// Beta is zero or not
bool IsBetaZero = false
/// Stage accumulator in shared memory
bool SmemAccumulator = false
>
struct DefaultB2bGemm;
////////////////////////////////////////////////////////////////////////////////
/// Partial specialization for Ampere Architecture
template <
/// Element type for A matrix operand
typename ElementA,
/// Layout type for A matrix operand
typename LayoutA,
/// Access granularity of A matrix in units of elements
int kAlignmentA,
/// Element type for B matrix operand
typename ElementB,
/// Layout type for B matrix operand
typename LayoutB,
/// Access granularity of B matrix in units of elements
int kAlignmentB,
/// Element type for C and D matrix operands
typename ElementC,
/// Element type for internal accumulation
typename ElementAccumulator,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape0,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape1,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape0,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape1,
/// Instruction-level tile size (concept: GemmShape)
typename InstructionShape,
/// Epilogue output operator
typename EpilogueOutputOp0,
/// Epilogue output operator
typename EpilogueOutputOp1,
/// Threadblock-level swizzling operator
typename ThreadblockSwizzle,
/// Number of stages used in the pipelined mainloop
int Stages,
/// If true, kernel is configured to support serial reduction in the
/// epilogue
bool SplitKSerial,
/// Operation performed by GEMM
typename Operator>
struct DefaultB2bGemm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC,
layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
arch::Sm80, ThreadblockShape0, ThreadblockShape1,
WarpShape0, WarpShape1, InstructionShape,
EpilogueOutputOp0, EpilogueOutputOp1, ThreadblockSwizzle, Stages, SplitKSerial,
Operator> {
/// Define the threadblock-scoped matrix multiply-accumulate
using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<
ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
InstructionShape, Stages, Operator, EpilogueOutputOp0>::ThreadblockB2bMma;
static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK;
/// Define the epilogue
using Epilogue =
typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1,
EpilogueOutputOp1::kCount>::Epilogue;
/// Define the kernel-level GEMM operator.
using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
};
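A hypothetical instantiation of this Ampere specialization, shown for orientation only. The tile shapes and epilogue vector widths follow the pattern of CUTLASS's two-tensor-op-fusion sample, the enclosing namespace is assumed to be cutlass::gemm::kernel, and none of the concrete values below are mandated by the specialization itself.

// Hypothetical fp16 Sm80 instantiation of the register-file-resident B2B GEMM
// specialization above. Concrete values are illustrative.
using ElementAB        = cutlass::half_t;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;

// The first epilogue is applied to per-thread accumulator fragments, so its vector
// width here follows the fusion sample's pattern; the second uses 128-bit stores.
using EpilogueOutputOp0 = cutlass::epilogue::thread::LinearCombinationRelu<
    ElementAB, InstructionShape::kM * InstructionShape::kN / 32,
    float, float, cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling>;
using EpilogueOutputOp1 = cutlass::epilogue::thread::LinearCombinationRelu<
    ElementAB, 128 / cutlass::sizeof_bits<ElementAB>::value, float, float>;

using B2bGemmFp16Sm80Kernel = typename cutlass::gemm::kernel::DefaultB2bGemm<
    ElementAB, cutlass::layout::RowMajor, 8,      // A, LayoutA, kAlignmentA
    ElementAB, cutlass::layout::ColumnMajor, 8,   // B, LayoutB, kAlignmentB
    ElementAB, cutlass::layout::RowMajor,         // C, LayoutC (fixed to RowMajor here)
    float,                                        // ElementAccumulator
    cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 32>,         // ThreadblockShape0
    cutlass::gemm::GemmShape<64, 128, 32>,        // ThreadblockShape1
    cutlass::gemm::GemmShape<32, 64, 32>,         // WarpShape0 (RF-resident fusion typically requires kN == ThreadblockShape0::kN)
    cutlass::gemm::GemmShape<32, 128, 32>,        // WarpShape1
    InstructionShape,
    EpilogueOutputOp0, EpilogueOutputOp1,
    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
    3,                                            // Stages
    false,                                        // SplitKSerial
    cutlass::arch::OpMultiplyAdd
>::B2bGemmKernel;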
////////////////////////////////////////////////////////////////////////////////
/// Partial specialization for Turing Architecture
@@ -217,7 +291,82 @@ struct DefaultB2bGemm<
};
/// Partial specialization for Turing IMMA Interleaved layout
/// Partial specialization for Ampere Integer Matrix Multiply Interleaved layout
template <
/// Element type for A matrix operand
typename ElementA,
/// Access granularity of A matrix in units of elements
int kAlignmentA,
/// Element type for B matrix operand
typename ElementB,
/// Access granularity of B matrix in units of elements
int kAlignmentB,
/// Element type for C and D matrix operands
typename ElementC,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape0,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape1,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape0,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape1,
/// Warp-level tile size (concept: GemmShape)
typename InstructionShape,
/// Epilogue output operator
typename EpilogueOutputOp0,
/// Epilogue output operator
typename EpilogueOutputOp1,
/// Threadblock-level swizzling operator
typename ThreadblockSwizzle,
/// Number of stages used in the pipelined mainloop
int Stages,
/// Number of Interleaved k
int InterleavedK,
/// If true, kernel is configured to support serial reduction in the
/// epilogue
bool SplitKSerial,
/// Operation performed by GEMM
typename Operator>
struct DefaultB2bGemm<
ElementA, layout::ColumnMajorInterleaved<InterleavedK>, kAlignmentA,
ElementB, layout::RowMajorInterleaved<InterleavedK>, kAlignmentB,
ElementC, layout::ColumnMajorInterleaved<InterleavedK>, int32_t,
arch::OpClassTensorOp, arch::Sm80,
ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
InstructionShape, EpilogueOutputOp0, EpilogueOutputOp1,
ThreadblockSwizzle, Stages,
SplitKSerial, Operator> {
using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;
using ElementAccumulator = int32_t;
/// Define the threadblock-scoped matrix multiply-accumulate
using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<
ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
ElementAccumulator, LayoutC, arch::OpClassTensorOp, arch::Sm80,
ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
InstructionShape, Stages, Operator, EpilogueOutputOp0,
true>::ThreadblockB2bMma;
static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK;
/// Define the epilogue
using Epilogue = typename cutlass::epilogue::threadblock::
DefaultInterleavedEpilogueTensorOp<
ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1,
64 / sizeof_bits<ElementC>::value, InterleavedK>::Epilogue;
/// Define the kernel-level GEMM operator.
using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
};
////////////////////////////////////////////////////////////////////////////////
/// Partial specialization for Turing Integer Tensor Core Interleaved layout
template <
/// Element type for A matrix operand
typename ElementA,
@@ -251,9 +400,7 @@ template <
/// epilogue
bool SplitKSerial,
/// Operation performed by GEMM
typename Operator,
/// Is Beta zero or not
bool IsBetaZero>
typename Operator>
struct DefaultB2bGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
kAlignmentA, ElementB,
layout::RowMajorInterleaved<InterleavedK>, kAlignmentB,
@@ -261,7 +408,7 @@ struct DefaultB2bGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
int32_t, arch::OpClassTensorOp, arch::Sm75,
ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
InstructionShape, EpilogueOutputOp0, EpilogueOutputOp1,
ThreadblockSwizzle, 2, SplitKSerial, Operator, IsBetaZero> {
ThreadblockSwizzle, 2, SplitKSerial, Operator> {
using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;
@@ -280,8 +427,7 @@ struct DefaultB2bGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
using Epilogue = typename cutlass::epilogue::threadblock::
DefaultInterleavedEpilogueTensorOp<
ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1,
64 / sizeof_bits<ElementC>::value, InterleavedK,
IsBetaZero>::Epilogue;
64 / sizeof_bits<ElementC>::value, InterleavedK>::Epilogue;
/// Define the kernel-level GEMM operator.
using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;


@@ -0,0 +1,397 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief
Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
the appropriate threadblock-scoped epilogue.
Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
accommodated by exchanging A and B operands and assuming transposed layouts. Partial
specializations here choose 'device::GemmTransposed' to implement this functionality.
*/
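// Illustration (editor's note, not part of the original header): a column-major
// output D can be produced by a row-major kernel because
//
//   D (column-major, m x n) = A (m x k) * B (k x n)
//
// is equivalent to computing the row-major transpose with the operands exchanged:
//
//   D^T (row-major, n x m) = B^T (n x k) * A^T (k x m)
//
// Hence exchanging A and B and transposing their layouts reduces the
// column-major case to the row-major case handled below.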
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/numeric_types.h"
#include "cutlass/epilogue/threadblock/epilogue.h"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/gemm/kernel/gemm_pipelined.h"
#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
#include "cutlass/transform/threadblock/vector_iterator.h"
#include "cutlass/transform/threadblock/predicated_vector_access_iterator.h"
#include "kernel/b2b_gemm.h"
#include "threadblock/default_b2b_mma.h"
#include "threadblock/default_b2b_mma_smem_accumulator.h"
////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace gemm {
namespace kernel {
////////////////////////////////////////////////////////////////////////////////
/// Partial specialization for Ampere Architecture
template <
/// Element type for A matrix operand
typename ElementA,
/// Layout type for A matrix operand
typename LayoutA,
/// Access granularity of A matrix in units of elements
int kAlignmentA,
/// Element type for B matrix operand
typename ElementB,
/// Layout type for B matrix operand
typename LayoutB,
/// Access granularity of B matrix in units of elements
int kAlignmentB,
/// Element type for C and D matrix operands
typename ElementC,
/// Element type for internal accumulation
typename ElementAccumulator,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape0,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape1,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape0,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape1,
/// Warp-level tile size (concept: GemmShape)
typename InstructionShape,
/// Epilogue output operator
typename EpilogueOutputOp0,
/// Epilogue output operator
typename EpilogueOutputOp1,
/// Threadblock-level swizzling operator
typename ThreadblockSwizzle,
/// Number of stages used in the pipelined mainloop
int Stages,
/// If true, kernel is configured to support serial reduction in the
/// epilogue
bool SplitKSerial,
/// Operation performed by GEMM
typename Operator>
struct DefaultB2bGemm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC,
layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
arch::Sm80, ThreadblockShape0, ThreadblockShape1,
WarpShape0, WarpShape1, InstructionShape,
EpilogueOutputOp0, EpilogueOutputOp1, ThreadblockSwizzle, Stages, SplitKSerial,
Operator, true> {
/// Define the threadblock-scoped matrix multiply-accumulate
using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<
ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
InstructionShape, Stages, Operator, EpilogueOutputOp0, false, true>::ThreadblockB2bMma;
static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK;
/// Define the epilogue
using Epilogue =
typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1,
EpilogueOutputOp1::kCount>::Epilogue;
/// Define the kernel-level GEMM operator.
using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
};
////////////////////////////////////////////////////////////////////////////////
/// Partial specialization for Turing Architecture
template <
/// Element type for A matrix operand
typename ElementA,
/// Layout type for A matrix operand
typename LayoutA,
/// Access granularity of A matrix in units of elements
int kAlignmentA,
/// Element type for B matrix operand
typename ElementB,
/// Layout type for B matrix operand
typename LayoutB,
/// Access granularity of B matrix in units of elements
int kAlignmentB,
/// Element type for C and D matrix operands
typename ElementC,
/// Element type for internal accumulation
typename ElementAccumulator,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape0,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape1,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape0,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape1,
/// Warp-level tile size (concept: GemmShape)
typename InstructionShape,
/// Epilogue output operator
typename EpilogueOutputOp0,
/// Epilogue output operator
typename EpilogueOutputOp1,
/// Threadblock-level swizzling operator
typename ThreadblockSwizzle,
/// If true, kernel is configured to support serial reduction in the epilogue
bool SplitKSerial,
/// Operation performed by GEMM
typename Operator
>
struct DefaultB2bGemm<
ElementA, LayoutA, kAlignmentA,
ElementB, LayoutB, kAlignmentB,
ElementC, layout::RowMajor,
ElementAccumulator,
arch::OpClassTensorOp,
arch::Sm75,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
ThreadblockSwizzle,
2,
SplitKSerial,
Operator,
true
> {
/// Define the threadblock-scoped matrix multiply-accumulate
using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<
ElementA,
LayoutA,
kAlignmentA,
ElementB,
LayoutB,
kAlignmentB,
ElementAccumulator,
layout::RowMajor,
arch::OpClassTensorOp,
arch::Sm75,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
2,
Operator,
EpilogueOutputOp0,
false,
true
>::ThreadblockB2bMma;
static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK;
/// Define the epilogue
using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
ThreadblockShape1,
typename B2bMma::Operator1,
kPartitionsK1,
EpilogueOutputOp1,
EpilogueOutputOp1::kCount
>::Epilogue;
/// Define the kernel-level GEMM operator.
using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
};
/// Partial specialization for Ampere Integer Matrix Multiply Interleaved layout
template <
/// Element type for A matrix operand
typename ElementA,
/// Access granularity of A matrix in units of elements
int kAlignmentA,
/// Element type for B matrix operand
typename ElementB,
/// Access granularity of B matrix in units of elements
int kAlignmentB,
/// Element type for C and D matrix operands
typename ElementC,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape0,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape1,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape0,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape1,
/// Warp-level tile size (concept: GemmShape)
typename InstructionShape,
/// Epilogue output operator
typename EpilogueOutputOp0,
/// Epilogue output operator
typename EpilogueOutputOp1,
/// Threadblock-level swizzling operator
typename ThreadblockSwizzle,
/// Number of stages used in the pipelined mainloop
int Stages,
/// Number of Interleaved k
int InterleavedK,
/// If true, kernel is configured to support serial reduction in the
/// epilogue
bool SplitKSerial,
/// Operation performed by GEMM
typename Operator>
struct DefaultB2bGemm<
ElementA, layout::ColumnMajorInterleaved<InterleavedK>, kAlignmentA,
ElementB, layout::RowMajorInterleaved<InterleavedK>, kAlignmentB,
ElementC, layout::ColumnMajorInterleaved<InterleavedK>, int32_t,
arch::OpClassTensorOp, arch::Sm80,
ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
InstructionShape, EpilogueOutputOp0, EpilogueOutputOp1,
ThreadblockSwizzle, Stages,
SplitKSerial, Operator, true> {
using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;
using ElementAccumulator = int32_t;
/// Define the threadblock-scoped matrix multiply-accumulate
using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<
ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
ElementAccumulator, LayoutC, arch::OpClassTensorOp, arch::Sm80,
ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
InstructionShape, Stages, Operator, EpilogueOutputOp0,
true, true>::ThreadblockB2bMma;
static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK;
/// Define the epilogue
using Epilogue = typename cutlass::epilogue::threadblock::
DefaultInterleavedEpilogueTensorOp<
ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1,
64 / sizeof_bits<ElementC>::value, InterleavedK>::Epilogue;
/// Define the kernel-level GEMM operator.
using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
};
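// (Editor's note) The epilogue's access width above, 64 / sizeof_bits<ElementC>::value,
// evaluates to 64 / 8 = 8 elements per vectorized access when the output element
// type is int8_t.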
////////////////////////////////////////////////////////////////////////////////
/// Partial specialization for Turing Integer Tensor Core Interleaved layout
template <
/// Element type for A matrix operand
typename ElementA,
/// Access granularity of A matrix in units of elements
int kAlignmentA,
/// Element type for B matrix operand
typename ElementB,
/// Access granularity of B matrix in units of elements
int kAlignmentB,
/// Element type for C and D matrix operands
typename ElementC,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape0,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape1,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape0,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape1,
/// Warp-level tile size (concept: GemmShape)
typename InstructionShape,
/// Epilogue output operator
typename EpilogueOutputOp0,
/// Epilogue output operator
typename EpilogueOutputOp1,
/// Threadblock-level swizzling operator
typename ThreadblockSwizzle,
/// Number of Interleaved k
int InterleavedK,
/// If true, kernel is configured to support serial reduction in the
/// epilogue
bool SplitKSerial,
/// Operation performed by GEMM
typename Operator>
struct DefaultB2bGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
kAlignmentA, ElementB,
layout::RowMajorInterleaved<InterleavedK>, kAlignmentB,
ElementC, layout::ColumnMajorInterleaved<InterleavedK>,
int32_t, arch::OpClassTensorOp, arch::Sm75,
ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
InstructionShape, EpilogueOutputOp0, EpilogueOutputOp1,
ThreadblockSwizzle, 2, SplitKSerial, Operator, true> {
using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;
using ElementAccumulator = int32_t;
/// Define the threadblock-scoped matrix multiply-accumulate
using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<
ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
ElementAccumulator, LayoutC, arch::OpClassTensorOp, arch::Sm75,
ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
InstructionShape, 2, Operator, EpilogueOutputOp0, true, true>::ThreadblockB2bMma;
static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK;
/// Define the epilogue for the 2nd Gemm
using Epilogue = typename cutlass::epilogue::threadblock::
DefaultInterleavedEpilogueTensorOp<
ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1,
64 / sizeof_bits<ElementC>::value, InterleavedK>::Epilogue;
/// Define the kernel-level GEMM operator.
using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
};
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
} // namespace kernel
} // namespace gemm
} // namespace cutlass


@@ -0,0 +1,275 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Defines device-side elementwise operations on TensorView. Note, the operations defined
in this header are not specialized for any particular data layout and are therefore not
intended to offer the best possible performance. Rather, they are intended to be generic
reference implementations to support the CUTLASS unit tests.
*/
#pragma once
// Cutlass includes
#include "cutlass/cutlass.h"
#include "cutlass/tensor_view.h"
#include "cutlass/gemm/gemm.h"
///////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace reference {
namespace device {
///////////////////////////////////////////////////////////////////////////////////////////////////
namespace kernel {
template <
typename TensorRefIn, ///< Input TensorRef Type
typename TensorRefOut, ///< Output TensorRef Type
typename ScalarType, ///< alpha Type
typename TensorRefScalar, ///< Scale/Bias TensorRef Type
typename OutputTile,
typename ConvertOp = NumericConverter<typename TensorRefOut::Element, ScalarType>
>
__global__ void TensorScaleBiasGemm(
gemm::GemmCoord problem_size,
TensorRefIn tensor_in, ///< input tensor
TensorRefOut tensor_out, ///< output tensor
ScalarType alpha, ///< alpha
TensorRefScalar tensor_scale, ///< scale tensor
TensorRefScalar tensor_bias ///< bias tensor
) {
ConvertOp convert_op;
MatrixCoord output_coord(
MatrixCoord::Index((threadIdx.x + blockIdx.x * blockDim.x) * OutputTile::kRow),
MatrixCoord::Index((threadIdx.y + blockIdx.y * blockDim.y) * OutputTile::kColumn)
);
// Update the output tensor
for (int j = 0; j < OutputTile::kRow; ++j) {
for (int i = 0; i < OutputTile::kColumn; ++i) {
MatrixCoord coord = output_coord + MatrixCoord(i, j);
if (coord.row() < problem_size.m() && coord.column() < problem_size.n()) {
ScalarType scale = alpha;
if(tensor_scale.good())
scale = tensor_scale.at({0, coord.column()});
ScalarType bias = ScalarType(0);
if(tensor_bias.good())
bias = tensor_bias.at({0, coord.column()});
tensor_out.at(coord) = convert_op(
scale * ScalarType(tensor_in.at(coord)) + bias);
}
}
}
}
template <
typename TensorRefIn, ///< Input TensorRef Type
typename TensorRefOut, ///< Output TensorRef Type
typename ScalarType, ///< alpha Type
typename TensorRefScalar, ///< Scale/Bias TensorRef Type
typename ConvertOp = NumericConverter<typename TensorRefOut::Element, ScalarType>,
int kThreadM = 4, // shape of a thread's tile in the GEMM M dimension
int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension
int kCtaShapeM = 16, // shape of a threadblock in units of threads
int kCtaShapeN = 8 // shape of a threadblock in units of threads
>
__global__ void TensorScaleBiasConv2d(
conv::Conv2dProblemSize problem_size,
TensorRefIn tensor_in, ///< input tensor
TensorRefOut tensor_out, ///< output tensor
ScalarType alpha, ///< alpha
TensorRefScalar tensor_scale, ///< scale tensor
TensorRefScalar tensor_bias ///< bias tensor
) {
ConvertOp convert_op;
int64_t npq_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM;
int k_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN;
int thread_n[kThreadM];
int thread_p[kThreadM];
int thread_q[kThreadM];
// Compute N, P, Q coordinates for each row of a thread's tile
int64_t PQ = int64_t(problem_size.P) * problem_size.Q;
CUTLASS_PRAGMA_UNROLL
for (int m = 0; m < kThreadM; ++m) {
int64_t npq = npq_start + m;
thread_n[m] = int(npq / PQ);
int64_t residual = npq % PQ;
thread_p[m] = int(residual / problem_size.Q);
thread_q[m] = int(residual % problem_size.Q);
}
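// Worked example (editor's note): with P = 4 and Q = 4 (so PQ = 16), a linear
// index npq = 37 decomposes into n = 37 / 16 = 2, residual = 37 % 16 = 5,
// p = 5 / 4 = 1, q = 5 % 4 = 1.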
// Write out the results
CUTLASS_PRAGMA_UNROLL
for (int m = 0; m < kThreadM; ++m) {
if (thread_n[m] < problem_size.N && thread_p[m] < problem_size.P && thread_q[m] < problem_size.Q) {
CUTLASS_PRAGMA_UNROLL
for (int n = 0; n < kThreadN; ++n) {
int thread_k = k_start + n;
if (thread_k < problem_size.K) {
ScalarType scale = alpha;
if(tensor_scale.good())
scale = tensor_scale.at({0, thread_k});
ScalarType bias = ScalarType(0);
if(tensor_bias.good())
bias = tensor_bias.at({0, thread_k});
tensor_out.at({thread_n[m], thread_p[m], thread_q[m], thread_k}) = convert_op(
scale * ScalarType(
tensor_in.at({thread_n[m], thread_p[m], thread_q[m], thread_k})
) + bias);
}
}
}
}
}
}
/// Apply scale and bias on a tensor
template <
typename ElementIn, ///< Input Type
typename ElementOut, ///< Output Type
typename Layout, ///< Layout of input/output tensor
typename ScalarType, ///< alpha Type
typename LayoutScaleBias, ///< Layout of scale and bias
typename ConvertOp = NumericConverter<ElementOut, ScalarType>
>
void TensorScaleBiasGemm(
gemm::GemmCoord problem_size,
TensorRef<ElementIn, Layout> tensor_in, ///< input tensor
TensorRef<ElementOut, Layout> tensor_out, ///< output tensor
ScalarType alpha, ///< alpha
TensorRef<ScalarType, LayoutScaleBias> tensor_scale, ///< scale tensor
TensorRef<ScalarType, LayoutScaleBias> tensor_bias ///< bias tensor
) {
using OutputTile = MatrixShape<4, 4>;
dim3 block(16, 8);
dim3 grid(
(problem_size.m() + block.x * OutputTile::kRow - 1) / (block.x * OutputTile::kRow),
(problem_size.n() + block.y * OutputTile::kColumn - 1) / (block.y * OutputTile::kColumn)
);
kernel::TensorScaleBiasGemm<
TensorRef<ElementIn, Layout>,
TensorRef<ElementOut, Layout>,
ScalarType,
TensorRef<ScalarType, LayoutScaleBias>,
OutputTile,
ConvertOp
><<< grid, block >>> (
problem_size,
tensor_in,
tensor_out,
alpha,
tensor_scale,
tensor_bias
);
}
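// Illustrative call (editor's sketch): the tensor_* names are hypothetical and
// would typically be cutlass::HostTensor objects in a unit test; all template
// arguments are deduced from the TensorRef arguments.
//
//   cutlass::reference::device::TensorScaleBiasGemm(
//       problem_size,               // gemm::GemmCoord {m, n, k}
//       tensor_in.device_ref(),     // input tensor (e.g. first GEMM output)
//       tensor_out.device_ref(),    // output tensor: out = scale * in + bias
//       alpha,                      // used when tensor_scale is a null ref
//       tensor_scale.device_ref(),  // per-column scale vector (1 x n)
//       tensor_bias.device_ref());  // per-column bias vector (1 x n)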
/// Apply scale and bias on a tensor
template <
typename ElementIn, ///< Input Type
typename ElementOut, ///< Output Type
typename Layout, ///< Layout of input/output tensor
typename ScalarType, ///< alpha Type
typename LayoutScaleBias, ///< Layout of scale and bias
typename ConvertOp = NumericConverter<ElementOut, ScalarType>
>
void TensorScaleBiasConv2d(
conv::Conv2dProblemSize problem_size,
TensorRef<ElementIn, Layout> tensor_in, ///< input tensor
TensorRef<ElementOut, Layout> tensor_out, ///< output tensor
ScalarType alpha, ///< alpha
TensorRef<ScalarType, LayoutScaleBias> tensor_scale, ///< scale tensor
TensorRef<ScalarType, LayoutScaleBias> tensor_bias ///< bias tensor
) {
int const kThreadM = 4; // shape of a thread's tile in the GEMM M dimension
int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension
int const kCtaShapeM = 16; // shape of a threadblock in units of threads
int const kCtaShapeN = 8; // shape of a threadblock in units of threads
int64_t npq = int64_t(problem_size.N) * problem_size.P * problem_size.Q;
int64_t blocks_m = (npq + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM);
dim3 block(kCtaShapeM, kCtaShapeN);
dim3 grid(uint32_t(blocks_m), (problem_size.K + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN));
kernel::TensorScaleBiasConv2d<
TensorRef<ElementIn, Layout>,
TensorRef<ElementOut, Layout>,
ScalarType,
TensorRef<ScalarType, LayoutScaleBias>,
ConvertOp,
kThreadM,
kThreadN,
kCtaShapeM,
kCtaShapeN
><<< grid, block >>> (
problem_size,
tensor_in,
tensor_out,
alpha,
tensor_scale,
tensor_bias
);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace device
} // namespace reference
} // namespace cutlass


@@ -0,0 +1,95 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include <string>
#include <vector>
// Run tests on GPUs
int testRun(int arch, std::vector<bool (*)()> & test_funcs, const std::string & test_name) {
bool supported = false;
int arch_major = arch / 10;
int arch_minor = arch - arch / 10 * 10;
if(arch_major >= 8) {
// Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0.
//
// CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples.
if (__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0)) {
supported = true;
}
}
else if(arch_major >= 7) {
// Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
//
// CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
if (__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)) {
supported = true;
}
}
cudaDeviceProp props;
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (error != cudaSuccess) {
std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
return -1;
}
if (!(props.major == arch_major && props.minor == arch_minor)) {
supported = false;
}
if (!supported) {
// Returning zero so this test passes on older Toolkits. Its actions are a no-op.
std::cout << "This example isn't supported on current architecture" << std::endl;
return 0;
}
bool pass = true;
std::cout << "Device: " << props.name << std::endl;
std::cout << "Arch: SM" << arch << std::endl;
std::cout << "Test: " << test_name << std::endl;
for(auto func : test_funcs) {
pass &= func();
}
if(pass)
return 0;
else
return -1;
}
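// Illustrative usage (editor's sketch): run_fused_gemm_f16_sm80 and
// run_fused_gemm_s8_sm80 are hypothetical test functions with signature bool().
//
//   int main() {
//     std::vector<bool (*)()> funcs = {
//       &run_fused_gemm_f16_sm80,
//       &run_fused_gemm_s8_sm80
//     };
//     return testRun(80, funcs, "B2B fused GEMM");
//   }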


@@ -0,0 +1,831 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Template for a multistage threadblock-scoped Implicit GEMM Convolution kernel.
*/
#pragma once
#include "cutlass/aligned_buffer.h"
#include "cutlass/arch/memory.h"
#include "cutlass/array.h"
#include "cutlass/cutlass.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/matrix_shape.h"
#include "cutlass/numeric_types.h"
#include "cutlass/arch/cache_operation.h"
#include "cutlass/gemm/threadblock/mma_base.h"
#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
#include "threadblock/b2b_mma_base.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace conv {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Structure to compute the matrix product targeting CUDA cores and SIMT math
/// instructions.
template <
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape0_,
/// Iterates over tiles of A operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator |
// MaskedTileIterator)
typename IteratorA0_,
/// Iterates over tiles of A operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorA0_,
/// Cache operation for operand A
cutlass::arch::CacheOperation::Kind CacheOpA0,
/// Iterates over tiles of B operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator |
// MaskedTileIterator)
typename IteratorB0_,
/// Iterates over tiles of B operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorB0_,
/// Cache operation for operand B
cutlass::arch::CacheOperation::Kind CacheOpB0,
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape1_,
/// Iterates over the intermediate accumulator tile
// (concept::MmaTensorOpFragmentIterator)
typename FragmentIteratorA1_,
/// Iterates over vectors of scale and bias vector in global memory
// (concept: VectorIterator)
typename IteratorAccumulatorScaleBias_,
/// WarpIterator to load Scale or Bias vector from threadblock fragment
typename FragmentIteratorA1ScaleBias_,
/// Iterates over tiles of B operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator |
// MaskedTileIterator)
typename IteratorB1_,
/// Iterates over tiles of B operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorB1_,
/// Cache operation for operand B
cutlass::arch::CacheOperation::Kind CacheOpB1,
/// Output operator for 1st Gemm(concept: epilogue::thread::LinearCombinationClamp, etc...)
typename OutputOp_,
/// Policy describing tuning details (concept: MmaPolicy)
typename Policy0_,
/// Policy describing tuning details (concept: MmaPolicy)
typename Policy1_,
/// Number of stages,
int Stages,
/// Used for partial specialization
typename Enable = bool>
class B2bImplicitGemmMultistage :
public gemm::threadblock::B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, Stages> {
public:
///< Base class
using Base = gemm::threadblock::B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, Stages>;
///< Size of the Gemm problem - concept: gemm::GemmShape<>
using Shape0 = Shape0_;
///< Iterates over tiles of A operand in global memory
using IteratorA0 = IteratorA0_;
///< Iterates over tiles of B operand in global memory
using IteratorB0 = IteratorB0_;
///< Policy describing tuning details
using Policy0 = Policy0_;
using SmemIteratorA0 = SmemIteratorA0_;
using SmemIteratorB0 = SmemIteratorB0_;
///< Size of the Gemm problem - concept: gemm::GemmShape<>
using Shape1 = Shape1_;
///< Iterates over tiles of A operand in global memory
using FragmentIteratorA1 = FragmentIteratorA1_;
///< Iterates over tiles of the scale and bias vectors in global memory
using IteratorAccumulatorScaleBias = IteratorAccumulatorScaleBias_;
///< WarpIterator to load Scale or Bias vector from threadblock fragment
using FragmentIteratorA1ScaleBias = FragmentIteratorA1ScaleBias_;
///< Iterates over tiles of B operand in global memory
using IteratorB1 = IteratorB1_;
///< Policy describing tuning details
using Policy1 = Policy1_;
using SmemIteratorB1 = SmemIteratorB1_;
///< Epilogue after 1st Gemm
using OutputOp = OutputOp_;
static const bool PerChannelScale = (OutputOp::kScale ==
epilogue::thread::ScaleType::OnlyAlphaPerChannelScaling);
static cutlass::arch::CacheOperation::Kind const kCacheOpA0 = CacheOpA0;
static cutlass::arch::CacheOperation::Kind const kCacheOpB0 = CacheOpB0;
static cutlass::arch::CacheOperation::Kind const kCacheOpB1 = CacheOpB1;
//
// Dependent types
//
using ElementC = typename Policy0::Operator::ElementC;
/// Fragment of accumulator tile
using FragmentC0 = typename Policy0::Operator::FragmentC;
/// Warp-level Mma
using Operator0 = typename Policy0::Operator;
/// Fragment of Scale and Bias loaded from global memory
using FragmentA1ScaleBias = typename IteratorAccumulatorScaleBias::Fragment;
/// Fragment of accumulator tile
using FragmentC1 = typename Policy1::Operator::FragmentC;
/// Warp-level Mma
using Operator1 = typename Policy1::Operator;
/// Internal structure exposed for introspection.
struct Detail {
static_assert(Base::kWarpGemmIterations0 > 1,
"The pipelined structure requires at least two warp-level "
"GEMM operations.");
static_assert(Base::kWarpGemmIterations1 > 1,
"The pipelined structure requires at least two warp-level "
"GEMM operations.");
/// Number of cp.async instructions to load one stage of operand A
static int const AsyncCopyIterationsPerStageA0 =
IteratorA0::ThreadMap::Iterations::kCount;
/// Number of cp.async instructions to load one stage of operand B
static int const AsyncCopyIterationsPerStageB0 =
IteratorB0::ThreadMap::Iterations::kCount;
/// Number of cp.async instructions to load one stage of operand B
static int const AsyncCopyIterationsPerStageB1 =
IteratorB1::ThreadMap::Iterations::kCount;
/// Number of stages
static int const kStages = Stages;
/// Number of cp.async instructions to load on group of operand A
static int const kAccessesPerGroupA0 =
(AsyncCopyIterationsPerStageA0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0;
/// Number of cp.async instructions to load on group of operand B
static int const kAccessesPerGroupB0 =
(AsyncCopyIterationsPerStageB0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0;
/// Number of cp.async instructions to load on group of operand B
static int const kAccessesPerGroupB1 =
(AsyncCopyIterationsPerStageB1 + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1;
};
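// Worked example (editor's note): if AsyncCopyIterationsPerStageA0 = 8 and
// Base::kWarpGemmIterations0 = 4, then kAccessesPerGroupA0 = (8 + 4 - 1) / 4 = 2,
// i.e. two cp.async instructions are issued alongside each warp-level MMA so the
// whole stage is in flight by the time the warp tiles are consumed.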
private:
using WarpLoadedFragmentA0 = typename Operator0::FragmentA;
using WarpLoadedFragmentB0 = typename Operator0::FragmentB;
/// Warp Fragment of operand A1 loaded from the accumulator tile
using WarpLoadedFragmentA1 = typename FragmentIteratorA1::Fragment;
using WarpLoadedFragmentA1ScaleBias =
typename FragmentIteratorA1ScaleBias::Fragment;
using WarpLoadedFragmentB1 = typename Operator1::FragmentB;
using WarpTransformedFragmentA0 = typename Operator0::TransformedFragmentA;
using WarpTransformedFragmentB0 = typename Operator0::TransformedFragmentB;
using WarpTransformedFragmentA1 = typename Operator1::TransformedFragmentA;
using WarpTransformedFragmentB1 = typename Operator1::TransformedFragmentB;
private:
//
// Data members
//
/// Iterator to write threadblock-scoped tile of A operand to shared memory
SmemIteratorA0 smem_iterator_A0_;
/// Iterator to write threadblock-scoped tile of B operand to shared memory
SmemIteratorB0 smem_iterator_B0_;
/// Iterator to write threadblock-scoped tile of B operand to shared memory
SmemIteratorB1 smem_iterator_B1_;
public:
/// Construct from tensor references
CUTLASS_DEVICE
B2bImplicitGemmMultistage(
///< Shared storage needed for internal use by threadblock-scoped GEMM
typename Base::B2bMmaSharedStorage &shared_storage,
///< ID within the threadblock
int thread_idx,
///< ID of warp
int warp_idx,
///< ID of each thread within a warp
int lane_idx
):
Base(shared_storage, thread_idx, warp_idx, lane_idx),
smem_iterator_A0_(shared_storage.shared_storage0.operand_A_ref(), thread_idx),
smem_iterator_B0_(shared_storage.shared_storage0.operand_B_ref(), thread_idx),
smem_iterator_B1_(shared_storage.shared_storage1.operand_B_ref(), thread_idx)
{
// Compute warp location within threadblock tile by mapping the warp_id to
// three coordinates:
// _m: the warp's position within the threadblock along the M dimension
// _n: the warp's position within the threadblock along the N dimension
// _k: the warp's position within the threadblock along the K dimension
int warp_idx_mn = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN);
int warp_idx_k = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN);
int warp_idx_m = warp_idx_mn % Base::WarpCount0::kM;
int warp_idx_n = warp_idx_mn / Base::WarpCount0::kM;
// Add per-warp offsets in units of warp-level tiles
this->warp_tile_iterator_A0_.add_tile_offset(
{warp_idx_m, Base::kWarpGemmIterations0 * warp_idx_k});
this->warp_tile_iterator_B0_.add_tile_offset(
{Base::kWarpGemmIterations0 * warp_idx_k, warp_idx_n});
this->warp_tile_iterator_B1_.add_tile_offset(
{Base::kWarpGemmIterations1 * warp_idx_k, warp_idx_n});
}
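// Worked example (editor's note) for the mapping above: with WarpCount0 = {2, 2}
// and warp_idx = 5, warp_idx_mn = 5 % 4 = 1, warp_idx_k = 5 / 4 = 1,
// warp_idx_m = 1 % 2 = 1, and warp_idx_n = 1 / 2 = 0.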
CUTLASS_DEVICE
void copy_tiles_and_advance_0(
IteratorA0 &iterator_A0, IteratorB0 &iterator_B0,
int group_start_A0 = 0, int group_start_B0 = 0) {
iterator_A0.set_iteration_index(group_start_A0);
this->smem_iterator_A0_.set_iteration_index(group_start_A0);
// Async Copy for operand A
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::kAccessesPerGroupA0; ++j) {
if (group_start_A0 + j < Detail::AsyncCopyIterationsPerStageA0) {
typename IteratorA0::AccessType *dst_ptr =
reinterpret_cast<typename IteratorA0::AccessType *>(
this->smem_iterator_A0_.get());
int const kSrcBytes = sizeof_bits<typename IteratorA0::Element>::value *
IteratorA0::ThreadMap::kElementsPerAccess / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA0>(
dst_ptr, iterator_A0.get(), iterator_A0.valid());
++iterator_A0;
++this->smem_iterator_A0_;
}
}
iterator_B0.set_iteration_index(group_start_B0);
this->smem_iterator_B0_.set_iteration_index(group_start_B0);
// Async Copy for operand B
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::kAccessesPerGroupB0; ++j) {
if (group_start_B0 + j < Detail::AsyncCopyIterationsPerStageB0) {
typename IteratorB0::AccessType *dst_ptr =
reinterpret_cast<typename IteratorB0::AccessType *>(
this->smem_iterator_B0_.get());
int const kSrcBytes = sizeof_bits<typename IteratorB0::Element>::value *
IteratorB0::ThreadMap::kElementsPerAccess / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB0>(
dst_ptr, iterator_B0.get(), iterator_B0.valid());
++iterator_B0;
++this->smem_iterator_B0_;
}
}
}
CUTLASS_DEVICE
void copy_tiles_and_advance_1(
IteratorB1 &iterator_B1,
int group_start_B1 = 0) {
iterator_B1.set_iteration_index(group_start_B1);
this->smem_iterator_B1_.set_iteration_index(group_start_B1);
// Async Copy for operand B
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::kAccessesPerGroupB1; ++j) {
if (group_start_B1 + j < Detail::AsyncCopyIterationsPerStageB1) {
typename IteratorB1::AccessType *dst_ptr =
reinterpret_cast<typename IteratorB1::AccessType *>(
this->smem_iterator_B1_.get());
int const kSrcBytes = sizeof_bits<typename IteratorB1::Element>::value *
IteratorB1::ThreadMap::kElementsPerAccess / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
dst_ptr, iterator_B1.get(), iterator_B1.valid());
++iterator_B1;
++this->smem_iterator_B1_;
}
}
}
/// Perform a threadblock-scoped matrix multiply-accumulate
CUTLASS_DEVICE
void operator()(
///< problem size of GEMM
int gemm_k_iterations_0,
///< destination accumulator tile
FragmentC1 &accum,
///< iterator over A0 operand in global memory
IteratorA0 iterator_A0,
///< iterator over B0 operand in global memory
IteratorB0 iterator_B0,
///< iterator over A1 operand scale vector in global memory
IteratorAccumulatorScaleBias iterator_A1_scale,
///< iterator over A1 operand bias vector in global memory
IteratorAccumulatorScaleBias iterator_A1_bias,
///< iterator over B1 operand in global memory
IteratorB1 iterator_B1,
///< initial value of accumulator
FragmentC0 const &src_accum,
///< epilogue operation after 1st Gemm
OutputOp output_op_0,
///< Imaginary strides used for planar-complex only - ignored here
int64_t imag_stride_A = 0,
int64_t imag_stride_B = 0) {
//
// Prologue
//
// Issue several complete stages
CUTLASS_PRAGMA_UNROLL
for (int stage = 0; stage < Base::kStages - 1;
++stage, --gemm_k_iterations_0) {
iterator_A0.set_iteration_index(0);
this->smem_iterator_A0_.set_iteration_index(0);
// Async Copy for operand A
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA0; ++j) {
typename IteratorA0::AccessType *dst_ptr =
reinterpret_cast<typename IteratorA0::AccessType *>(
this->smem_iterator_A0_.get());
int const kSrcBytes =
sizeof_bits<typename IteratorA0::Element>::value *
IteratorA0::ThreadMap::kElementsPerAccess / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA0>(
dst_ptr, iterator_A0.get(), iterator_A0.valid());
++iterator_A0;
++this->smem_iterator_A0_;
}
iterator_B0.set_iteration_index(0);
this->smem_iterator_B0_.set_iteration_index(0);
// Async Copy for operand B
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB0; ++j) {
typename IteratorB0::AccessType *dst_ptr =
reinterpret_cast<typename IteratorB0::AccessType *>(
this->smem_iterator_B0_.get());
int const kSrcBytes =
sizeof_bits<typename IteratorB0::Element>::value *
IteratorB0::ThreadMap::kElementsPerAccess / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB0>(
dst_ptr, iterator_B0.get(), iterator_B0.valid());
++iterator_B0;
++this->smem_iterator_B0_;
}
// Move to the next stage
iterator_A0.advance();
iterator_B0.advance();
this->smem_iterator_A0_.add_tile_offset({0, 1});
this->smem_iterator_B0_.add_tile_offset({1, 0});
// Inserts a fence to group cp.async instructions into stages.
cutlass::arch::cp_async_fence();
}
// Perform accumulation in the 'd' output operand
FragmentC0 accum0 = src_accum;
// Waits until kStages-2 stages have committed.
cutlass::arch::cp_async_wait<Base::kStages - 2>();
__syncthreads();
// Pair of fragments used to overlap shared memory loads and math
// instructions
WarpLoadedFragmentA0 warp_loaded_frag_A0[2];
WarpLoadedFragmentB0 warp_loaded_frag_B0[2];
WarpTransformedFragmentA0 warp_transformed_frag_A0[2];
WarpTransformedFragmentB0 warp_transformed_frag_B0[2];
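// (Editor's note) The two-element arrays implement double buffering: fragment
// index (warp_mma_k % 2) feeds the current warp-level MMA while index
// ((warp_mma_k + 1) % 2) is loaded from shared memory for the next iteration.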
Operator0 warp_mma0;
this->warp_tile_iterator_A0_.set_kgroup_index(0);
this->warp_tile_iterator_B0_.set_kgroup_index(0);
this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[0]);
this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[0]);
++this->warp_tile_iterator_A0_;
++this->warp_tile_iterator_B0_;
// Start issuing the first group of the next stage outside of the mainloop
copy_tiles_and_advance_0(iterator_A0, iterator_B0);
int smem_write_stage_idx = Base::kStages - 1;
int smem_read_stage_idx = 0;
warp_mma0.transform(warp_transformed_frag_A0[0], warp_transformed_frag_B0[0],
warp_loaded_frag_A0[0], warp_loaded_frag_B0[0]);
//
// Mainloop
//
CUTLASS_GEMM_LOOP
for (; gemm_k_iterations_0 > (-Base::kStages + 1);) {
//
// Loop over GEMM K dimension
//
// Computes a warp-level GEMM on data held in shared memory
// Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
CUTLASS_PRAGMA_UNROLL
for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0;
++warp_mma_k) {
// Load warp-level tiles from shared memory, wrapping back to the first
// k-group offset when this is the last group of the stage.
this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[(warp_mma_k + 1) % 2]);
this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
++this->warp_tile_iterator_A0_;
++this->warp_tile_iterator_B0_;
if (warp_mma_k > 0)
warp_mma0.transform(warp_transformed_frag_A0[warp_mma_k % 2],
warp_transformed_frag_B0[warp_mma_k % 2],
warp_loaded_frag_A0[warp_mma_k % 2],
warp_loaded_frag_B0[warp_mma_k % 2]);
// Issue global->shared copies for the next stage
int group_start_iteration_A0, group_start_iteration_B0;
if (warp_mma_k + 1 == Base::kWarpGemmIterations0) {
group_start_iteration_A0 = 0;
group_start_iteration_B0 = 0;
} else {
group_start_iteration_A0 =
(warp_mma_k + 1) * Detail::kAccessesPerGroupA0;
group_start_iteration_B0 =
(warp_mma_k + 1) * Detail::kAccessesPerGroupB0;
}
copy_tiles_and_advance_0(iterator_A0, iterator_B0, group_start_iteration_A0,
group_start_iteration_B0);
warp_mma0(
accum0,
warp_transformed_frag_A0[warp_mma_k % 2],
warp_transformed_frag_B0[warp_mma_k % 2],
accum0
);
if (warp_mma_k + 1 == Base::kWarpGemmIterations0)
warp_mma0.transform(warp_transformed_frag_A0[(warp_mma_k + 1) % 2],
warp_transformed_frag_B0[(warp_mma_k + 1) % 2],
warp_loaded_frag_A0[(warp_mma_k + 1) % 2],
warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
if (warp_mma_k + 2 == Base::kWarpGemmIterations0) {
// Inserts a fence to group cp.async instructions into stages.
cutlass::arch::cp_async_fence();
// Waits until kStages-2 stages of cp.async have committed
arch::cp_async_wait<Base::kStages - 2>();
__syncthreads();
// Move to the next stage
iterator_A0.advance();
iterator_B0.advance();
this->smem_iterator_A0_.add_tile_offset({0, 1});
this->smem_iterator_B0_.add_tile_offset({1, 0});
// Add negative offsets to return iterators to the 'start' of the
// circular buffer in shared memory
if (smem_write_stage_idx == (Base::kStages - 1)) {
this->smem_iterator_A0_.add_tile_offset({0, -Base::kStages});
this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0});
smem_write_stage_idx = 0;
} else {
++smem_write_stage_idx;
}
if (smem_read_stage_idx == (Base::kStages - 1)) {
this->warp_tile_iterator_A0_.add_tile_offset(
{0, -Base::kStages * Policy0::kPartitionsK *
Base::kWarpGemmIterations0});
this->warp_tile_iterator_B0_.add_tile_offset(
{-Base::kStages * Policy0::kPartitionsK *
Base::kWarpGemmIterations0,
0});
smem_read_stage_idx = 0;
} else {
++smem_read_stage_idx;
}
--gemm_k_iterations_0;
}
}
}
// Insert fence and wait for all outstanding cp.async operations to commit.
cutlass::arch::cp_async_fence();
cutlass::arch::cp_async_wait<0>();
__syncthreads();
// 2nd Implicit Gemm
/// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile
FragmentIteratorA1 warp_tile_iterator_A1_(accum0);
FragmentA1ScaleBias tb_frag_A1_scale;
FragmentA1ScaleBias tb_frag_A1_bias;
FragmentIteratorA1ScaleBias warp_tile_iterator_A1_scale_(tb_frag_A1_scale);
FragmentIteratorA1ScaleBias warp_tile_iterator_A1_bias_(tb_frag_A1_bias);
if(PerChannelScale) {
tb_frag_A1_scale.clear();
iterator_A1_scale.load(tb_frag_A1_scale);
++iterator_A1_scale;
}
tb_frag_A1_bias.clear();
iterator_A1_bias.load(tb_frag_A1_bias);
++iterator_A1_bias;
//
// Prologue
//
int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1;
// Issue several complete stages
CUTLASS_PRAGMA_UNROLL
for (int stage = 0; stage < Base::kStages - 1;
++stage, --gemm_k_iterations_1) {
iterator_B1.set_iteration_index(0);
this->smem_iterator_B1_.set_iteration_index(0);
// Async Copy for operand B
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB1; ++j) {
typename IteratorB1::AccessType *dst_ptr =
reinterpret_cast<typename IteratorB1::AccessType *>(
this->smem_iterator_B1_.get());
int const kSrcBytes =
sizeof_bits<typename IteratorB1::Element>::value *
IteratorB1::ThreadMap::kElementsPerAccess / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
dst_ptr, iterator_B1.get(), iterator_B1.valid());
++iterator_B1;
++this->smem_iterator_B1_;
}
// Move to the next stage
iterator_B1.advance();
this->smem_iterator_B1_.add_tile_offset({1, 0});
// Inserts a fence to group cp.async instructions into stages.
cutlass::arch::cp_async_fence();
}
// Waits until kStages-2 stages have committed.
cutlass::arch::cp_async_wait<Base::kStages - 2>();
__syncthreads();
// Pair of fragments used to overlap shared memory loads and math
// instructions
WarpLoadedFragmentA1 warp_loaded_frag_A1[2];
WarpLoadedFragmentA1ScaleBias warp_loaded_frag_A1_scale[2];
WarpLoadedFragmentA1ScaleBias warp_loaded_frag_A1_bias[2];
WarpLoadedFragmentB1 warp_loaded_frag_B1[2];
WarpTransformedFragmentA1 warp_transformed_frag_A1[2];
WarpTransformedFragmentB1 warp_transformed_frag_B1[2];
Operator1 warp_mma1;
if(PerChannelScale) {
warp_tile_iterator_A1_scale_.load(warp_loaded_frag_A1_scale[0]);
++warp_tile_iterator_A1_scale_;
}
warp_tile_iterator_A1_bias_.load(warp_loaded_frag_A1_bias[0]);
++warp_tile_iterator_A1_bias_;
warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0],
warp_loaded_frag_A1_scale[0],
warp_loaded_frag_A1_bias[0],
output_op_0);
++warp_tile_iterator_A1_;
this->warp_tile_iterator_B1_.set_kgroup_index(0);
this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[0]);
++this->warp_tile_iterator_B1_;
// Start issuing the first group of the next stage outside of the mainloop
copy_tiles_and_advance_1(iterator_B1);
smem_write_stage_idx = Base::kStages - 1;
smem_read_stage_idx = 0;
warp_mma1.transform(warp_transformed_frag_A1[0], warp_transformed_frag_B1[0],
warp_loaded_frag_A1[0], warp_loaded_frag_B1[0]);
//
// Mainloop
//
CUTLASS_PRAGMA_UNROLL
for (gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1 - (Base::kStages - 1);
gemm_k_iterations_1 > (-Base::kStages + 1); gemm_k_iterations_1--) {
//
// Loop over GEMM K dimension
//
// Computes a warp-level GEMM on data held in shared memory
// Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
CUTLASS_PRAGMA_UNROLL
for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1;
++warp_mma_k) {
// Load threadblock-level scale/bias vector from global memory
if (warp_mma_k + 1 == Base::kWarpGemmIterations1) {
if(PerChannelScale) {
tb_frag_A1_scale.clear();
iterator_A1_scale.load(tb_frag_A1_scale);
++iterator_A1_scale;
}
tb_frag_A1_bias.clear();
iterator_A1_bias.load(tb_frag_A1_bias);
++iterator_A1_bias;
}
// Load warp-level scale bias fragment from threadblock scale/bias vector
if(PerChannelScale) {
warp_tile_iterator_A1_scale_.load(warp_loaded_frag_A1_scale[(warp_mma_k + 1) % 2]);
++warp_tile_iterator_A1_scale_;
}
warp_tile_iterator_A1_bias_.load(warp_loaded_frag_A1_bias[(warp_mma_k + 1) % 2]);
++warp_tile_iterator_A1_bias_;
// Load warp-level tile from accumulator fragment
warp_tile_iterator_A1_.load(warp_loaded_frag_A1[(warp_mma_k + 1) % 2],
warp_loaded_frag_A1_scale[(warp_mma_k + 1) % 2],
warp_loaded_frag_A1_bias[(warp_mma_k + 1) % 2],
output_op_0);
++warp_tile_iterator_A1_;
// Load warp-level tiles from shared memory, wrapping back to the first
// k-group offset when this is the last group of the stage.
this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
++this->warp_tile_iterator_B1_;
if (warp_mma_k > 0)
warp_mma1.transform(warp_transformed_frag_A1[warp_mma_k % 2],
warp_transformed_frag_B1[warp_mma_k % 2],
warp_loaded_frag_A1[warp_mma_k % 2],
warp_loaded_frag_B1[warp_mma_k % 2]);
// Issue global->shared copies for the next stage
int group_start_iteration_B1;
if (warp_mma_k + 1 == Base::kWarpGemmIterations1) {
group_start_iteration_B1 = 0;
} else {
group_start_iteration_B1 =
(warp_mma_k + 1) * Detail::kAccessesPerGroupB1;
}
copy_tiles_and_advance_1(iterator_B1,
group_start_iteration_B1);
warp_mma1(
accum,
warp_transformed_frag_A1[warp_mma_k % 2],
warp_transformed_frag_B1[warp_mma_k % 2],
accum
);
if (warp_mma_k + 1 == Base::kWarpGemmIterations1)
warp_mma1.transform(warp_transformed_frag_A1[(warp_mma_k + 1) % 2],
warp_transformed_frag_B1[(warp_mma_k + 1) % 2],
warp_loaded_frag_A1[(warp_mma_k + 1) % 2],
warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
if (warp_mma_k + 2 == Base::kWarpGemmIterations1) {
// Inserts a fence to group cp.async instructions into stages.
cutlass::arch::cp_async_fence();
// Waits until kStages-2 stages of cp.async have committed
arch::cp_async_wait<Base::kStages - 2>();
__syncthreads();
// Move to the next stage
iterator_B1.advance();
this->smem_iterator_B1_.add_tile_offset({1, 0});
// Add negative offsets to return iterators to the 'start' of the
// circular buffer in shared memory
if (smem_write_stage_idx == (Base::kStages - 1)) {
this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
smem_write_stage_idx = 0;
} else {
++smem_write_stage_idx;
}
if (smem_read_stage_idx == (Base::kStages - 1)) {
this->warp_tile_iterator_B1_.add_tile_offset(
{-Base::kStages * Policy1::kPartitionsK *
Base::kWarpGemmIterations1,
0});
smem_read_stage_idx = 0;
} else {
++smem_read_stage_idx;
}
}
}
}
// Insert fence and wait for all outstanding cp.async operations to commit.
cutlass::arch::cp_async_fence();
cutlass::arch::cp_async_wait<0>();
__syncthreads();
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace gemm
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////


@@ -0,0 +1,816 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Template for a multistage threadblock-scoped Implicit GEMM Convolution kernel.
*/
#pragma once
#include "cutlass/aligned_buffer.h"
#include "cutlass/arch/memory.h"
#include "cutlass/array.h"
#include "cutlass/cutlass.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/matrix_shape.h"
#include "cutlass/numeric_types.h"
#include "cutlass/arch/cache_operation.h"
#include "cutlass/gemm/threadblock/mma_base.h"
#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
#include "threadblock/b2b_mma_base_smem_accumulator.h"
#include "cutlass/epilogue/threadblock/epilogue_smem_accumulator.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace conv {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Structure to compute the matrix product targeting CUDA cores and SIMT math
/// instructions.
template <
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape0_,
/// Iterates over tiles of A operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator |
// MaskedTileIterator)
typename IteratorA0_,
/// Iterates over tiles of A operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorA0_,
/// Cache operation for operand A
cutlass::arch::CacheOperation::Kind CacheOpA0,
/// Iterates over tiles of B operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator |
// MaskedTileIterator)
typename IteratorB0_,
/// Iterates over tiles of B operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorB0_,
/// Cache operation for operand B
cutlass::arch::CacheOperation::Kind CacheOpB0,
/// Iterates over vectors of scale and bias vector in global memory
// (concept: VectorIterator)
typename IteratorAccumulatorScaleBias_,
/// Iterates over accumulator tile
typename FragmentIteratorAccumulator_,
/// Iterates over accumulator tile in shared memory
typename SmemIteratorD0_,
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape1_,
/// Iterates over the intermediate accumulator tile
// (concept::MmaTensorOpFragmentIterator)
typename WarpIteratorA1_,
/// Iterates over tiles of B operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator |
// MaskedTileIterator)
typename IteratorB1_,
/// Iterates over tiles of B operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorB1_,
/// Cache operation for operand B
cutlass::arch::CacheOperation::Kind CacheOpB1,
/// Output operator for 1st Gemm (concept: epilogue::thread::LinearCombinationClamp, etc...)
typename OutputOp_,
/// Policy describing tuning details (concept: MmaPolicy)
typename Policy0_,
/// Policy describing tuning details (concept: MmaPolicy)
typename Policy1_,
/// Number of stages,
int Stages,
/// Used for partial specialization
typename Enable = bool>
class B2bImplicitGemmMultistageSmemAccumulator :
public gemm::threadblock::B2bMmaBaseSmemAccumulator<Shape0_, Shape1_, Policy0_, Policy1_, SmemIteratorD0_, Stages> {
public:
///< Base class
using Base = gemm::threadblock::B2bMmaBaseSmemAccumulator<Shape0_, Shape1_, Policy0_, Policy1_, SmemIteratorD0_, Stages>;
///< Size of the Gemm problem - concept: gemm::GemmShape<>
using Shape0 = Shape0_;
///< Iterates over tiles of A operand in global memory
using IteratorA0 = IteratorA0_;
///< Iterates over tiles of B operand in global memory
using IteratorB0 = IteratorB0_;
///< Iterates over tiles of the scale and bias vectors in global memory
using IteratorAccumulatorScaleBias = IteratorAccumulatorScaleBias_;
///< Policy describing tuning details
using Policy0 = Policy0_;
using SmemIteratorA0 = SmemIteratorA0_;
using SmemIteratorB0 = SmemIteratorB0_;
using SmemIteratorD0 = SmemIteratorD0_; ///< Iterates over accumulator tile in shared memory
using FragmentIteratorAccumulator = FragmentIteratorAccumulator_; ///< Iterates over accumulator tile
///< Size of the Gemm problem - concept: gemm::GemmShape<>
using Shape1 = Shape1_;
///< Iterates over tiles of B operand in global memory
using IteratorB1 = IteratorB1_;
///< Policy describing tuning details
using Policy1 = Policy1_;
using SmemIteratorB1 = SmemIteratorB1_;
using WarpIteratorA1 = WarpIteratorA1_; ///< Iterates over the intermediate accumulator tile in shared memory
///< Epilogue after 1st Gemm
using OutputOp = OutputOp_;
static const bool PerChannelScale = (OutputOp::kScale ==
epilogue::thread::ScaleType::OnlyAlphaPerChannelScaling);
static cutlass::arch::CacheOperation::Kind const kCacheOpA0 = CacheOpA0;
static cutlass::arch::CacheOperation::Kind const kCacheOpB0 = CacheOpB0;
static cutlass::arch::CacheOperation::Kind const kCacheOpB1 = CacheOpB1;
//
// Dependent types
//
using ElementC = typename Policy0::Operator::ElementC;
/// Fragment of accumulator tile
using FragmentC0 = typename Policy0::Operator::FragmentC;
/// Warp-level Mma
using Operator0 = typename Policy0::Operator;
/// Fragment of Scale and Bias loaded from global memory
using FragmentA1ScaleBias = typename IteratorAccumulatorScaleBias::Fragment;
/// Fragment of accumulator tile
using FragmentC1 = typename Policy1::Operator::FragmentC;
/// Warp-level Mma
using Operator1 = typename Policy1::Operator;
/// Epilogue in shared memory
using Epilogue0 = epilogue::threadblock::EpilogueSmemAccumulator<
SmemIteratorD0, ///< SmemTileIterator
FragmentIteratorAccumulator, ///< AccumulatorFragmentIterator
IteratorAccumulatorScaleBias, ///< ScaleBiasIterator
OutputOp>; ///< Output operator
/// Internal structure exposed for introspection.
struct Detail {
static_assert(Base::kWarpGemmIterations0 > 1,
"The pipelined structure requires at least two warp-level "
"GEMM operations.");
static_assert(Base::kWarpGemmIterations1 > 1,
"The pipelined structure requires at least two warp-level "
"GEMM operations.");
/// Number of cp.async instructions to load one stage of operand A
static int const AsyncCopyIterationsPerStageA0 =
IteratorA0::ThreadMap::Iterations::kCount;
/// Number of cp.async instructions to load one stage of operand B
static int const AsyncCopyIterationsPerStageB0 =
IteratorB0::ThreadMap::Iterations::kCount;
/// Number of cp.async instructions to load one stage of operand B
static int const AsyncCopyIterationsPerStageB1 =
IteratorB1::ThreadMap::Iterations::kCount;
/// Number of stages
static int const kStages = Stages;
/// Number of cp.async instructions to load one group of operand A0
static int const kAccessesPerGroupA0 =
(AsyncCopyIterationsPerStageA0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0;
/// Number of cp.async instructions to load one group of operand B0
static int const kAccessesPerGroupB0 =
(AsyncCopyIterationsPerStageB0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0;
/// Number of cp.async instructions to load one group of operand B1
static int const kAccessesPerGroupB1 =
(AsyncCopyIterationsPerStageB1 + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1;
};
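// The kAccessesPerGroup* values above are ceiling divisions: each stage's
// cp.async copies are spread as evenly as possible across the warp-level MMA
// iterations. For example, assuming AsyncCopyIterationsPerStageB1 == 8 and
// kWarpGemmIterations1 == 3 (illustrative values only), kAccessesPerGroupB1 is
// (8 + 3 - 1) / 3 == 3, so at most three copies are issued per warp_mma_k step.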
private:
using WarpLoadedFragmentA0 = typename Operator0::FragmentA;
using WarpLoadedFragmentB0 = typename Operator0::FragmentB;
using WarpLoadedFragmentA1 = typename Operator1::FragmentA;
using WarpLoadedFragmentB1 = typename Operator1::FragmentB;
using WarpTransformedFragmentA0 = typename Operator0::TransformedFragmentA;
using WarpTransformedFragmentB0 = typename Operator0::TransformedFragmentB;
using WarpTransformedFragmentA1 = typename Operator1::TransformedFragmentA;
using WarpTransformedFragmentB1 = typename Operator1::TransformedFragmentB;
private:
//
// Data members
//
/// Iterator to write threadblock-scoped tile of A operand to shared memory
SmemIteratorA0 smem_iterator_A0_;
/// Iterator to write threadblock-scoped tile of B operand to shared memory
SmemIteratorB0 smem_iterator_B0_;
/// Shared Memory Iterator to store accumulator tile
SmemIteratorD0 smem_iterator_D0_;
/// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile
WarpIteratorA1 warp_tile_iterator_A1_;
/// Iterator to write threadblock-scoped tile of B operand to shared memory
SmemIteratorB1 smem_iterator_B1_;
public:
/// Construct from tensor references
CUTLASS_DEVICE
B2bImplicitGemmMultistageSmemAccumulator(
///< Shared storage needed for internal use by threadblock-scoped GEMM
typename Base::B2bMmaSharedStorage &shared_storage,
///< ID within the threadblock
int thread_idx,
///< ID of warp
int warp_idx,
///< ID of each thread within a warp
int lane_idx
):
Base(shared_storage, thread_idx, warp_idx, lane_idx),
smem_iterator_A0_(shared_storage.b2b_mma_shared_storage.shared_storage0.operand_A_ref(), thread_idx),
smem_iterator_B0_(shared_storage.b2b_mma_shared_storage.shared_storage0.operand_B_ref(), thread_idx),
smem_iterator_D0_(shared_storage.accumulator_shared_storage0.accum_ref(), lane_idx),
warp_tile_iterator_A1_(shared_storage.accumulator_shared_storage0.accum_ref(), lane_idx),
smem_iterator_B1_(shared_storage.b2b_mma_shared_storage.shared_storage1.operand_B_ref(), thread_idx)
{
// Compute warp location within threadblock tile by mapping the warp_id to
// three coordinates:
// _m: the warp's position within the threadblock along the M dimension
// _n: the warp's position within the threadblock along the N dimension
// _k: the warp's position within the threadblock along the K dimension
int warp_idx_mn_0 = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN);
int warp_idx_k_0 = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN);
int warp_idx_m_0 = warp_idx_mn_0 % Base::WarpCount0::kM;
int warp_idx_n_0 = warp_idx_mn_0 / Base::WarpCount0::kM;
int warp_idx_mn_1 = warp_idx % (Base::WarpCount1::kM * Base::WarpCount1::kN);
int warp_idx_k_1 = warp_idx / (Base::WarpCount1::kM * Base::WarpCount1::kN);
int warp_idx_m_1 = warp_idx_mn_1 % Base::WarpCount1::kM;
int warp_idx_n_1 = warp_idx_mn_1 / Base::WarpCount1::kM;
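// Worked example (illustrative values only): assuming Base::WarpCount0 is
// {kM = 2, kN = 2}, warp_idx == 3 gives warp_idx_mn_0 = 3 % 4 = 3,
// warp_idx_k_0 = 3 / 4 = 0, warp_idx_m_0 = 3 % 2 = 1 and warp_idx_n_0 = 3 / 2 = 1,
// i.e. the warp owning the lower-right warp tile of the first GEMM's
// threadblock tile.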
// Add per-warp offsets in units of warp-level tiles
this->warp_tile_iterator_A0_.add_tile_offset(
{warp_idx_m_0, Base::kWarpGemmIterations0 * warp_idx_k_0});
this->warp_tile_iterator_B0_.add_tile_offset(
{Base::kWarpGemmIterations0 * warp_idx_k_0, warp_idx_n_0});
warp_tile_iterator_A1_.add_tile_offset(
{warp_idx_m_1, Base::kWarpGemmIterations1 * warp_idx_k_1});
this->warp_tile_iterator_B1_.add_tile_offset(
{Base::kWarpGemmIterations1 * warp_idx_k_1, warp_idx_n_1});
// Add smem accumulator iterator warp offset
smem_iterator_D0_.add_tile_offset({ warp_idx_m_0 * SmemIteratorD0::TileIterations::kRow,
warp_idx_n_0 * SmemIteratorD0::TileIterations::kColumn});
}
CUTLASS_DEVICE
void copy_tiles_and_advance_0(
IteratorA0 &iterator_A0, IteratorB0 &iterator_B0,
int group_start_A0 = 0, int group_start_B0 = 0) {
iterator_A0.set_iteration_index(group_start_A0);
this->smem_iterator_A0_.set_iteration_index(group_start_A0);
// Async Copy for operand A
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::kAccessesPerGroupA0; ++j) {
if (group_start_A0 + j < Detail::AsyncCopyIterationsPerStageA0) {
typename IteratorA0::AccessType *dst_ptr =
reinterpret_cast<typename IteratorA0::AccessType *>(
this->smem_iterator_A0_.get());
int const kSrcBytes = sizeof_bits<typename IteratorA0::Element>::value *
IteratorA0::ThreadMap::kElementsPerAccess / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA0>(
dst_ptr, iterator_A0.get(), iterator_A0.valid());
++iterator_A0;
++this->smem_iterator_A0_;
}
}
iterator_B0.set_iteration_index(group_start_B0);
this->smem_iterator_B0_.set_iteration_index(group_start_B0);
// Async Copy for operand B
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::kAccessesPerGroupB0; ++j) {
if (group_start_B0 + j < Detail::AsyncCopyIterationsPerStageB0) {
typename IteratorB0::AccessType *dst_ptr =
reinterpret_cast<typename IteratorB0::AccessType *>(
this->smem_iterator_B0_.get());
int const kSrcBytes = sizeof_bits<typename IteratorB0::Element>::value *
IteratorB0::ThreadMap::kElementsPerAccess / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB0>(
dst_ptr, iterator_B0.get(), iterator_B0.valid());
++iterator_B0;
++this->smem_iterator_B0_;
}
}
}
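// Each cp_async_zfill above copies kSrcBytes bytes and zero-fills the
// destination when the source access is not valid() (out of bounds), avoiding
// divergent branches in the copy loop. For example, assuming 16-bit elements
// with kElementsPerAccess == 8 (illustrative values only),
// kSrcBytes = 16 * 8 / 8 = 16 bytes, the widest transfer cp.async supports.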
CUTLASS_DEVICE
void copy_tiles_and_advance_1(
IteratorB1 &iterator_B1,
int group_start_B1 = 0) {
iterator_B1.set_iteration_index(group_start_B1);
this->smem_iterator_B1_.set_iteration_index(group_start_B1);
// Async Copy for operand B
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::kAccessesPerGroupB1; ++j) {
if (group_start_B1 + j < Detail::AsyncCopyIterationsPerStageB1) {
typename IteratorB1::AccessType *dst_ptr =
reinterpret_cast<typename IteratorB1::AccessType *>(
this->smem_iterator_B1_.get());
int const kSrcBytes = sizeof_bits<typename IteratorB1::Element>::value *
IteratorB1::ThreadMap::kElementsPerAccess / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
dst_ptr, iterator_B1.get(), iterator_B1.valid());
++iterator_B1;
++this->smem_iterator_B1_;
}
}
}
/// Perform a threadblock-scoped matrix multiply-accumulate
CUTLASS_DEVICE
void operator()(
///< problem size of GEMM
int gemm_k_iterations_0,
///< destination accumulator tile
FragmentC1 &accum,
///< iterator over A0 operand in global memory
IteratorA0 iterator_A0,
///< iterator over B0 operand in global memory
IteratorB0 iterator_B0,
///< iterator over A1 operand scale vector in global memory
IteratorAccumulatorScaleBias iterator_accum0_scale,
///< iterator over A1 operand bias vector in global memory
IteratorAccumulatorScaleBias iterator_accum0_bias,
///< iterator over B1 operand in global memory
IteratorB1 iterator_B1,
///< initial value of accumulator
FragmentC0 const &src_accum,
///< epilogue operation after 1st Gemm
OutputOp output_op_0,
///< Imaginary strides used for planar-complex only - ignored here
int64_t imag_stride_A = 0,
int64_t imag_stride_B = 0) {
//
// Prologue
//
// Issue several complete stages
CUTLASS_PRAGMA_UNROLL
for (int stage = 0; stage < Base::kStages - 1;
++stage, --gemm_k_iterations_0) {
iterator_A0.set_iteration_index(0);
this->smem_iterator_A0_.set_iteration_index(0);
// Async Copy for operand A
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA0; ++j) {
typename IteratorA0::AccessType *dst_ptr =
reinterpret_cast<typename IteratorA0::AccessType *>(
this->smem_iterator_A0_.get());
int const kSrcBytes =
sizeof_bits<typename IteratorA0::Element>::value *
IteratorA0::ThreadMap::kElementsPerAccess / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA0>(
dst_ptr, iterator_A0.get(), iterator_A0.valid());
++iterator_A0;
++this->smem_iterator_A0_;
}
iterator_B0.set_iteration_index(0);
this->smem_iterator_B0_.set_iteration_index(0);
// Async Copy for operand B
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB0; ++j) {
typename IteratorB0::AccessType *dst_ptr =
reinterpret_cast<typename IteratorB0::AccessType *>(
this->smem_iterator_B0_.get());
int const kSrcBytes =
sizeof_bits<typename IteratorB0::Element>::value *
IteratorB0::ThreadMap::kElementsPerAccess / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB0>(
dst_ptr, iterator_B0.get(), iterator_B0.valid());
++iterator_B0;
++this->smem_iterator_B0_;
}
// Move to the next stage
iterator_A0.advance();
iterator_B0.advance();
this->smem_iterator_A0_.add_tile_offset({0, 1});
this->smem_iterator_B0_.add_tile_offset({1, 0});
// Inserts a fence to group cp.async instructions into stages.
cutlass::arch::cp_async_fence();
}
// Perform accumulation in the 'd' output operand
FragmentC0 accum0 = src_accum;
// Waits until kStages-2 stages have committed.
cutlass::arch::cp_async_wait<Base::kStages - 2>();
__syncthreads();
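// cp_async_wait<kStages - 2> maps to cp.async.wait_group, which blocks until at
// most (kStages - 2) committed groups remain in flight. The prologue committed
// (kStages - 1) groups, so this wait guarantees that the oldest stage -- the one
// consumed first below -- has landed in shared memory. For example, assuming
// kStages == 4 (illustrative value only), three groups were committed and the
// wait returns once at most two are still outstanding.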
// Pair of fragments used to overlap shared memory loads and math
// instructions
WarpLoadedFragmentA0 warp_loaded_frag_A0[2];
WarpLoadedFragmentB0 warp_loaded_frag_B0[2];
WarpTransformedFragmentA0 warp_transformed_frag_A0[2];
WarpTransformedFragmentB0 warp_transformed_frag_B0[2];
Operator0 warp_mma0;
this->warp_tile_iterator_A0_.set_kgroup_index(0);
this->warp_tile_iterator_B0_.set_kgroup_index(0);
this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[0]);
this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[0]);
++this->warp_tile_iterator_A0_;
++this->warp_tile_iterator_B0_;
// Start issuing the first group of the next stage outside of the mainloop
copy_tiles_and_advance_0(iterator_A0, iterator_B0);
int smem_write_stage_idx = Base::kStages - 1;
int smem_read_stage_idx = 0;
warp_mma0.transform(warp_transformed_frag_A0[0], warp_transformed_frag_B0[0],
warp_loaded_frag_A0[0], warp_loaded_frag_B0[0]);
//
// Mainloop
//
CUTLASS_GEMM_LOOP
for (; gemm_k_iterations_0 > (-Base::kStages + 1);) {
//
// Loop over GEMM K dimension
//
// Computes a warp-level GEMM on data held in shared memory
// Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
CUTLASS_PRAGMA_UNROLL
for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0;
++warp_mma_k) {
// Load warp-level tiles from shared memory, wrapping back to the first k
// offset when this is the last group of the stage.
this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[(warp_mma_k + 1) % 2]);
this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
++this->warp_tile_iterator_A0_;
++this->warp_tile_iterator_B0_;
if (warp_mma_k > 0)
warp_mma0.transform(warp_transformed_frag_A0[warp_mma_k % 2],
warp_transformed_frag_B0[warp_mma_k % 2],
warp_loaded_frag_A0[warp_mma_k % 2],
warp_loaded_frag_B0[warp_mma_k % 2]);
// Issue global->shared copies for the next stage
int group_start_iteration_A0, group_start_iteration_B0;
if (warp_mma_k + 1 == Base::kWarpGemmIterations0) {
group_start_iteration_A0 = 0;
group_start_iteration_B0 = 0;
} else {
group_start_iteration_A0 =
(warp_mma_k + 1) * Detail::kAccessesPerGroupA0;
group_start_iteration_B0 =
(warp_mma_k + 1) * Detail::kAccessesPerGroupB0;
}
copy_tiles_and_advance_0(iterator_A0, iterator_B0, group_start_iteration_A0,
group_start_iteration_B0);
warp_mma0(
accum0,
warp_transformed_frag_A0[warp_mma_k % 2],
warp_transformed_frag_B0[warp_mma_k % 2],
accum0
);
if (warp_mma_k + 1 == Base::kWarpGemmIterations0)
warp_mma0.transform(warp_transformed_frag_A0[(warp_mma_k + 1) % 2],
warp_transformed_frag_B0[(warp_mma_k + 1) % 2],
warp_loaded_frag_A0[(warp_mma_k + 1) % 2],
warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
if (warp_mma_k + 2 == Base::kWarpGemmIterations0) {
// Inserts a fence to group cp.async instructions into stages.
cutlass::arch::cp_async_fence();
// Waits until kStages-2 stages of cp.async have committed
arch::cp_async_wait<Base::kStages - 2>();
__syncthreads();
// Move to the next stage
iterator_A0.advance();
iterator_B0.advance();
this->smem_iterator_A0_.add_tile_offset({0, 1});
this->smem_iterator_B0_.add_tile_offset({1, 0});
// Add negative offsets to return iterators to the 'start' of the
// circular buffer in shared memory
if (smem_write_stage_idx == (Base::kStages - 1)) {
this->smem_iterator_A0_.add_tile_offset({0, -Base::kStages});
this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0});
smem_write_stage_idx = 0;
} else {
++smem_write_stage_idx;
}
if (smem_read_stage_idx == (Base::kStages - 1)) {
this->warp_tile_iterator_A0_.add_tile_offset(
{0, -Base::kStages * Policy0::kPartitionsK *
Base::kWarpGemmIterations0});
this->warp_tile_iterator_B0_.add_tile_offset(
{-Base::kStages * Policy0::kPartitionsK *
Base::kWarpGemmIterations0,
0});
smem_read_stage_idx = 0;
} else {
++smem_read_stage_idx;
}
--gemm_k_iterations_0;
}
}
}
// Insert fence and wait for all outstanding cp.async operations to commit.
cutlass::arch::cp_async_fence();
cutlass::arch::cp_async_wait<0>();
__syncthreads();
/// Epilogue for the first Implicit Gemm
Epilogue0 epilogue0;
epilogue0(output_op_0, smem_iterator_D0_, accum0, iterator_accum0_scale, iterator_accum0_bias);
__syncthreads();
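// Epilogue0 applies output_op_0 (with the per-channel scale/bias streamed by
// iterator_accum0_scale / iterator_accum0_bias) to accum0 and stores the result
// through smem_iterator_D0_ into the shared-memory accumulator tile; the
// __syncthreads() above makes that tile visible so warp_tile_iterator_A1_ can
// read it as the A1 operand of the second GEMM.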
// 2nd Implicit Gemm
//
// Prologue
//
int gemm_k_iterations_1 = Shape0::kN / Shape1::kK;
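// In the back-to-back fusion the first GEMM's N extent becomes the second
// GEMM's K extent, hence the ratio above. For example, assuming
// Shape0::kN == 128 and Shape1::kK == 32 (illustrative values only), the second
// GEMM runs four threadblock-level k iterations over the accumulator tile.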
// Issue several complete stages
CUTLASS_PRAGMA_UNROLL
for (int stage = 0; stage < Base::kStages - 1;
++stage, --gemm_k_iterations_1) {
iterator_B1.set_iteration_index(0);
this->smem_iterator_B1_.set_iteration_index(0);
// Async Copy for operand B
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB1; ++j) {
typename IteratorB1::AccessType *dst_ptr =
reinterpret_cast<typename IteratorB1::AccessType *>(
this->smem_iterator_B1_.get());
int const kSrcBytes =
sizeof_bits<typename IteratorB1::Element>::value *
IteratorB1::ThreadMap::kElementsPerAccess / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
dst_ptr, iterator_B1.get(), iterator_B1.valid());
++iterator_B1;
++this->smem_iterator_B1_;
}
// Move to the next stage
iterator_B1.advance();
this->smem_iterator_B1_.add_tile_offset({1, 0});
// Inserts a fence to group cp.async instructions into stages.
cutlass::arch::cp_async_fence();
}
// Waits until kStages-2 stages have committed.
cutlass::arch::cp_async_wait<Base::kStages - 2>();
__syncthreads();
// Pair of fragments used to overlap shared memory loads and math
// instructions
WarpLoadedFragmentA1 warp_loaded_frag_A1[2];
WarpLoadedFragmentB1 warp_loaded_frag_B1[2];
WarpTransformedFragmentA1 warp_transformed_frag_A1[2];
WarpTransformedFragmentB1 warp_transformed_frag_B1[2];
Operator1 warp_mma1;
warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0]);
++warp_tile_iterator_A1_;
this->warp_tile_iterator_B1_.set_kgroup_index(0);
this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[0]);
++this->warp_tile_iterator_B1_;
// Start issuing the first group of the next stage outside of the mainloop
copy_tiles_and_advance_1(iterator_B1);
smem_write_stage_idx = Base::kStages - 1;
smem_read_stage_idx = 0;
warp_mma1.transform(warp_transformed_frag_A1[0], warp_transformed_frag_B1[0],
warp_loaded_frag_A1[0], warp_loaded_frag_B1[0]);
//
// Mainloop
//
CUTLASS_PRAGMA_UNROLL
for ( gemm_k_iterations_1 = Shape0::kN / Shape1::kK - (Base::kStages - 1);
gemm_k_iterations_1 > (-Base::kStages + 1); gemm_k_iterations_1--) {
//
// Loop over GEMM K dimension
//
// Computes a warp-level GEMM on data held in shared memory
// Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
CUTLASS_PRAGMA_UNROLL
for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1;
++warp_mma_k) {
// Load warp-level tile from accumulator fragment
// skip warp tile loading for the last kgroup
if(gemm_k_iterations_1 > (-Base::kStages + 2) || warp_mma_k < Base::kWarpGemmIterations1 - 1) {
warp_tile_iterator_A1_.load(warp_loaded_frag_A1[(warp_mma_k + 1) % 2]);
}
++warp_tile_iterator_A1_;
// Load warp-level tiles from shared memory, wrapping back to the first k
// offset when this is the last group of the stage.
this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
++this->warp_tile_iterator_B1_;
if (warp_mma_k > 0)
warp_mma1.transform(warp_transformed_frag_A1[warp_mma_k % 2],
warp_transformed_frag_B1[warp_mma_k % 2],
warp_loaded_frag_A1[warp_mma_k % 2],
warp_loaded_frag_B1[warp_mma_k % 2]);
// Issue global->shared copies for the next stage
int group_start_iteration_B1;
if (warp_mma_k + 1 == Base::kWarpGemmIterations1) {
group_start_iteration_B1 = 0;
} else {
group_start_iteration_B1 =
(warp_mma_k + 1) * Detail::kAccessesPerGroupB1;
}
copy_tiles_and_advance_1(iterator_B1,
group_start_iteration_B1);
warp_mma1(
accum,
warp_transformed_frag_A1[warp_mma_k % 2],
warp_transformed_frag_B1[warp_mma_k % 2],
accum
);
if (warp_mma_k + 1 == Base::kWarpGemmIterations1)
warp_mma1.transform(warp_transformed_frag_A1[(warp_mma_k + 1) % 2],
warp_transformed_frag_B1[(warp_mma_k + 1) % 2],
warp_loaded_frag_A1[(warp_mma_k + 1) % 2],
warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
if (warp_mma_k + 2 == Base::kWarpGemmIterations1) {
// Inserts a fence to group cp.async instructions into stages.
cutlass::arch::cp_async_fence();
// Waits until kStages-2 stages of cp.async have committed
arch::cp_async_wait<Base::kStages - 2>();
__syncthreads();
// Move to the next stage
iterator_B1.advance();
this->smem_iterator_B1_.add_tile_offset({1, 0});
// Add negative offsets to return iterators to the 'start' of the
// circular buffer in shared memory
if (smem_write_stage_idx == (Base::kStages - 1)) {
this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
smem_write_stage_idx = 0;
} else {
++smem_write_stage_idx;
}
if (smem_read_stage_idx == (Base::kStages - 1)) {
this->warp_tile_iterator_B1_.add_tile_offset(
{-Base::kStages * Policy1::kPartitionsK *
Base::kWarpGemmIterations1,
0});
smem_read_stage_idx = 0;
} else {
++smem_read_stage_idx;
}
}
}
}
// Insert fence and wait for all outstanding cp.async operations to commit.
cutlass::arch::cp_async_fence();
cutlass::arch::cp_async_wait<0>();
__syncthreads();
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace conv
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
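Both mainloops in the file above cycle `smem_write_stage_idx` and `smem_read_stage_idx` through a circular buffer of `Base::kStages` shared-memory tiles, with the reader trailing the writer by the stages prefetched in the prologue. A minimal standalone sketch of that bookkeeping, using hypothetical values (`kStages = 3`, eight k-group boundaries) rather than anything taken from the kernel itself:

```
#include <cstdio>

int main() {
  constexpr int kStages = 3;                 // assumed stage count (illustrative)
  int smem_write_stage_idx = kStages - 1;    // prologue has already filled kStages - 1 tiles
  int smem_read_stage_idx  = 0;

  for (int step = 0; step < 8; ++step) {     // eight hypothetical k-group boundaries
    // Advance the write index, wrapping back to tile 0 after the last tile.
    smem_write_stage_idx = (smem_write_stage_idx == kStages - 1) ? 0 : smem_write_stage_idx + 1;
    // Advance the read index the same way; it trails the write index by kStages - 1 tiles.
    smem_read_stage_idx = (smem_read_stage_idx == kStages - 1) ? 0 : smem_read_stage_idx + 1;
    std::printf("step %d: write tile %d, read tile %d\n", step, smem_write_stage_idx, smem_read_stage_idx);
  }
  return 0;
}
```

The kernels achieve the same wrap-around with the `add_tile_offset({-Base::kStages, 0})`-style adjustments, which rewind the iterators instead of taking an index modulo.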


@@ -0,0 +1,553 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Template for a double-buffered threadblock-scoped GEMM kernel.
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/array.h"
#include "cutlass/aligned_buffer.h"
#include "cutlass/numeric_conversion.h"
#include "cutlass/numeric_types.h"
#include "cutlass/matrix_shape.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
#include "threadblock/b2b_mma_base.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace conv {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
template <
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape0_,
/// Iterates over tiles of A operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
typename IteratorA0_,
/// Iterates over tiles of A operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorA0_,
/// Iterates over tiles of B operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
typename IteratorB0_,
/// Iterates over tiles of B operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorB0_,
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape1_,
/// Iterates over the intermediate accumulator tile
// (concept::MmaTensorOpFragmentIterator)
typename FragmentIteratorA1_,
/// Iterates over vectors of scale and bias vector in global memory
// (concept: VectorIterator)
typename IteratorAccumulatorScaleBias_,
/// FragmentIterator to load Scale or Bias vector from threadblock fragment
typename FragmentIteratorA1ScaleBias_,
// (concept: VectorFragmentIterator)
/// Iterates over tiles of B operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
typename IteratorB1_,
/// Iterates over tiles of B operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorB1_,
/// Data type of accumulator matrix
typename ElementC_,
/// Layout of accumulator matrix
typename LayoutC_,
/// Output operator for 1st Gemm (concept: epilogue::thread::LinearCombinationClamp, etc...)
typename OutputOp_,
/// Policy describing tuning details (concept: MmaPolicy)
typename Policy0_,
/// Policy describing tuning details (concept: MmaPolicy)
typename Policy1_,
/// Transformation applied to A0 operand
typename TransformA0_ = NumericArrayConverter<
typename SmemIteratorA0_::Element,
typename IteratorA0_::Element,
IteratorA0_::Fragment::kElements>,
///
/// Transformation applied to B0 operand
typename TransformB0_ = NumericArrayConverter<
typename SmemIteratorB0_::Element,
typename IteratorB0_::Element,
IteratorB0_::Fragment::kElements>,
///
/// Transformation applied to B1 operand
typename TransformB1_ = NumericArrayConverter<
typename SmemIteratorB1_::Element,
typename IteratorB1_::Element,
IteratorB1_::Fragment::kElements>,
/// Used for partial specialization
typename Enable = bool
>
class B2bImplicitGemmPipelined :
public gemm::threadblock::B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, 2> {
public:
///< Base class
using Base = gemm::threadblock::B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, 2>;
using Shape0 = Shape0_; ///< Size of the Gemm problem - concept: gemm::GemmShape<>
using IteratorA0 = IteratorA0_; ///< Iterates over tiles of A operand in global memory
using IteratorB0 = IteratorB0_; ///< Iterates over tiles of B operand in global memory
using Policy0 = Policy0_; ///< Policy0 describing tuning details
using SmemIteratorA0 = SmemIteratorA0_;
using SmemIteratorB0 = SmemIteratorB0_;
using Shape1 = Shape1_; ///< Size of the Gemm problem - concept: gemm::GemmShape<>
using FragmentIteratorA1 = FragmentIteratorA1_; ///< Iterates over tiles of A1 operand from accumulator tile
using IteratorAccumulatorScaleBias = IteratorAccumulatorScaleBias_; ///< Iterates over tiles of the scale and bias vectors in global memory
using FragmentIteratorA1ScaleBias =
FragmentIteratorA1ScaleBias_; ///< WarpIterator to load Scale or Bias vector from the threadblock fragment
using IteratorB1 = IteratorB1_; ///< Iterates over tiles of B operand in global memory
using Policy1 = Policy1_; ///< Policy1 describing tuning details
using SmemIteratorB1 = SmemIteratorB1_;
using ElementC = ElementC_; ///< Data type of accumulator matrix
using LayoutC = LayoutC_; ///< Layout of accumulator matrix
using OutputOp = OutputOp_; ///< Epilogue after 1st Gemm
static const bool PerChannelScale = (OutputOp::kScale ==
epilogue::thread::ScaleType::OnlyAlphaPerChannelScaling);
using TransformA0 = TransformA0_;
using TransformB0 = TransformB0_;
using TransformB1 = TransformB1_;
//
// Dependent types
//
/// Fragment of operand A loaded from global memory
using FragmentA0 = typename IteratorA0::Fragment;
/// Fragment of operand B loaded from global memory
using FragmentB0 = typename IteratorB0::Fragment;
/// Fragment of accumulator tile
using FragmentC0 = typename Policy0::Operator::FragmentC;
/// Warp-level Mma
using Operator0 = typename Policy0::Operator;
/// Fragment of Scale and Bias loaded from global memory
using FragmentA1ScaleBias = typename IteratorAccumulatorScaleBias::Fragment;
/// Fragment of operand B loaded from global memory
using FragmentB1 = typename IteratorB1::Fragment;
/// Fragment of accumulator tile
using FragmentC1 = typename Policy1::Operator::FragmentC;
/// Warp-level Mma
using Operator1 = typename Policy1::Operator;
/// Obtain the arch tag from the warp-level operator
using ArchTag = typename Policy0::Operator::ArchTag;
/// Complex transform on A0 operand
static ComplexTransform const kTransformA0 = Operator0::kTransformA;
/// Complex transform on B0 operand
static ComplexTransform const kTransformB0 = Operator0::kTransformB;
/// Complex transform on B1 operand
static ComplexTransform const kTransformB1 = Operator1::kTransformB;
// statically assert kStages for MmaPipelined is two (Double-buffered pipeline)
static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");
private:
using WarpFragmentA0 = typename Operator0::FragmentA;
using WarpFragmentB0 = typename Operator0::FragmentB;
/// Warp Fragment of operand A1 loaded from accumulator tile
using WarpFragmentA1 = typename FragmentIteratorA1::Fragment;
/// Warp Fragment of operand A1 scale and bias loaded from threadblock fragment
using WarpFragmentA1ScaleBias =
typename FragmentIteratorA1ScaleBias::Fragment;
using WarpFragmentB1 = typename Operator1::FragmentB;
protected:
/// Iterator to write threadblock-scoped tile of A operand to shared memory
SmemIteratorA0 smem_iterator_A_;
/// Iterator to write threadblock-scoped tile of B0 operand to shared memory
SmemIteratorB0 smem_iterator_B0_;
/// Iterator to write threadblock-scoped tile of B1 operand to shared memory
SmemIteratorB1 smem_iterator_B1_;
public:
/// Construct from tensor references
CUTLASS_DEVICE
B2bImplicitGemmPipelined(
typename Base::B2bMmaSharedStorage &shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM
int thread_idx, ///< ID within the threadblock
int warp_idx, ///< ID of warp
int lane_idx ///< ID of each thread within a warp
):
Base(shared_storage, thread_idx, warp_idx, lane_idx),
smem_iterator_A_(shared_storage.shared_storage0.operand_A_ref(), thread_idx),
smem_iterator_B0_(shared_storage.shared_storage0.operand_B_ref(), thread_idx),
smem_iterator_B1_(shared_storage.shared_storage1.operand_B_ref(), thread_idx) {
// Compute warp location within threadblock tile by mapping the warp_id to
// three coordinates:
// _m: the warp's position within the threadblock along the M dimension
// _n: the warp's position within the threadblock along the N dimension
// _k: the warp's position within the threadblock along the K dimension
int warp_idx_mn = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN);
int warp_idx_k = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN);
int warp_idx_m = warp_idx_mn % Base::WarpCount0::kM;
int warp_idx_n = warp_idx_mn / Base::WarpCount0::kM;
// These may change across different GEMM layers
int tile_offset_k_0 = Base::kWarpGemmIterations0 * warp_idx_k;
int tile_offset_k_1 = Base::kWarpGemmIterations1 * warp_idx_k;
// Add per-warp offsets in units of warp-level tiles
this->warp_tile_iterator_A0_.add_tile_offset({warp_idx_m, tile_offset_k_0});
this->warp_tile_iterator_B0_.add_tile_offset({tile_offset_k_0, warp_idx_n});
this->warp_tile_iterator_B1_.add_tile_offset({tile_offset_k_1, warp_idx_n});
}
/// Perform a threadblock-scoped matrix multiply-accumulate
CUTLASS_DEVICE
void operator()(
int gemm_k_iterations_0, ///< number of iterations of the mainloop
FragmentC1 &accum, ///< destination accumulator tile
IteratorA0 iterator_A, ///< iterator over A operand in global memory
IteratorB0 iterator_B0, ///< iterator over B0 operand in global memory
IteratorAccumulatorScaleBias iterator_A1_scale, ///< iterator over A1 operand scale vectors in global memory
IteratorAccumulatorScaleBias iterator_A1_bias, ///< iterator over A1 operand bias vectors in global memory
IteratorB1 iterator_B1, ///< iterator over B1 operand in global memory
FragmentC0 const &src_accum, ///< source accumulator tile
OutputOp output_op_0, ///< epilogue operation after 1st Gemm
TransformA0 transform_A0 = TransformA0(), ///< transformation applied to A0 fragment
TransformB0 transform_B0 = TransformB0(), ///< transformation applied to B0 fragment
TransformB1 transform_B1 = TransformB1()) { ///< transformation applied to B1 fragment
//
// Prologue
//
// Perform accumulation in the 'd' output operand
FragmentC0 accum0 = src_accum;
FragmentA0 tb_frag_A;
FragmentB0 tb_frag_B0;
tb_frag_A.clear();
tb_frag_B0.clear();
// The last kblock is loaded in the prologue
iterator_A.load(tb_frag_A);
iterator_B0.load(tb_frag_B0);
++iterator_A;
++iterator_B0;
this->smem_iterator_A_.store(transform_A0(tb_frag_A));
this->smem_iterator_B0_.store(transform_B0(tb_frag_B0));
++this->smem_iterator_A_;
++this->smem_iterator_B0_;
__syncthreads();
// Pair of fragments used to overlap shared memory loads and math instructions
WarpFragmentA0 warp_frag_A0[2];
WarpFragmentB0 warp_frag_B0[2];
this->warp_tile_iterator_A0_.set_kgroup_index(0);
this->warp_tile_iterator_B0_.set_kgroup_index(0);
this->warp_tile_iterator_A0_.load(warp_frag_A0[0]);
this->warp_tile_iterator_B0_.load(warp_frag_B0[0]);
++this->warp_tile_iterator_A0_;
++this->warp_tile_iterator_B0_;
Operator0 warp_mma0;
int smem_write_stage_idx = 1;
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
// shared memory loads (which have the tightest latency requirement).
//
// Mainloop
//
// Note: The main loop does not support Base::kWarpGemmIterations == 2.
CUTLASS_GEMM_LOOP
for (; gemm_k_iterations_0 > 0; --gemm_k_iterations_0) {
//
// Loop over GEMM K dimension
//
CUTLASS_PRAGMA_UNROLL
for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0; ++warp_mma_k) {
// Load warp-level tiles from shared memory, wrapping back to the first k offset
// when this is the last group of the stage.
if (warp_mma_k == Base::kWarpGemmIterations0 - 1) {
// Write fragments to shared memory
this->smem_iterator_A_.store(transform_A0(tb_frag_A));
this->smem_iterator_B0_.store(transform_B0(tb_frag_B0));
__syncthreads();
++this->smem_iterator_A_;
++this->smem_iterator_B0_;
// Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
if (smem_write_stage_idx == 1) {
this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0});
}
else {
this->warp_tile_iterator_A0_.add_tile_offset(
{0, -Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0});
this->warp_tile_iterator_B0_.add_tile_offset(
{-Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0,
0});
}
smem_write_stage_idx ^= 1;
}
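// With kStages == 2 the write index simply toggles between 0 and 1: on one
// k-block boundary the shared-memory store iterators wrap back to the start of
// the double buffer, on the next the warp load iterators wrap, so reads always
// trail writes by exactly one buffer.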
this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
this->warp_tile_iterator_A0_.load(warp_frag_A0[(warp_mma_k + 1) % 2]);
this->warp_tile_iterator_B0_.load(warp_frag_B0[(warp_mma_k + 1) % 2]);
++this->warp_tile_iterator_A0_;
++this->warp_tile_iterator_B0_;
if (warp_mma_k == 0) {
iterator_A.load(tb_frag_A);
iterator_B0.load(tb_frag_B0);
++iterator_A;
++iterator_B0;
}
warp_mma0(accum0, warp_frag_A0[warp_mma_k % 2],
warp_frag_B0[warp_mma_k % 2], accum0);
}
}
// 2nd Implicit Gemm
/// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile
FragmentIteratorA1 warp_tile_iterator_A1_(accum0);
//
// Prologue
//
FragmentA1ScaleBias tb_frag_A1_scale;
FragmentA1ScaleBias tb_frag_A1_bias;
FragmentIteratorA1ScaleBias warp_tile_iterator_A1_scale_(tb_frag_A1_scale);
FragmentIteratorA1ScaleBias warp_tile_iterator_A1_bias_(tb_frag_A1_bias);
FragmentB1 tb_frag_B1;
if(PerChannelScale)
tb_frag_A1_scale.clear();
tb_frag_A1_bias.clear();
tb_frag_B1.clear();
// The last kblock is loaded in the prologue
if(PerChannelScale)
iterator_A1_scale.load(tb_frag_A1_scale);
iterator_A1_bias.load(tb_frag_A1_bias);
iterator_B1.load(tb_frag_B1);
if(PerChannelScale)
++iterator_A1_scale;
++iterator_A1_bias;
++iterator_B1;
this->smem_iterator_B1_.store(transform_B1(tb_frag_B1));
++this->smem_iterator_B1_;
__syncthreads();
// Pair of fragments used to overlap shared memory loads and math instructions
WarpFragmentA1ScaleBias warp_frag_A1_scale[2];
WarpFragmentA1ScaleBias warp_frag_A1_bias[2];
WarpFragmentA1 warp_frag_A1[2];
WarpFragmentB1 warp_frag_B1[2];
this->warp_tile_iterator_B1_.set_kgroup_index(0);
if(PerChannelScale)
warp_tile_iterator_A1_scale_.load(warp_frag_A1_scale[0]);
warp_tile_iterator_A1_bias_.load(warp_frag_A1_bias[0]);
warp_tile_iterator_A1_.load(warp_frag_A1[0], warp_frag_A1_scale[0],
warp_frag_A1_bias[0], output_op_0);
this->warp_tile_iterator_B1_.load(warp_frag_B1[0]);
++warp_tile_iterator_A1_;
if(PerChannelScale)
++warp_tile_iterator_A1_scale_;
++warp_tile_iterator_A1_bias_;
++this->warp_tile_iterator_B1_;
Operator1 warp_mma1;
smem_write_stage_idx = 1;
int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1;
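// The second mainloop iterates over the accumulator-sourced A1 operand rather
// than over global-memory A tiles, so its trip count comes from the fragment
// iterator. For example, assuming FragmentIteratorA1::Policy::kIterations == 8
// and Base::kWarpGemmIterations1 == 2 (illustrative values only), the loop below
// runs four times.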
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
// shared memory loads (which have the tightest latency requirement).
//
// Mainloop
//
// Note: The main loop does not support Base::kWarpGemmIterations == 2.
CUTLASS_PRAGMA_UNROLL
for (; gemm_k_iterations_1 > 0; --gemm_k_iterations_1) {
//
// Loop over GEMM K dimension
//
CUTLASS_PRAGMA_UNROLL
for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1; ++warp_mma_k) {
// Load warp-level tiles from shared memory, wrapping back to the first k offset
// when this is the last group of the stage.
if (warp_mma_k == Base::kWarpGemmIterations1 - 1) {
this->smem_iterator_B1_.store(transform_B1(tb_frag_B1));
__syncthreads();
++this->smem_iterator_B1_;
// Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
if (smem_write_stage_idx == 1) {
this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
}
else {
this->warp_tile_iterator_B1_.add_tile_offset(
{-Base::kStages * Policy1::kPartitionsK * Base::kWarpGemmIterations1,
0});
}
smem_write_stage_idx ^= 1;
if(PerChannelScale) {
tb_frag_A1_scale.clear();
iterator_A1_scale.load(tb_frag_A1_scale);
++iterator_A1_scale;
}
tb_frag_A1_bias.clear();
iterator_A1_bias.load(tb_frag_A1_bias);
++iterator_A1_bias;
}
this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
if(PerChannelScale)
warp_tile_iterator_A1_scale_.load(warp_frag_A1_scale[(warp_mma_k + 1) % 2]);
warp_tile_iterator_A1_bias_.load(warp_frag_A1_bias[(warp_mma_k + 1) % 2]);
warp_tile_iterator_A1_.load(warp_frag_A1[(warp_mma_k + 1) % 2],
warp_frag_A1_scale[(warp_mma_k + 1) % 2],
warp_frag_A1_bias[(warp_mma_k + 1) % 2],
output_op_0);
this->warp_tile_iterator_B1_.load(warp_frag_B1[(warp_mma_k + 1) % 2]);
if(PerChannelScale)
++warp_tile_iterator_A1_scale_;
++warp_tile_iterator_A1_bias_;
++warp_tile_iterator_A1_;
++this->warp_tile_iterator_B1_;
if (warp_mma_k == 0) {
iterator_B1.load(tb_frag_B1);
++iterator_B1;
}
warp_mma1(accum, warp_frag_A1[warp_mma_k % 2],
warp_frag_B1[warp_mma_k % 2], accum);
}
}
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace conv
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////


@@ -0,0 +1,535 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Template for a double-buffered threadblock-scoped GEMM kernel.
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/array.h"
#include "cutlass/aligned_buffer.h"
#include "cutlass/numeric_conversion.h"
#include "cutlass/numeric_types.h"
#include "cutlass/matrix_shape.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
#include "threadblock/b2b_mma_base_smem_accumulator.h"
#include "cutlass/epilogue/threadblock/epilogue_smem_accumulator.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace conv {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
template <
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape0_,
/// Iterates over tiles of A operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
typename IteratorA0_,
/// Iterates over tiles of A operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorA0_,
/// Iterates over tiles of B operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
typename IteratorB0_,
/// Iterates over tiles of B operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorB0_,
/// Iterates over vectors of scale and bias vector in global memory
// (concept: VectorIterator)
typename IteratorAccumulatorScaleBias_,
/// Iterates over accumulator tile
typename FragmentIteratorAccumulator_,
/// Iterates over accumulator tile in shared memory
typename SmemIteratorD0_,
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape1_,
/// Iterates over the intermediate accumulator tile in shared memory
typename WarpIteratorA1_,
/// Iterates over tiles of B operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
typename IteratorB1_,
/// Iterates over tiles of B operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorB1_,
/// Data type of accumulator matrix
typename ElementC_,
/// Layout of accumulator matrix
typename LayoutC_,
/// Output operator for 1st Gemm (concept: epilogue::thread::LinearCombinationClamp, etc...)
typename OutputOp_,
/// Policy describing tuning details (concept: MmaPolicy)
typename Policy0_,
/// Policy describing tuning details (concept: MmaPolicy)
typename Policy1_,
/// Transformation applied to A0 operand
typename TransformA0_ = NumericArrayConverter<
typename SmemIteratorA0_::Element,
typename IteratorA0_::Element,
IteratorA0_::Fragment::kElements>,
///
/// Transformation applied to B0 operand
typename TransformB0_ = NumericArrayConverter<
typename SmemIteratorB0_::Element,
typename IteratorB0_::Element,
IteratorB0_::Fragment::kElements>,
///
/// Transformation applied to B1 operand
typename TransformB1_ = NumericArrayConverter<
typename SmemIteratorB1_::Element,
typename IteratorB1_::Element,
IteratorB1_::Fragment::kElements>,
/// Used for partial specialization
typename Enable = bool
>
class B2bImplicitGemmPipelinedSmemAccumulator :
public gemm::threadblock::B2bMmaBaseSmemAccumulator<Shape0_, Shape1_, Policy0_, Policy1_, SmemIteratorD0_, 2> {
public:
///< Base class
using Base = gemm::threadblock::B2bMmaBaseSmemAccumulator<Shape0_, Shape1_, Policy0_, Policy1_, SmemIteratorD0_, 2>;
using Shape0 = Shape0_; ///< Size of the Gemm problem - concept: gemm::GemmShape<>
using IteratorA0 = IteratorA0_; ///< Iterates over tiles of A operand in global memory
using IteratorB0 = IteratorB0_; ///< Iterates over tiles of B operand in global memory
using IteratorAccumulatorScaleBias = IteratorAccumulatorScaleBias_; ///< Iterates over tiles of the scale and bias vectors in global memory
using Policy0 = Policy0_; ///< Policy0 describing tuning details
using SmemIteratorA0 = SmemIteratorA0_;
using SmemIteratorB0 = SmemIteratorB0_;
using SmemIteratorD0 = SmemIteratorD0_; ///< Iterates over accumulator tile in shared memory
using FragmentIteratorAccumulator = FragmentIteratorAccumulator_; ///< Iterates over accumulator tile
using Shape1 = Shape1_; ///< Size of the Gemm problem - concept: gemm::GemmShape<>
using IteratorB1 = IteratorB1_; ///< Iterates over tiles of B operand in global memory
using Policy1 = Policy1_; ///< Policy1 describing tuning details
using SmemIteratorB1 = SmemIteratorB1_;
using WarpIteratorA1 = WarpIteratorA1_; ///< Iterates over the intermediate accumulator tile in shared memory
using ElementC = ElementC_; ///< Data type of accumulator matrix
using LayoutC = LayoutC_; ///< Layout of accumulator matrix
using OutputOp = OutputOp_; ///< Epilogue after 1st Gemm
using TransformA0 = TransformA0_;
using TransformB0 = TransformB0_;
using TransformB1 = TransformB1_;
//
// Dependent types
//
/// Fragment of operand A loaded from global memory
using FragmentA0 = typename IteratorA0::Fragment;
/// Fragment of operand B loaded from global memory
using FragmentB0 = typename IteratorB0::Fragment;
/// Fragment of accumulator tile
using FragmentC0 = typename Policy0::Operator::FragmentC;
/// Warp-level Mma
using Operator0 = typename Policy0::Operator;
/// Fragment of operand B loaded from global memory
using FragmentB1 = typename IteratorB1::Fragment;
/// Fragment of accumulator tile
using FragmentC1 = typename Policy1::Operator::FragmentC;
/// Warp-level Mma
using Operator1 = typename Policy1::Operator;
/// Obtain the arch tag from the warp-level operator
using ArchTag = typename Policy0::Operator::ArchTag;
/// Complex transform on A0 operand
static ComplexTransform const kTransformA0 = Operator0::kTransformA;
/// Complex transform on B0 operand
static ComplexTransform const kTransformB0 = Operator0::kTransformB;
/// Complex transform on B1 operand
static ComplexTransform const kTransformB1 = Operator1::kTransformB;
/// statically assert kStages for MmaPipelined is two (Double-buffered pipeline)
static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");
/// Epilogue in shared memory
using Epilogue0 = epilogue::threadblock::EpilogueSmemAccumulator<
SmemIteratorD0, ///< SmemTileIterator
FragmentIteratorAccumulator, ///< AccumulatorFragmentIterator
IteratorAccumulatorScaleBias, ///< ScaleBiasIterator
OutputOp>; ///< Output operator
private:
using WarpFragmentA0 = typename Operator0::FragmentA;
using WarpFragmentB0 = typename Operator0::FragmentB;
using WarpFragmentA1 = typename Operator1::FragmentA;
using WarpFragmentB1 = typename Operator1::FragmentB;
protected:
/// Iterator to write threadblock-scoped tile of A operand to shared memory
SmemIteratorA0 smem_iterator_A_;
/// Iterator to write threadblock-scoped tile of B0 operand to shared memory
SmemIteratorB0 smem_iterator_B0_;
/// Shared Memory Iterator to store accumulator tile
SmemIteratorD0 smem_iterator_D0_;
/// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile
WarpIteratorA1 warp_tile_iterator_A1_;
/// Iterator to write threadblock-scoped tile of B1 operand to shared memory
SmemIteratorB1 smem_iterator_B1_;
public:
/// Construct from tensor references
CUTLASS_DEVICE
B2bImplicitGemmPipelinedSmemAccumulator(
typename Base::B2bMmaSharedStorage &shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM
int thread_idx, ///< ID within the threadblock
int warp_idx, ///< ID of warp
int lane_idx ///< ID of each thread within a warp
):
Base(shared_storage, thread_idx, warp_idx, lane_idx),
smem_iterator_A_(shared_storage.b2b_mma_shared_storage.shared_storage0.operand_A_ref(), thread_idx),
smem_iterator_B0_(shared_storage.b2b_mma_shared_storage.shared_storage0.operand_B_ref(), thread_idx),
smem_iterator_D0_(shared_storage.accumulator_shared_storage0.accum_ref(), lane_idx),
warp_tile_iterator_A1_(shared_storage.accumulator_shared_storage0.accum_ref(), lane_idx),
smem_iterator_B1_(shared_storage.b2b_mma_shared_storage.shared_storage1.operand_B_ref(), thread_idx) {
// Compute warp location within threadblock tile by mapping the warp_id to
// three coordinates:
// _m: the warp's position within the threadblock along the M dimension
// _n: the warp's position within the threadblock along the N dimension
// _k: the warp's position within the threadblock along the K dimension
int warp_idx_mn_0 = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN);
int warp_idx_k_0 = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN);
int warp_idx_m_0 = warp_idx_mn_0 % Base::WarpCount0::kM;
int warp_idx_n_0 = warp_idx_mn_0 / Base::WarpCount0::kM;
int tile_offset_k_0 = Base::kWarpGemmIterations0 * warp_idx_k_0;
int warp_idx_mn_1 = warp_idx % (Base::WarpCount1::kM * Base::WarpCount1::kN);
int warp_idx_k_1 = warp_idx / (Base::WarpCount1::kM * Base::WarpCount1::kN);
int warp_idx_m_1 = warp_idx_mn_1 % Base::WarpCount1::kM;
int warp_idx_n_1 = warp_idx_mn_1 / Base::WarpCount1::kM;
int tile_offset_k_1 = Base::kWarpGemmIterations1 * warp_idx_k_1;
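// Worked example of the decomposition above (assuming, say, WarpCount0::kM = 2,
// WarpCount0::kN = 2 and kWarpGemmIterations0 = 8): warp_idx = 3 gives
// warp_idx_mn_0 = 3, warp_idx_k_0 = 0, warp_idx_m_0 = 1, warp_idx_n_0 = 1 and
// tile_offset_k_0 = 0, i.e. the warp that owns the (m = 1, n = 1) warp tile of the
// threadblock tile with no additional k offset.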
// Add per-warp offsets in units of warp-level tiles
this->warp_tile_iterator_A0_.add_tile_offset({warp_idx_m_0, tile_offset_k_0});
this->warp_tile_iterator_B0_.add_tile_offset({tile_offset_k_0, warp_idx_n_0});
warp_tile_iterator_A1_.add_tile_offset({warp_idx_m_1, tile_offset_k_1});
this->warp_tile_iterator_B1_.add_tile_offset({tile_offset_k_1, warp_idx_n_1});
// Add smem accumulator iterator warp offset
smem_iterator_D0_.add_tile_offset({ warp_idx_m_0 * SmemIteratorD0::TileIterations::kRow,
warp_idx_n_0 * SmemIteratorD0::TileIterations::kColumn});
}
/// Perform a threadblock-scoped matrix multiply-accumulate
CUTLASS_DEVICE
void operator()(
int gemm_k_iterations_0, ///< number of iterations of the mainloop
FragmentC1 &accum, ///< destination accumulator tile
IteratorA0 iterator_A, ///< iterator over A operand in global memory
IteratorB0 iterator_B0, ///< iterator over B0 operand in global memory
IteratorAccumulatorScaleBias iterator_accum0_scale, ///< iterator over D0 scale vector in global memory
IteratorAccumulatorScaleBias iterator_accum0_bias, ///< iterator over D0 bias vector in global memory
IteratorB1 iterator_B1, ///< iterator over B1 operand in global memory
FragmentC0 const &src_accum, ///< source accumulator tile
OutputOp output_op_0, ///< epilogue operation after 1st Gemm
TransformA0 transform_A0 = TransformA0(), ///< transformation applied to A0 fragment
TransformB0 transform_B0 = TransformB0(), ///< transformation applied to B0 fragment
TransformB1 transform_B1 = TransformB1()) { ///< transformation applied to B1 fragment
//
// Prologue
//
// Perform accumulation in the 'd' output operand
FragmentC0 accum0 = src_accum;
FragmentA0 tb_frag_A;
FragmentB0 tb_frag_B0;
tb_frag_A.clear();
tb_frag_B0.clear();
// The last k-block is loaded in the prologue
iterator_A.load(tb_frag_A);
iterator_B0.load(tb_frag_B0);
++iterator_A;
++iterator_B0;
this->smem_iterator_A_.store(transform_A0(tb_frag_A));
this->smem_iterator_B0_.store(transform_B0(tb_frag_B0));
++this->smem_iterator_A_;
++this->smem_iterator_B0_;
__syncthreads();
// Pair of fragments used to overlap shared memory loads and math instructions
WarpFragmentA0 warp_frag_A0[2];
WarpFragmentB0 warp_frag_B0[2];
this->warp_tile_iterator_A0_.set_kgroup_index(0);
this->warp_tile_iterator_B0_.set_kgroup_index(0);
this->warp_tile_iterator_A0_.load(warp_frag_A0[0]);
this->warp_tile_iterator_B0_.load(warp_frag_B0[0]);
++this->warp_tile_iterator_A0_;
++this->warp_tile_iterator_B0_;
Operator0 warp_mma0;
int smem_write_stage_idx = 1;
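// Note: smem_write_stage_idx selects which half of the double-buffered shared memory
// tile is written next; it is toggled between 0 and 1 (via the ^= 1 below) once per
// staged k-block.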
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
// shared memory loads (which have the tightest latency requirement).
//
// Mainloop
//
// Note: The main loop does not support Base::kWarpGemmIterations == 2.
CUTLASS_GEMM_LOOP
for (; gemm_k_iterations_0 > 0; --gemm_k_iterations_0) {
//
// Loop over GEMM K dimension
//
CUTLASS_PRAGMA_UNROLL
for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0; ++warp_mma_k) {
// Load warp-level tiles from shared memory, wrapping to the k offset if this is the last group.
if (warp_mma_k == Base::kWarpGemmIterations0 - 1) {
// Write fragments to shared memory
this->smem_iterator_A_.store(transform_A0(tb_frag_A));
this->smem_iterator_B0_.store(transform_B0(tb_frag_B0));
__syncthreads();
++this->smem_iterator_A_;
++this->smem_iterator_B0_;
// Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
if (smem_write_stage_idx == 1) {
this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0});
}
else {
this->warp_tile_iterator_A0_.add_tile_offset(
{0, -Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0});
this->warp_tile_iterator_B0_.add_tile_offset(
{-Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0,
0});
}
smem_write_stage_idx ^= 1;
}
this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
this->warp_tile_iterator_A0_.load(warp_frag_A0[(warp_mma_k + 1) % 2]);
this->warp_tile_iterator_B0_.load(warp_frag_B0[(warp_mma_k + 1) % 2]);
++this->warp_tile_iterator_A0_;
++this->warp_tile_iterator_B0_;
if (warp_mma_k == 0) {
iterator_A.load(tb_frag_A);
iterator_B0.load(tb_frag_B0);
++iterator_A;
++iterator_B0;
}
warp_mma0(accum0, warp_frag_A0[warp_mma_k % 2],
warp_frag_B0[warp_mma_k % 2], accum0);
}
}
/// Epilogue for the first Implicit Gemm
Epilogue0 epilogue0;
epilogue0(output_op_0, smem_iterator_D0_, accum0, iterator_accum0_scale, iterator_accum0_bias);
__syncthreads();
/// 2nd Implicit Gemm
//
// Prologue
//
FragmentB1 tb_frag_B1;
tb_frag_B1.clear();
// The last k-block is loaded in the prologue
iterator_B1.load(tb_frag_B1);
++iterator_B1;
this->smem_iterator_B1_.store(transform_B1(tb_frag_B1));
++this->smem_iterator_B1_;
__syncthreads();
// Pair of fragments used to overlap shared memory loads and math instructions
WarpFragmentA1 warp_frag_A1[2];
WarpFragmentB1 warp_frag_B1[2];
this->warp_tile_iterator_B1_.set_kgroup_index(0);
warp_tile_iterator_A1_.load(warp_frag_A1[0]);
this->warp_tile_iterator_B1_.load(warp_frag_B1[0]);
++warp_tile_iterator_A1_;
++this->warp_tile_iterator_B1_;
Operator1 warp_mma1;
smem_write_stage_idx = 1;
int gemm_k_iterations_1 = Shape0::kN / Shape1::kK;
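// Note: in this back-to-back fusion the first GEMM's output tile (extent Shape0::kN
// along N) is consumed as the A operand of the second GEMM, so the second mainloop
// walks Shape0::kN in chunks of Shape1::kK.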
//
// Mainloop
//
// Note: The main loop does not support Base::kWarpGemmIterations == 2.
CUTLASS_PRAGMA_UNROLL
for (; gemm_k_iterations_1 > 0; --gemm_k_iterations_1) {
//
// Loop over GEMM K dimension
//
CUTLASS_PRAGMA_UNROLL
for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1; ++warp_mma_k) {
// Load warp-level tiles from shared memory, wrapping to the k offset if this is the last group.
if (warp_mma_k == Base::kWarpGemmIterations1 - 1) {
// Write fragments to shared memory
this->smem_iterator_B1_.store(transform_B1(tb_frag_B1));
__syncthreads();
++this->smem_iterator_B1_;
// Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
if (smem_write_stage_idx == 1) {
this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
}
else {
this->warp_tile_iterator_B1_.add_tile_offset(
{-Base::kStages * Policy1::kPartitionsK *
Base::kWarpGemmIterations1,
0});
}
smem_write_stage_idx ^= 1;
}
this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
// skip warp tile loading for the last kgroup
if(gemm_k_iterations_1 > 1 || warp_mma_k < Base::kWarpGemmIterations1 - 1)
warp_tile_iterator_A1_.load(warp_frag_A1[(warp_mma_k + 1) % 2]);
this->warp_tile_iterator_B1_.load(warp_frag_B1[(warp_mma_k + 1) % 2]);
++warp_tile_iterator_A1_;
++this->warp_tile_iterator_B1_;
if (warp_mma_k == 0) {
iterator_B1.load(tb_frag_B1);
++iterator_B1;
}
warp_mma1(accum, warp_frag_A1[warp_mma_k % 2],
warp_frag_B1[warp_mma_k % 2], accum);
}
}
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace gemm
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
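The ping-pong indexing used above (loading warp_frag_X[(warp_mma_k + 1) % 2] while warp_frag_X[warp_mma_k % 2] feeds the warp-level MMA) is the essence of the pipelined mainloop. A minimal standalone sketch of that pattern, with hypothetical load_tile/mma stand-ins for the CUTLASS iterators and warp-level operator, is shown below; it illustrates the scheduling idea only and is not CUTLASS API.

#include <cstdio>

// Hypothetical stand-ins for the warp tile iterator and the warp-level MMA.
static float load_tile(int k) { return static_cast<float>(k + 1); }
static void mma(float &acc, float frag) { acc += frag * frag; }

int main() {
  const int kIterations = 8;
  float frag[2];            // two in-flight fragments (double buffer)
  float acc = 0.f;

  frag[0] = load_tile(0);   // prologue: stage the first tile before the loop

  for (int k = 0; k < kIterations; ++k) {
    // Prefetch the next tile into the buffer that is not being consumed.
    if (k + 1 < kIterations) {
      frag[(k + 1) % 2] = load_tile(k + 1);
    }
    // Consume the tile staged on the previous iteration (or in the prologue).
    mma(acc, frag[k % 2]);
  }
  std::printf("acc = %f\n", acc);
  return 0;
}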


@@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
@@ -180,8 +186,8 @@ class B2bMmaBase {
using SharedStorage0 = SharedStorage<Shape0, Policy0>;
using SharedStorage1 = SharedStorage<Shape1, Policy1>;
union B2bMmaSharedStorage {
SharedStorage0 sharedStorage0;
SharedStorage1 sharedStorage1;
SharedStorage0 shared_storage0;
SharedStorage1 shared_storage1;
};
@@ -197,7 +203,7 @@ class B2bMmaBase {
/// Iterator to load a warp-scoped tile of B0 operand from shared memory
typename Operator0::IteratorB warp_tile_iterator_B0_;
/// Iterator to load a warp-scoped tile of B0 operand from shared memory
/// Iterator to load a warp-scoped tile of B1 operand from shared memory
typename Operator1::IteratorB warp_tile_iterator_B1_;
public:
@@ -214,9 +220,9 @@ public:
///< ID of each thread within a warp
int lane_idx
):
warp_tile_iterator_A0_(shared_storage.sharedStorage0.operand_A_ref(), lane_idx),
warp_tile_iterator_B0_(shared_storage.sharedStorage0.operand_B_ref(), lane_idx),
warp_tile_iterator_B1_(shared_storage.sharedStorage1.operand_B_ref(), lane_idx) {
warp_tile_iterator_A0_(shared_storage.shared_storage0.operand_A_ref(), lane_idx),
warp_tile_iterator_B0_(shared_storage.shared_storage0.operand_B_ref(), lane_idx),
warp_tile_iterator_B1_(shared_storage.shared_storage1.operand_B_ref(), lane_idx) {
}
};


@@ -0,0 +1,179 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Template for a double-buffered threadblock-scoped GEMM kernel.
*/
#pragma once
#include "cutlass/aligned_buffer.h"
#include "cutlass/arch/memory.h"
#include "cutlass/array.h"
#include "cutlass/cutlass.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/matrix_shape.h"
#include "cutlass/numeric_types.h"
#include "threadblock/b2b_mma_base.h"
////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace gemm {
namespace threadblock {
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
/// Structure to compute the matrix product targeting CUDA cores and SIMT math
/// instructions.
template <
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape0_,
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape1_,
/// Policy describing tuning details (concept: MmaPolicy)
typename Policy0_,
/// Policy describing tuning details (concept: MmaPolicy)
typename Policy1_,
/// Shared Memory Accumulator Iterator
typename SmemAccumulatorIterator0_,
/// Number of stages,
int Stages,
/// Used for partial specialization
typename Enable = bool>
class B2bMmaBaseSmemAccumulator :
public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, Stages> {
public:
///< Base class
using Base = B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, Stages>;
///< Size of the Gemm problem - concept: gemm::GemmShape<>
using Shape0 = Shape0_;
using Shape1 = Shape1_;
///< Policy describing tuning details
using Policy0 = Policy0_;
using Policy1 = Policy1_;
using SmemAccumulatorIterator0 = SmemAccumulatorIterator0_;
//
// Nested structs
//
/// Shared storage object needed by accumulator
template<
typename Shape_,
typename Element_,
typename Layout_,
typename Padding_
>
class AccumulatorSharedStorage {
public:
//
// Type definitions
//
using Shape = Shape_;
using Element = Element_;
using Layout = Layout_;
using Padding = Padding_;
/// Tensor reference to the accumulator
using TensorRefAccum = TensorRef<Element, Layout>;
/// Shape of the accumulator matrix in shared memory
using ShapeAccum = MatrixShape<Shape::kM + Padding::kRow,
Shape::kN + Padding::kColumn>;
public:
//
// Data members
//
/// Buffer for accumulator
AlignedBuffer<Element, ShapeAccum::kCount> accum;
public:
//
// Methods
//
/// Returns a layout object for the Accum matrix
CUTLASS_DEVICE
static Layout LayoutAccum() {
return Layout::packed({ShapeAccum::kRow, ShapeAccum::kColumn});
}
/// Returns a TensorRef to the Accumulator
CUTLASS_HOST_DEVICE
TensorRefAccum accum_ref() {
return TensorRefAccum{accum.data(), LayoutAccum()};
}
};
using AccumulatorSharedStorage0 = AccumulatorSharedStorage<
Shape0, typename SmemAccumulatorIterator0::Element,
typename SmemAccumulatorIterator0::TensorLayout,
typename SmemAccumulatorIterator0::Padding>;
struct B2bMmaSharedStorage {
typename Base::B2bMmaSharedStorage b2b_mma_shared_storage;
AccumulatorSharedStorage0 accumulator_shared_storage0;
};
public:
/// Construct from tensor references
CUTLASS_DEVICE
B2bMmaBaseSmemAccumulator(
///< Shared storage needed for internal use by threadblock-scoped GEMM
B2bMmaSharedStorage &shared_storage,
///< ID within the threadblock
int thread_idx,
///< ID of warp
int warp_idx,
///< ID of each thread within a warp
int lane_idx
):
Base(shared_storage.b2b_mma_shared_storage, thread_idx, warp_idx, lane_idx) {
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace gemm
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
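The AccumulatorSharedStorage above sizes its buffer from the accumulator shape plus padding. A small host-side sketch of that sizing arithmetic, using illustrative values rather than any particular CUTLASS configuration, is:

#include <cstdio>

// Illustrative stand-ins for the accumulator tile's Shape, Padding and Element.
struct Shape   { static const int kM = 64; static const int kN = 64; };
struct Padding { static const int kRow = 0; static const int kColumn = 8; };
using Element = float;  // e.g. a 32-bit accumulator element

int main() {
  // ShapeAccum = MatrixShape<Shape::kM + Padding::kRow, Shape::kN + Padding::kColumn>
  const int rows    = Shape::kM + Padding::kRow;
  const int columns = Shape::kN + Padding::kColumn;
  const int count   = rows * columns;   // ShapeAccum::kCount
  std::printf("accumulator smem: %d x %d elements, %zu bytes\n",
              rows, columns, count * sizeof(Element));
  return 0;
}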


@@ -0,0 +1,885 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Template for a double-buffered threadblock-scoped GEMM kernel.
*/
#pragma once
#include "cutlass/aligned_buffer.h"
#include "cutlass/arch/memory.h"
#include "cutlass/array.h"
#include "cutlass/cutlass.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/matrix_shape.h"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
#include "threadblock/b2b_mma_base.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace gemm {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Structure to compute the matrix product targeting CUDA cores and SIMT math
/// instructions.
template <
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape0_,
/// Iterates over tiles of A operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator |
// MaskedTileIterator)
typename IteratorA0_,
/// Iterates over tiles of A operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorA0_,
/// Cache operation for operand A
cutlass::arch::CacheOperation::Kind CacheOpA0,
/// Iterates over tiles of B operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator |
// MaskedTileIterator)
typename IteratorB0_,
/// Iterates over tiles of B operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorB0_,
/// Cache operation for operand B
cutlass::arch::CacheOperation::Kind CacheOpB0,
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape1_,
/// Iterates over the intermediate accumulator tile
// (concept::MmaTensorOpFragmentIterator)
typename FragmentIteratorA1_,
/// Iterates over vectors of scale and bias vector in global memory
// (concept: VectorIterator)
typename IteratorAccumulatorScaleBias_,
/// WarpIterator to load Scale or Bias vector from threadblock fragment
typename FragmentIteratorA1ScaleBias_,
/// Iterates over tiles of B operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator |
// MaskedTileIterator)
typename IteratorB1_,
/// Iterates over tiles of B operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorB1_,
/// Cache operation for operand B
cutlass::arch::CacheOperation::Kind CacheOpB1,
/// Data type of accumulator matrix
typename ElementC_,
/// Data type of accumulator matrix
typename LayoutC_,
/// Output operator for 1st Gemm (concept: epilogue::thread::LinearCombinationClamp, etc.)
typename OutputOp_,
/// Policy describing tuning details (concept: MmaPolicy)
typename Policy0_,
/// Policy describing tuning details (concept: MmaPolicy)
typename Policy1_,
/// Number of stages,
int Stages,
/// Used for partial specialization
typename Enable = bool>
class B2bMmaMultistage :
public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, Stages> {
public:
///< Base class
using Base = B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, Stages>;
///< Size of the Gemm problem - concept: gemm::GemmShape<>
using Shape0 = Shape0_;
///< Iterates over tiles of A operand in global memory
using IteratorA0 = IteratorA0_;
///< Iterates over tiles of B operand in global memory
using IteratorB0 = IteratorB0_;
///< Policy describing tuning details
using Policy0 = Policy0_;
using SmemIteratorA0 = SmemIteratorA0_;
using SmemIteratorB0 = SmemIteratorB0_;
///< Size of the Gemm problem - concept: gemm::GemmShape<>
using Shape1 = Shape1_;
///< Iterates over intermediate accumulator tile
using FragmentIteratorA1 = FragmentIteratorA1_;
///< Iterates over tiles of the scale and bias vectors in global memory
using IteratorAccumulatorScaleBias = IteratorAccumulatorScaleBias_;
///< WarpIterator to load Scale or Bias vector from threadblock fragment
using FragmentIteratorA1ScaleBias = FragmentIteratorA1ScaleBias_;
///< Iterates over tiles of B operand in global memory
using IteratorB1 = IteratorB1_;
///< Policy describing tuning details
using Policy1 = Policy1_;
using SmemIteratorB1 = SmemIteratorB1_;
///< Data type of accumulator matrix
using ElementC = ElementC_;
///< Layout of accumulator matrix
using LayoutC = LayoutC_;
///< Epilogue after 1st Gemm
using OutputOp = OutputOp_;
static const bool PerChannelScale = (OutputOp::kScale ==
epilogue::thread::ScaleType::OnlyAlphaPerChannelScaling);
static cutlass::arch::CacheOperation::Kind const kCacheOpA0 = CacheOpA0;
static cutlass::arch::CacheOperation::Kind const kCacheOpB0 = CacheOpB0;
static cutlass::arch::CacheOperation::Kind const kCacheOpB1 = CacheOpB1;
//
// Dependent types
//
/// Fragment of accumulator tile
using FragmentC0 = typename Policy0::Operator::FragmentC;
/// Warp-level Mma
using Operator0 = typename Policy0::Operator;
/// Fragment of Scale and Bias loaded from global memory
using FragmentA1ScaleBias = typename IteratorAccumulatorScaleBias::Fragment;
/// Fragment of accumulator tile
using FragmentC1 = typename Policy1::Operator::FragmentC;
/// Warp-level Mma
using Operator1 = typename Policy1::Operator;
/// Minimum architecture is Sm80 to support cp.async
using ArchTag = arch::Sm80;
/// Complex transform on A operand
static ComplexTransform const kTransformA0 = Operator0::kTransformA;
/// Complex transform on B operand
static ComplexTransform const kTransformB0 = Operator0::kTransformB;
/// Complex transform on B operand
static ComplexTransform const kTransformB1 = Operator1::kTransformB;
/// Internal structure exposed for introspection.
struct Detail {
static_assert(Base::kWarpGemmIterations0 > 1,
"The pipelined structure requires at least two warp-level "
"GEMM operations.");
static_assert(Base::kWarpGemmIterations1 > 1,
"The pipelined structure requires at least two warp-level "
"GEMM operations.");
/// Number of cp.async instructions to load one stage of operand A
static int const TBLoadIterationsA0 =
IteratorA0::ThreadMap::Iterations::kCount;
/// Number of cp.async instructions to load one stage of operand B
static int const TBLoadIterationsB0 =
IteratorB0::ThreadMap::Iterations::kCount;
/// Number of cp.async instructions to load one stage of operand B
static int const TBLoadIterationsB1 =
IteratorB1::ThreadMap::Iterations::kCount;
/// Number of stages
static int const kStages = Stages;
/// Number of cp.async instructions to load one group of operand A
static int const kAccessesPerGroupA0 =
(TBLoadIterationsA0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0;
/// Number of cp.async instructions to load one group of operand B
static int const kAccessesPerGroupB0 =
(TBLoadIterationsB0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0;
/// Number of cp.async instructions to load one group of operand B
static int const kAccessesPerGroupB1 =
(TBLoadIterationsB1 + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1;
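// Note: each kAccessesPerGroup* value is a ceiling division, so the cp.async
// instructions of one stage are spread as evenly as possible across the
// warp-level MMA iterations of that stage.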
};
private:
using WarpLoadedFragmentA0 = typename Operator0::FragmentA;
using WarpLoadedFragmentB0 = typename Operator0::FragmentB;
/// Warp Fragment of operand A1 loaded from accumulator tile
using WarpLoadedFragmentA1 = typename FragmentIteratorA1::Fragment;
using WarpLoadedFragmentA1ScaleBias =
typename FragmentIteratorA1ScaleBias::Fragment;
using WarpLoadedFragmentB1 = typename Operator1::FragmentB;
using WarpTransformedFragmentA0 = typename Operator0::TransformedFragmentA;
using WarpTransformedFragmentB0 = typename Operator0::TransformedFragmentB;
using WarpTransformedFragmentA1 = typename Operator1::TransformedFragmentA;
using WarpTransformedFragmentB1 = typename Operator1::TransformedFragmentB;
private:
//
// Data members
//
/// Iterator to write threadblock-scoped tile of A operand to shared memory
SmemIteratorA0 smem_iterator_A0_;
/// Iterator to write threadblock-scoped tile of B operand to shared memory
SmemIteratorB0 smem_iterator_B0_;
/// Iterator to write threadblock-scoped tile of B operand to shared memory
SmemIteratorB1 smem_iterator_B1_;
public:
/// Construct from tensor references
CUTLASS_DEVICE
B2bMmaMultistage(
///< Shared storage needed for internal use by threadblock-scoped GEMM
typename Base::B2bMmaSharedStorage &shared_storage,
///< ID within the threadblock
int thread_idx,
///< ID of warp
int warp_idx,
///< ID of each thread within a warp
int lane_idx,
///< GEMM0 N is used for accumulator extent
int problem_size_0_n
):
Base(shared_storage, thread_idx, warp_idx, lane_idx),
smem_iterator_A0_(shared_storage.shared_storage0.operand_A_ref(), thread_idx),
smem_iterator_B0_(shared_storage.shared_storage0.operand_B_ref(), thread_idx),
smem_iterator_B1_(shared_storage.shared_storage1.operand_B_ref(), thread_idx)
{
// Compute warp location within threadblock tile by mapping the warp_id to
// three coordinates:
// _m: the warp's position within the threadblock along the M dimension
// _n: the warp's position within the threadblock along the N dimension
// _k: the warp's position within the threadblock along the K dimension
int warp_idx_mn = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN);
int warp_idx_k = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN);
int warp_idx_m = warp_idx_mn % Base::WarpCount0::kM;
int warp_idx_n = warp_idx_mn / Base::WarpCount0::kM;
// Add per-warp offsets in units of warp-level tiles
this->warp_tile_iterator_A0_.add_tile_offset(
{warp_idx_m, Base::kWarpGemmIterations0 * warp_idx_k});
this->warp_tile_iterator_B0_.add_tile_offset(
{Base::kWarpGemmIterations0 * warp_idx_k, warp_idx_n});
this->warp_tile_iterator_B1_.add_tile_offset(
{Base::kWarpGemmIterations1 * warp_idx_k, warp_idx_n});
}
CUTLASS_DEVICE
void copy_tiles_and_advance_0(IteratorA0 &iterator_A0, IteratorB0 &iterator_B0,
int group_start_A0 = 0, int group_start_B0 = 0) {
iterator_A0.set_iteration_index(group_start_A0 *
IteratorA0::kAccessesPerVector);
this->smem_iterator_A0_.set_iteration_index(group_start_A0);
// Load for operand A
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::kAccessesPerGroupA0; ++j) {
if (group_start_A0 + j < Detail::TBLoadIterationsA0) {
typename IteratorA0::AccessType *dst_ptr =
reinterpret_cast<typename IteratorA0::AccessType *>(
this->smem_iterator_A0_.get());
int const kSrcBytes = sizeof_bits<typename IteratorA0::Element>::value *
IteratorA0::ThreadMap::kElementsPerAccess /
IteratorA0::kAccessesPerVector / 8;
CUTLASS_PRAGMA_UNROLL
for (int v = 0; v < IteratorA0::kAccessesPerVector; ++v) {
auto gmem_ptr = iterator_A0.get();
cutlass::arch::cp_async<kSrcBytes, kCacheOpA0>(
dst_ptr + v, gmem_ptr, iterator_A0.valid());
++iterator_A0;
}
++this->smem_iterator_A0_;
}
}
iterator_B0.set_iteration_index(group_start_B0 *
IteratorB0::kAccessesPerVector);
this->smem_iterator_B0_.set_iteration_index(group_start_B0);
// Load for operand B
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::kAccessesPerGroupB0; ++j) {
if (group_start_B0 + j < Detail::TBLoadIterationsB0) {
typename IteratorB0::AccessType *dst_ptr =
reinterpret_cast<typename IteratorB0::AccessType *>(
this->smem_iterator_B0_.get());
int const kSrcBytes = sizeof_bits<typename IteratorB0::Element>::value *
IteratorB0::ThreadMap::kElementsPerAccess /
IteratorB0::kAccessesPerVector / 8;
CUTLASS_PRAGMA_UNROLL
for (int v = 0; v < IteratorB0::kAccessesPerVector; ++v) {
auto gmem_ptr = iterator_B0.get();
cutlass::arch::cp_async<kSrcBytes, kCacheOpB0>(
dst_ptr + v, gmem_ptr, iterator_B0.valid());
++iterator_B0;
}
++this->smem_iterator_B0_;
}
}
}
CUTLASS_DEVICE
void copy_tiles_and_advance_1(IteratorB1 &iterator_B1,
int group_start_B1 = 0) {
iterator_B1.set_iteration_index(group_start_B1 *
IteratorB1::kAccessesPerVector);
this->smem_iterator_B1_.set_iteration_index(group_start_B1);
// Load for operand B
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::kAccessesPerGroupB1; ++j) {
if (group_start_B1 + j < Detail::TBLoadIterationsB1) {
typename IteratorB1::AccessType *dst_ptr =
reinterpret_cast<typename IteratorB1::AccessType *>(
this->smem_iterator_B1_.get());
int const kSrcBytes = sizeof_bits<typename IteratorB1::Element>::value *
IteratorB1::ThreadMap::kElementsPerAccess /
IteratorB1::kAccessesPerVector / 8;
CUTLASS_PRAGMA_UNROLL
for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
auto gmem_ptr = iterator_B1.get();
cutlass::arch::cp_async<kSrcBytes, kCacheOpB1>(
dst_ptr + v, gmem_ptr, iterator_B1.valid());
++iterator_B1;
}
++this->smem_iterator_B1_;
}
}
}
/// Perform a threadblock-scoped matrix multiply-accumulate
CUTLASS_DEVICE
void operator()(
///< problem size of GEMM
int gemm_k_iterations_0,
///< destination accumulator tile
FragmentC1 &accum,
///< iterator over A0 operand in global memory
IteratorA0 iterator_A0,
///< iterator over B0 operand in global memory
IteratorB0 iterator_B0,
///< iterator over A1 operand scale vector in global memory
IteratorAccumulatorScaleBias iterator_A1_scale,
///< iterator over A1 operand bias vector in global memory
IteratorAccumulatorScaleBias iterator_A1_bias,
///< iterator over B1 operand in global memory
IteratorB1 iterator_B1,
///< initial value of accumulator
FragmentC0 const &src_accum,
///< epilogue operation after 1st Gemm
OutputOp output_op_0)
{
//
// Prologue
//
// Issue several complete stages
CUTLASS_PRAGMA_UNROLL
for (int stage = 0; stage < Base::kStages - 1;
++stage, --gemm_k_iterations_0) {
iterator_A0.clear_mask(gemm_k_iterations_0 == 0);
iterator_B0.clear_mask(gemm_k_iterations_0 == 0);
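// Note: once gemm_k_iterations_0 reaches zero the masks are cleared, so any
// remaining prologue stages issue zero-fill copies instead of out-of-bounds loads.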
iterator_A0.set_iteration_index(0);
this->smem_iterator_A0_.set_iteration_index(0);
// Load for operand A
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::TBLoadIterationsA0; ++j) {
typename IteratorA0::AccessType *dst_ptr =
reinterpret_cast<typename IteratorA0::AccessType *>(
this->smem_iterator_A0_.get());
CUTLASS_PRAGMA_UNROLL
for (int v = 0; v < IteratorA0::kAccessesPerVector; ++v) {
int const kSrcBytes =
sizeof_bits<typename IteratorA0::Element>::value *
IteratorA0::ThreadMap::kElementsPerAccess /
IteratorA0::kAccessesPerVector / 8;
int src_bytes = (iterator_A0.valid() ? kSrcBytes : 0);
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA0>(
dst_ptr + v, iterator_A0.get(), iterator_A0.valid());
++iterator_A0;
}
++this->smem_iterator_A0_;
}
iterator_B0.set_iteration_index(0);
this->smem_iterator_B0_.set_iteration_index(0);
// Load for operand B
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::TBLoadIterationsB0; ++j) {
typename IteratorB0::AccessType *dst_ptr =
reinterpret_cast<typename IteratorB0::AccessType *>(
this->smem_iterator_B0_.get());
CUTLASS_PRAGMA_UNROLL
for (int v = 0; v < IteratorB0::kAccessesPerVector; ++v) {
int const kSrcBytes =
sizeof_bits<typename IteratorB0::Element>::value *
IteratorB0::ThreadMap::kElementsPerAccess /
IteratorB0::kAccessesPerVector / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB0>(
dst_ptr + v, iterator_B0.get(), iterator_B0.valid());
++iterator_B0;
}
++this->smem_iterator_B0_;
}
// Move to the next stage
iterator_A0.add_tile_offset({0, 1});
iterator_B0.add_tile_offset({1, 0});
this->smem_iterator_A0_.add_tile_offset({0, 1});
this->smem_iterator_B0_.add_tile_offset({1, 0});
// Defines the boundary of a stage of cp.async.
cutlass::arch::cp_async_fence();
}
// Perform accumulation in the 'd' output operand
FragmentC0 accum0 = src_accum;
// DEPBAR+SYNC
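// Note: cp_async_wait<kStages - 2> blocks until at most kStages - 2 cp.async stages
// remain in flight, so the oldest committed stage has landed in shared memory; the
// __syncthreads() that follows makes it visible to every warp before the first
// warp-level loads.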
cutlass::arch::cp_async_wait<Base::kStages - 2>();
__syncthreads();
// Pair of fragments used to overlap shared memory loads and math
// instructions
WarpLoadedFragmentA0 warp_loaded_frag_A0[2];
WarpLoadedFragmentB0 warp_loaded_frag_B0[2];
WarpTransformedFragmentA0 warp_transformed_frag_A0[2];
WarpTransformedFragmentB0 warp_transformed_frag_B0[2];
Operator0 warp_mma0;
this->warp_tile_iterator_A0_.set_kgroup_index(0);
this->warp_tile_iterator_B0_.set_kgroup_index(0);
this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[0]);
this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[0]);
++this->warp_tile_iterator_A0_;
++this->warp_tile_iterator_B0_;
iterator_A0.clear_mask(gemm_k_iterations_0 == 0);
iterator_B0.clear_mask(gemm_k_iterations_0 == 0);
int smem_write_stage_idx = Base::kStages - 1;
int smem_read_stage_idx = 0;
warp_mma0.transform(warp_transformed_frag_A0[0], warp_transformed_frag_B0[0],
warp_loaded_frag_A0[0], warp_loaded_frag_B0[0]);
//
// Mainloop
//
CUTLASS_GEMM_LOOP
for (; gemm_k_iterations_0 > (-Base::kStages + 1);) {
//
// Loop over GEMM K dimension
//
// Computes a warp-level GEMM on data held in shared memory
// Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
CUTLASS_PRAGMA_UNROLL
for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0;
++warp_mma_k) {
// Load warp-level tiles from shared memory, wrapping to the k offset if
// this is the last group.
this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[(warp_mma_k + 1) % 2]);
this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
++this->warp_tile_iterator_A0_;
++this->warp_tile_iterator_B0_;
if (warp_mma_k > 0)
warp_mma0.transform(warp_transformed_frag_A0[warp_mma_k % 2],
warp_transformed_frag_B0[warp_mma_k % 2],
warp_loaded_frag_A0[warp_mma_k % 2],
warp_loaded_frag_B0[warp_mma_k % 2]);
warp_mma0(
accum0,
warp_transformed_frag_A0[warp_mma_k % 2],
warp_transformed_frag_B0[warp_mma_k % 2],
accum0
);
// Issue global->shared copies for this stage
if (warp_mma_k < Base::kWarpGemmIterations0 - 1) {
int group_start_iteration_A0, group_start_iteration_B0;
group_start_iteration_A0 = warp_mma_k * Detail::kAccessesPerGroupA0;
group_start_iteration_B0 = warp_mma_k * Detail::kAccessesPerGroupB0;
copy_tiles_and_advance_0(iterator_A0, iterator_B0, group_start_iteration_A0,
group_start_iteration_B0);
}
if (warp_mma_k + 2 == Base::kWarpGemmIterations0) {
int group_start_iteration_A0, group_start_iteration_B0;
group_start_iteration_A0 =
(warp_mma_k + 1) * Detail::kAccessesPerGroupA0;
group_start_iteration_B0 =
(warp_mma_k + 1) * Detail::kAccessesPerGroupB0;
copy_tiles_and_advance_0(iterator_A0, iterator_B0, group_start_iteration_A0,
group_start_iteration_B0);
// Inserts a memory fence between stages of cp.async instructions.
cutlass::arch::cp_async_fence();
// Waits until at most kStages-2 cp.async stages remain in flight.
arch::cp_async_wait<Base::kStages - 2>();
__syncthreads();
// Move to the next stage
iterator_A0.add_tile_offset({0, 1});
iterator_B0.add_tile_offset({1, 0});
this->smem_iterator_A0_.add_tile_offset({0, 1});
this->smem_iterator_B0_.add_tile_offset({1, 0});
// Add negative offsets to return iterators to the 'start' of the
// circular buffer in shared memory
if (smem_write_stage_idx == (Base::kStages - 1)) {
this->smem_iterator_A0_.add_tile_offset({0, -Base::kStages});
this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0});
smem_write_stage_idx = 0;
} else {
++smem_write_stage_idx;
}
if (smem_read_stage_idx == (Base::kStages - 1)) {
this->warp_tile_iterator_A0_.add_tile_offset(
{0, -Base::kStages * Policy0::kPartitionsK *
Base::kWarpGemmIterations0});
this->warp_tile_iterator_B0_.add_tile_offset(
{-Base::kStages * Policy0::kPartitionsK *
Base::kWarpGemmIterations0,
0});
smem_read_stage_idx = 0;
} else {
++smem_read_stage_idx;
}
--gemm_k_iterations_0;
iterator_A0.clear_mask(gemm_k_iterations_0 == 0);
iterator_B0.clear_mask(gemm_k_iterations_0 == 0);
}
// Do any conversions feeding the first stage at the end of the loop so
// we can start right away on mma instructions
if (warp_mma_k + 1 == Base::kWarpGemmIterations0)
warp_mma0.transform(warp_transformed_frag_A0[(warp_mma_k + 1) % 2],
warp_transformed_frag_B0[(warp_mma_k + 1) % 2],
warp_loaded_frag_A0[(warp_mma_k + 1) % 2],
warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
}
}
// 2nd Gemm
/// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile
FragmentIteratorA1 warp_tile_iterator_A1_(accum0);
FragmentA1ScaleBias tb_frag_A1_scale;
FragmentA1ScaleBias tb_frag_A1_bias;
FragmentIteratorA1ScaleBias warp_tile_iterator_A1_scale_(tb_frag_A1_scale);
FragmentIteratorA1ScaleBias warp_tile_iterator_A1_bias_(tb_frag_A1_bias);
if(PerChannelScale) {
tb_frag_A1_scale.clear();
iterator_A1_scale.load(tb_frag_A1_scale);
++iterator_A1_scale;
}
tb_frag_A1_bias.clear();
iterator_A1_bias.load(tb_frag_A1_bias);
++iterator_A1_bias;
//
// Prologue
//
int gemm_k_iterations_1 = (FragmentIteratorA1::Policy::kIterations + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1;
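// Note: the second GEMM's k extent comes from the accumulator fragment iterator, so
// gemm_k_iterations_1 is the ceiling of FragmentIteratorA1::Policy::kIterations over
// Base::kWarpGemmIterations1.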
// Issue several complete stages
CUTLASS_PRAGMA_UNROLL
for (int stage = 0; stage < Base::kStages - 1;
++stage, --gemm_k_iterations_1) {
iterator_B1.clear_mask(gemm_k_iterations_1 == 0);
iterator_B1.set_iteration_index(0);
this->smem_iterator_B1_.set_iteration_index(0);
// Load for operand B
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::TBLoadIterationsB1; ++j) {
typename IteratorB1::AccessType *dst_ptr =
reinterpret_cast<typename IteratorB1::AccessType *>(
this->smem_iterator_B1_.get());
CUTLASS_PRAGMA_UNROLL
for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
int const kSrcBytes =
sizeof_bits<typename IteratorB1::Element>::value *
IteratorB1::ThreadMap::kElementsPerAccess /
IteratorB1::kAccessesPerVector / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
dst_ptr + v, iterator_B1.get(), iterator_B1.valid());
++iterator_B1;
}
++this->smem_iterator_B1_;
}
// Move to the next stage
iterator_B1.add_tile_offset({1, 0});
this->smem_iterator_B1_.add_tile_offset({1, 0});
// Defines the boundary of a stage of cp.async.
cutlass::arch::cp_async_fence();
}
// DEPBAR+SYNC
cutlass::arch::cp_async_wait<Base::kStages - 2>();
__syncthreads();
// Pair of fragments used to overlap shared memory loads and math
// instructions
WarpLoadedFragmentA1 warp_loaded_frag_A1[2];
WarpLoadedFragmentA1ScaleBias warp_loaded_frag_A1_scale[2];
WarpLoadedFragmentA1ScaleBias warp_loaded_frag_A1_bias[2];
WarpLoadedFragmentB1 warp_loaded_frag_B1[2];
WarpTransformedFragmentA1 warp_transformed_frag_A1[2];
WarpTransformedFragmentB1 warp_transformed_frag_B1[2];
Operator1 warp_mma1;
if(PerChannelScale) {
warp_tile_iterator_A1_scale_.load(warp_loaded_frag_A1_scale[0]);
++warp_tile_iterator_A1_scale_;
}
warp_tile_iterator_A1_bias_.load(warp_loaded_frag_A1_bias[0]);
++warp_tile_iterator_A1_bias_;
warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0],
warp_loaded_frag_A1_scale[0],
warp_loaded_frag_A1_bias[0],
output_op_0);
++warp_tile_iterator_A1_;
this->warp_tile_iterator_B1_.set_kgroup_index(0);
this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[0]);
++this->warp_tile_iterator_B1_;
iterator_B1.clear_mask(gemm_k_iterations_1 == 0);
smem_write_stage_idx = Base::kStages - 1;
smem_read_stage_idx = 0;
warp_mma1.transform(warp_transformed_frag_A1[0], warp_transformed_frag_B1[0],
warp_loaded_frag_A1[0], warp_loaded_frag_B1[0]);
//
// Mainloop
//
gemm_k_iterations_1 = (FragmentIteratorA1::Policy::kIterations + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1 - (Base::kStages - 1);
CUTLASS_PRAGMA_UNROLL
for (; gemm_k_iterations_1 > (-Base::kStages + 1); gemm_k_iterations_1--) {
//
// Loop over GEMM K dimension
//
// Computes a warp-level GEMM on data held in shared memory
// Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
CUTLASS_PRAGMA_UNROLL
for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1;
++warp_mma_k) {
// Load threadblock-level scale/bias vector from global memory
if (warp_mma_k + 1 == Base::kWarpGemmIterations1) {
if(PerChannelScale) {
tb_frag_A1_scale.clear();
iterator_A1_scale.load(tb_frag_A1_scale);
++iterator_A1_scale;
}
tb_frag_A1_bias.clear();
iterator_A1_bias.load(tb_frag_A1_bias);
++iterator_A1_bias;
}
// Load warp-level scale bias fragment from threadblock scale/bias vector
if(PerChannelScale) {
warp_tile_iterator_A1_scale_.load(warp_loaded_frag_A1_scale[(warp_mma_k + 1) % 2]);
++warp_tile_iterator_A1_scale_;
}
warp_tile_iterator_A1_bias_.load(warp_loaded_frag_A1_bias[(warp_mma_k + 1) % 2]);
++warp_tile_iterator_A1_bias_;
// Load warp-level tile from accumulator fragment
warp_tile_iterator_A1_.load(warp_loaded_frag_A1[(warp_mma_k + 1) % 2],
warp_loaded_frag_A1_scale[(warp_mma_k + 1) % 2],
warp_loaded_frag_A1_bias[(warp_mma_k + 1) % 2],
output_op_0);
++warp_tile_iterator_A1_;
// Load warp-level tiles from shared memory, wrapping to the k offset if
// this is the last group.
this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
++this->warp_tile_iterator_B1_;
if (warp_mma_k > 0)
warp_mma1.transform(warp_transformed_frag_A1[warp_mma_k % 2],
warp_transformed_frag_B1[warp_mma_k % 2],
warp_loaded_frag_A1[warp_mma_k % 2],
warp_loaded_frag_B1[warp_mma_k % 2]);
warp_mma1(
accum,
warp_transformed_frag_A1[warp_mma_k % 2],
warp_transformed_frag_B1[warp_mma_k % 2],
accum
);
// Issue global->shared copies for this stage
if (warp_mma_k < Base::kWarpGemmIterations1 - 1) {
int group_start_iteration_B1;
group_start_iteration_B1 = warp_mma_k * Detail::kAccessesPerGroupB1;
copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1);
}
if (warp_mma_k + 2 == Base::kWarpGemmIterations1) {
int group_start_iteration_B1;
group_start_iteration_B1 =
(warp_mma_k + 1) * Detail::kAccessesPerGroupB1;
copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1);
// Inserts a memory fence between stages of cp.async instructions.
cutlass::arch::cp_async_fence();
// Waits until at most kStages-2 cp.async stages remain in flight.
arch::cp_async_wait<Base::kStages - 2>();
__syncthreads();
// Move to the next stage
iterator_B1.add_tile_offset({1, 0});
this->smem_iterator_B1_.add_tile_offset({1, 0});
// Add negative offsets to return iterators to the 'start' of the
// circular buffer in shared memory
if (smem_write_stage_idx == (Base::kStages - 1)) {
this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
smem_write_stage_idx = 0;
} else {
++smem_write_stage_idx;
}
if (smem_read_stage_idx == (Base::kStages - 1)) {
this->warp_tile_iterator_B1_.add_tile_offset(
{-Base::kStages * Policy1::kPartitionsK *
Base::kWarpGemmIterations1,
0});
smem_read_stage_idx = 0;
} else {
++smem_read_stage_idx;
}
iterator_B1.clear_mask(gemm_k_iterations_1 == 1);
}
// Do any conversions feeding the first stage at the end of the loop so
// we can start right away on mma instructions
if (warp_mma_k + 1 == Base::kWarpGemmIterations1)
warp_mma1.transform(warp_transformed_frag_A1[(warp_mma_k + 1) % 2],
warp_transformed_frag_B1[(warp_mma_k + 1) % 2],
warp_loaded_frag_A1[(warp_mma_k + 1) % 2],
warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
}
}
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace gemm
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
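The stage bookkeeping in the multistage mainloops above reduces to two counters that chase each other around a ring of Base::kStages shared memory buffers. A minimal sketch of that wrap-around logic, with the iterator tile-offset updates collapsed to plain integer stage indices, is:

#include <cstdio>

int main() {
  const int kStages = 3;                    // e.g. a 3-stage cp.async pipeline
  int smem_write_stage_idx = kStages - 1;   // the prologue already filled kStages - 1 stages
  int smem_read_stage_idx  = 0;

  for (int iter = 0; iter < 8; ++iter) {
    // Advance the write stage; wrapping corresponds to the negative tile offset
    // of -kStages applied to the shared memory write iterators in the real code.
    if (smem_write_stage_idx == kStages - 1) {
      smem_write_stage_idx = 0;
    } else {
      ++smem_write_stage_idx;
    }
    // Advance the read stage with the same wrap-around rule (the warp tile
    // iterators are rewound by -kStages * kPartitionsK * kWarpGemmIterations).
    if (smem_read_stage_idx == kStages - 1) {
      smem_read_stage_idx = 0;
    } else {
      ++smem_read_stage_idx;
    }
    std::printf("iter %d: write stage %d, read stage %d\n",
                iter, smem_write_stage_idx, smem_read_stage_idx);
  }
  return 0;
}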


@@ -0,0 +1,869 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Template for a double-buffered threadblock-scoped GEMM kernel.
*/
#pragma once
#include "cutlass/aligned_buffer.h"
#include "cutlass/arch/memory.h"
#include "cutlass/array.h"
#include "cutlass/cutlass.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/matrix_shape.h"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
#include "threadblock/b2b_mma_base_smem_accumulator.h"
#include "cutlass/epilogue/threadblock/epilogue_smem_accumulator.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace gemm {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Structure to compute the matrix product targeting CUDA cores and SIMT math
/// instructions.
template <
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape0_,
/// Iterates over tiles of A operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator |
// MaskedTileIterator)
typename IteratorA0_,
/// Iterates over tiles of A operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorA0_,
/// Cache operation for operand A
cutlass::arch::CacheOperation::Kind CacheOpA0,
/// Iterates over tiles of B operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator |
// MaskedTileIterator)
typename IteratorB0_,
/// Iterates over tiles of B operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorB0_,
/// Cache operation for operand B
cutlass::arch::CacheOperation::Kind CacheOpB0,
/// Iterates over vectors of scale and bias vector in global memory
// (concept: VectorIterator)
typename IteratorAccumulatorScaleBias_,
/// Iterates over accumulator tile
typename FragmentIteratorAccumulator_,
/// Iterates over accumulator tile in shared memory
typename SmemIteratorD0_,
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape1_,
/// Iterates over the intermediate accumulator tile in shared memory
typename WarpIteratorA1_,
/// Iterates over tiles of B operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator |
// MaskedTileIterator)
typename IteratorB1_,
/// Iterates over tiles of B operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorB1_,
/// Cache operation for operand B
cutlass::arch::CacheOperation::Kind CacheOpB1,
/// Data type of accumulator matrix
typename ElementC_,
/// Data type of accumulator matrix
typename LayoutC_,
/// Output operator for 1st Gemm (concept: epilogue::thread::LinearCombinationClamp, etc.)
typename OutputOp_,
/// Policy describing tuning details (concept: MmaPolicy)
typename Policy0_,
/// Policy describing tuning details (concept: MmaPolicy)
typename Policy1_,
/// Number of stages,
int Stages,
/// Used for partial specialization
typename Enable = bool>
class B2bMmaMultistageSmemAccumulator :
public gemm::threadblock::B2bMmaBaseSmemAccumulator<Shape0_, Shape1_, Policy0_, Policy1_, SmemIteratorD0_, Stages> {
public:
///< Base class
using Base = gemm::threadblock::B2bMmaBaseSmemAccumulator<Shape0_, Shape1_, Policy0_, Policy1_, SmemIteratorD0_, Stages>;
///< Size of the Gemm problem - concept: gemm::GemmShape<>
using Shape0 = Shape0_;
///< Iterates over tiles of A operand in global memory
using IteratorA0 = IteratorA0_;
///< Iterates over tiles of B operand in global memory
using IteratorB0 = IteratorB0_;
///< Iterates over tiles of the scale and bias vectors in global memory
using IteratorAccumulatorScaleBias = IteratorAccumulatorScaleBias_;
///< Policy describing tuning details
using Policy0 = Policy0_;
using SmemIteratorA0 = SmemIteratorA0_;
using SmemIteratorB0 = SmemIteratorB0_;
using SmemIteratorD0 = SmemIteratorD0_; ///< Iterates over accumulator tile in shared memory
using FragmentIteratorAccumulator = FragmentIteratorAccumulator_; ///< Iterates over accumulator tile
///< Size of the Gemm problem - concept: gemm::GemmShape<>
using Shape1 = Shape1_;
///< Iterates over tiles of B operand in global memory
using IteratorB1 = IteratorB1_;
///< Policy describing tuning details
using Policy1 = Policy1_;
using SmemIteratorB1 = SmemIteratorB1_;
using WarpIteratorA1 = WarpIteratorA1_; ///< Iterates over the intermediate accumulator tile in shared memory
///< Data type of accumulator matrix
using ElementC = ElementC_;
///< Layout of accumulator matrix
using LayoutC = LayoutC_;
///< Epilogue after 1st Gemm
using OutputOp = OutputOp_;
static cutlass::arch::CacheOperation::Kind const kCacheOpA0 = CacheOpA0;
static cutlass::arch::CacheOperation::Kind const kCacheOpB0 = CacheOpB0;
static cutlass::arch::CacheOperation::Kind const kCacheOpB1 = CacheOpB1;
//
// Dependent types
//
/// Fragment of accumulator tile
using FragmentC0 = typename Policy0::Operator::FragmentC;
/// Warp-level Mma
using Operator0 = typename Policy0::Operator;
/// Fragment of Scale and Bias loaded from global memory
using FragmentA1ScaleBias = typename IteratorAccumulatorScaleBias::Fragment;
/// Fragment of accumulator tile
using FragmentC1 = typename Policy1::Operator::FragmentC;
/// Warp-level Mma
using Operator1 = typename Policy1::Operator;
/// Epilogue in shared memory
using Epilogue0 = epilogue::threadblock::EpilogueSmemAccumulator<
SmemIteratorD0, ///< SmemTileIterator
FragmentIteratorAccumulator, ///< AccumulatorFragmentIterator
IteratorAccumulatorScaleBias, ///< ScaleBiasIterator
OutputOp>; ///< Output operator
/// Minimum architecture is Sm80 to support cp.async
using ArchTag = arch::Sm80;
/// Complex transform on A operand
static ComplexTransform const kTransformA0 = Operator0::kTransformA;
/// Complex transform on B operand
static ComplexTransform const kTransformB0 = Operator0::kTransformB;
/// Complex transform on B operand
static ComplexTransform const kTransformB1 = Operator1::kTransformB;
/// Internal structure exposed for introspection.
struct Detail {
static_assert(Base::kWarpGemmIterations0 > 1,
"The pipelined structure requires at least two warp-level "
"GEMM operations.");
static_assert(Base::kWarpGemmIterations1 > 1,
"The pipelined structure requires at least two warp-level "
"GEMM operations.");
/// Number of cp.async instructions to load one stage of operand A
static int const TBLoadIterationsA0 =
IteratorA0::ThreadMap::Iterations::kCount;
/// Number of cp.async instructions to load one stage of operand B
static int const TBLoadIterationsB0 =
IteratorB0::ThreadMap::Iterations::kCount;
/// Number of cp.async instructions to load one stage of operand B
static int const TBLoadIterationsB1 =
IteratorB1::ThreadMap::Iterations::kCount;
/// Number of stages
static int const kStages = Stages;
/// Number of cp.async instructions to load one group of operand A
static int const kAccessesPerGroupA0 =
(TBLoadIterationsA0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0;
/// Number of cp.async instructions to load one group of operand B
static int const kAccessesPerGroupB0 =
(TBLoadIterationsB0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0;
/// Number of cp.async instructions to load one group of operand B
static int const kAccessesPerGroupB1 =
(TBLoadIterationsB1 + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1;
};
private:
using WarpLoadedFragmentA0 = typename Operator0::FragmentA;
using WarpLoadedFragmentB0 = typename Operator0::FragmentB;
using WarpLoadedFragmentA1 = typename Operator1::FragmentA;
using WarpLoadedFragmentB1 = typename Operator1::FragmentB;
using WarpTransformedFragmentA0 = typename Operator0::TransformedFragmentA;
using WarpTransformedFragmentB0 = typename Operator0::TransformedFragmentB;
using WarpTransformedFragmentA1 = typename Operator1::TransformedFragmentA;
using WarpTransformedFragmentB1 = typename Operator1::TransformedFragmentB;
private:
//
// Data members
//
/// Iterator to write threadblock-scoped tile of A operand to shared memory
SmemIteratorA0 smem_iterator_A0_;
/// Iterator to write threadblock-scoped tile of B operand to shared memory
SmemIteratorB0 smem_iterator_B0_;
/// Shared Memory Iterator to store accumulator tile
SmemIteratorD0 smem_iterator_D0_;
/// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile
WarpIteratorA1 warp_tile_iterator_A1_;
/// Iterator to write threadblock-scoped tile of B operand to shared memory
SmemIteratorB1 smem_iterator_B1_;
public:
/// Construct from tensor references
CUTLASS_DEVICE
B2bMmaMultistageSmemAccumulator(
///< Shared storage needed for internal use by threadblock-scoped GEMM
typename Base::B2bMmaSharedStorage &shared_storage,
///< ID within the threadblock
int thread_idx,
///< ID of warp
int warp_idx,
///< ID of each thread within a warp
int lane_idx,
///< GEMM0 N is used for accumulator extent
int problem_size_0_n
):
Base(shared_storage, thread_idx, warp_idx, lane_idx),
smem_iterator_A0_(shared_storage.b2b_mma_shared_storage.shared_storage0.operand_A_ref(), thread_idx),
smem_iterator_B0_(shared_storage.b2b_mma_shared_storage.shared_storage0.operand_B_ref(), thread_idx),
smem_iterator_D0_(shared_storage.accumulator_shared_storage0.accum_ref(), lane_idx),
warp_tile_iterator_A1_(shared_storage.accumulator_shared_storage0.accum_ref(), {Base::WarpGemm1::kM, problem_size_0_n}, lane_idx ),
smem_iterator_B1_(shared_storage.b2b_mma_shared_storage.shared_storage1.operand_B_ref(), thread_idx)
{
// Compute warp location within threadblock tile by mapping the warp_id to
// three coordinates:
// _m: the warp's position within the threadblock along the M dimension
// _n: the warp's position within the threadblock along the N dimension
// _k: the warp's position within the threadblock along the K dimension
int warp_idx_mn_0 = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN);
int warp_idx_k_0 = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN);
int warp_idx_m_0 = warp_idx_mn_0 % Base::WarpCount0::kM;
int warp_idx_n_0 = warp_idx_mn_0 / Base::WarpCount0::kM;
int warp_idx_mn_1 = warp_idx % (Base::WarpCount1::kM * Base::WarpCount1::kN);
int warp_idx_k_1 = warp_idx / (Base::WarpCount1::kM * Base::WarpCount1::kN);
int warp_idx_m_1 = warp_idx_mn_1 % Base::WarpCount1::kM;
int warp_idx_n_1 = warp_idx_mn_1 / Base::WarpCount1::kM;
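// Illustration with assumed warp counts (exposition only): if WarpCount0 is
// {kM = 2, kN = 2} and warp_idx = 5, then warp_idx_mn_0 = 5 % 4 = 1,
// warp_idx_k_0 = 5 / 4 = 1, warp_idx_m_0 = 1, warp_idx_n_0 = 0, i.e. this warp
// owns the (m = 1, n = 0) warp tile of the second k-partition.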
// Add per-warp offsets in units of warp-level tiles
this->warp_tile_iterator_A0_.add_tile_offset(
{warp_idx_m_0, Base::kWarpGemmIterations0 * warp_idx_k_0});
this->warp_tile_iterator_B0_.add_tile_offset(
{Base::kWarpGemmIterations0 * warp_idx_k_0, warp_idx_n_0});
warp_tile_iterator_A1_.add_tile_offset(
{warp_idx_m_1, Base::kWarpGemmIterations1 * warp_idx_k_1});
this->warp_tile_iterator_B1_.add_tile_offset(
{Base::kWarpGemmIterations1 * warp_idx_k_1, warp_idx_n_1});
// Add smem accumulator iterator warp offset
smem_iterator_D0_.add_tile_offset({ warp_idx_m_0 * SmemIteratorD0::TileIterations::kRow,
warp_idx_n_0 * SmemIteratorD0::TileIterations::kColumn});
}
CUTLASS_DEVICE
void copy_tiles_and_advance_0(IteratorA0 &iterator_A0, IteratorB0 &iterator_B0,
int group_start_A0 = 0, int group_start_B0 = 0) {
iterator_A0.set_iteration_index(group_start_A0 *
IteratorA0::kAccessesPerVector);
this->smem_iterator_A0_.set_iteration_index(group_start_A0);
// cp.async for operand A
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::kAccessesPerGroupA0; ++j) {
if (group_start_A0 + j < Detail::TBLoadIterationsA0) {
typename IteratorA0::AccessType *dst_ptr =
reinterpret_cast<typename IteratorA0::AccessType *>(
this->smem_iterator_A0_.get());
int const kSrcBytes = sizeof_bits<typename IteratorA0::Element>::value *
IteratorA0::ThreadMap::kElementsPerAccess /
IteratorA0::kAccessesPerVector / 8;
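// Example with assumed types (not fixed by this template): for 16-bit elements
// with kElementsPerAccess = 8 and kAccessesPerVector = 1, kSrcBytes =
// 16 * 8 / 1 / 8 = 16 bytes, the largest transfer a single cp.async supports.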
CUTLASS_PRAGMA_UNROLL
for (int v = 0; v < IteratorA0::kAccessesPerVector; ++v) {
auto gmem_ptr = iterator_A0.get();
cutlass::arch::cp_async<kSrcBytes, kCacheOpA0>(
dst_ptr + v, gmem_ptr, iterator_A0.valid());
++iterator_A0;
}
++this->smem_iterator_A0_;
}
}
iterator_B0.set_iteration_index(group_start_B0 *
IteratorB0::kAccessesPerVector);
this->smem_iterator_B0_.set_iteration_index(group_start_B0);
// cp.async for operand B
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::kAccessesPerGroupB0; ++j) {
if (group_start_B0 + j < Detail::TBLoadIterationsB0) {
typename IteratorB0::AccessType *dst_ptr =
reinterpret_cast<typename IteratorB0::AccessType *>(
this->smem_iterator_B0_.get());
int const kSrcBytes = sizeof_bits<typename IteratorB0::Element>::value *
IteratorB0::ThreadMap::kElementsPerAccess /
IteratorB0::kAccessesPerVector / 8;
CUTLASS_PRAGMA_UNROLL
for (int v = 0; v < IteratorB0::kAccessesPerVector; ++v) {
auto gmem_ptr = iterator_B0.get();
cutlass::arch::cp_async<kSrcBytes, kCacheOpB0>(
dst_ptr + v, gmem_ptr, iterator_B0.valid());
++iterator_B0;
}
++this->smem_iterator_B0_;
}
}
}
CUTLASS_DEVICE
void copy_tiles_and_advance_1(IteratorB1 &iterator_B1,
int group_start_B1 = 0) {
iterator_B1.set_iteration_index(group_start_B1 *
IteratorB1::kAccessesPerVector);
this->smem_iterator_B1_.set_iteration_index(group_start_B1);
// cp.async for operand B
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::kAccessesPerGroupB1; ++j) {
if (group_start_B1 + j < Detail::TBLoadIterationsB1) {
typename IteratorB1::AccessType *dst_ptr =
reinterpret_cast<typename IteratorB1::AccessType *>(
this->smem_iterator_B1_.get());
int const kSrcBytes = sizeof_bits<typename IteratorB1::Element>::value *
IteratorB1::ThreadMap::kElementsPerAccess /
IteratorB1::kAccessesPerVector / 8;
CUTLASS_PRAGMA_UNROLL
for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
auto gmem_ptr = iterator_B1.get();
cutlass::arch::cp_async<kSrcBytes, kCacheOpB1>(
dst_ptr + v, gmem_ptr, iterator_B1.valid());
++iterator_B1;
}
++this->smem_iterator_B1_;
}
}
}
/// Perform a threadblock-scoped matrix multiply-accumulate
CUTLASS_DEVICE
void operator()(
///< problem size of GEMM
int gemm_k_iterations_0,
///< destination accumulator tile
FragmentC1 &accum,
///< iterator over A0 operand in global memory
IteratorA0 iterator_A0,
///< iterator over B0 operand in global memory
IteratorB0 iterator_B0,
///< iterator over A1 operand scale vector in global memory
IteratorAccumulatorScaleBias iterator_accum0_scale,
///< iterator over A1 operand bias vector in global memory
IteratorAccumulatorScaleBias iterator_accum0_bias,
///< iterator over B1 operand in global memory
IteratorB1 iterator_B1,
///< initial value of accumulator
FragmentC0 const &src_accum,
///< epilogue operation after 1st Gemm
OutputOp output_op_0)
{
//
// Prologue
//
// Issue several complete stages
CUTLASS_PRAGMA_UNROLL
for (int stage = 0; stage < Base::kStages - 1;
++stage, --gemm_k_iterations_0) {
iterator_A0.clear_mask(gemm_k_iterations_0 == 0);
iterator_B0.clear_mask(gemm_k_iterations_0 == 0);
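// clear_mask(bool) disables the iterator's predicates when its argument is
// true, so once gemm_k_iterations_0 reaches zero the remaining prologue stages
// fall through to zero-fill copies rather than out-of-bounds global loads.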
iterator_A0.set_iteration_index(0);
this->smem_iterator_A0_.set_iteration_index(0);
// cp.async for operand A
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::TBLoadIterationsA0; ++j) {
typename IteratorA0::AccessType *dst_ptr =
reinterpret_cast<typename IteratorA0::AccessType *>(
this->smem_iterator_A0_.get());
CUTLASS_PRAGMA_UNROLL
for (int v = 0; v < IteratorA0::kAccessesPerVector; ++v) {
int const kSrcBytes =
sizeof_bits<typename IteratorA0::Element>::value *
IteratorA0::ThreadMap::kElementsPerAccess /
IteratorA0::kAccessesPerVector / 8;
int src_bytes = (iterator_A0.valid() ? kSrcBytes : 0);
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA0>(
dst_ptr + v, iterator_A0.get(), iterator_A0.valid());
++iterator_A0;
}
++this->smem_iterator_A0_;
}
iterator_B0.set_iteration_index(0);
this->smem_iterator_B0_.set_iteration_index(0);
// cp.async for operand B
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::TBLoadIterationsB0; ++j) {
typename IteratorB0::AccessType *dst_ptr =
reinterpret_cast<typename IteratorB0::AccessType *>(
this->smem_iterator_B0_.get());
CUTLASS_PRAGMA_UNROLL
for (int v = 0; v < IteratorB0::kAccessesPerVector; ++v) {
int const kSrcBytes =
sizeof_bits<typename IteratorB0::Element>::value *
IteratorB0::ThreadMap::kElementsPerAccess /
IteratorB0::kAccessesPerVector / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB0>(
dst_ptr + v, iterator_B0.get(), iterator_B0.valid());
++iterator_B0;
}
++this->smem_iterator_B0_;
}
// Move to the next stage
iterator_A0.add_tile_offset({0, 1});
iterator_B0.add_tile_offset({1, 0});
this->smem_iterator_A0_.add_tile_offset({0, 1});
this->smem_iterator_B0_.add_tile_offset({1, 0});
// Defines the boundary of a stage of cp.async.
cutlass::arch::cp_async_fence();
}
// Perform accumulation in the 'd' output operand
FragmentC0 accum0 = src_accum;
// DEPBAR+SYNC
cutlass::arch::cp_async_wait<Base::kStages - 2>();
__syncthreads();
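// cp_async_wait<kStages - 2>() returns once at most (kStages - 2) cp.async
// stage groups are still in flight, guaranteeing the oldest prologue stage has
// landed in shared memory; __syncthreads() then makes it visible to all warps.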
// Pair of fragments used to overlap shared memory loads and math
// instructions
WarpLoadedFragmentA0 warp_loaded_frag_A0[2];
WarpLoadedFragmentB0 warp_loaded_frag_B0[2];
WarpTransformedFragmentA0 warp_transformed_frag_A0[2];
WarpTransformedFragmentB0 warp_transformed_frag_B0[2];
Operator0 warp_mma0;
this->warp_tile_iterator_A0_.set_kgroup_index(0);
this->warp_tile_iterator_B0_.set_kgroup_index(0);
this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[0]);
this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[0]);
++this->warp_tile_iterator_A0_;
++this->warp_tile_iterator_B0_;
iterator_A0.clear_mask(gemm_k_iterations_0 == 0);
iterator_B0.clear_mask(gemm_k_iterations_0 == 0);
int smem_write_stage_idx = Base::kStages - 1;
int smem_read_stage_idx = 0;
warp_mma0.transform(warp_transformed_frag_A0[0], warp_transformed_frag_B0[0],
warp_loaded_frag_A0[0], warp_loaded_frag_B0[0]);
//
// Mainloop
//
CUTLASS_GEMM_LOOP
for (; gemm_k_iterations_0 > (-Base::kStages + 1);) {
//
// Loop over GEMM K dimension
//
// Computes a warp-level GEMM on data held in shared memory
// Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
CUTLASS_PRAGMA_UNROLL
for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0;
++warp_mma_k) {
// Load warp-level tiles from shared memory, wrapping to k offset if
// this is the last group as the case may be.
this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[(warp_mma_k + 1) % 2]);
this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
++this->warp_tile_iterator_A0_;
++this->warp_tile_iterator_B0_;
if (warp_mma_k > 0)
warp_mma0.transform(warp_transformed_frag_A0[warp_mma_k % 2],
warp_transformed_frag_B0[warp_mma_k % 2],
warp_loaded_frag_A0[warp_mma_k % 2],
warp_loaded_frag_B0[warp_mma_k % 2]);
warp_mma0(
accum0,
warp_transformed_frag_A0[warp_mma_k % 2],
warp_transformed_frag_B0[warp_mma_k % 2],
accum0
);
// Issue global->shared copies for this stage
if (warp_mma_k < Base::kWarpGemmIterations0 - 1) {
int group_start_iteration_A0, group_start_iteration_B0;
group_start_iteration_A0 = warp_mma_k * Detail::kAccessesPerGroupA0;
group_start_iteration_B0 = warp_mma_k * Detail::kAccessesPerGroupB0;
copy_tiles_and_advance_0(iterator_A0, iterator_B0, group_start_iteration_A0,
group_start_iteration_B0);
}
if (warp_mma_k + 2 == Base::kWarpGemmIterations0) {
int group_start_iteration_A0, group_start_iteration_B0;
group_start_iteration_A0 =
(warp_mma_k + 1) * Detail::kAccessesPerGroupA0;
group_start_iteration_B0 =
(warp_mma_k + 1) * Detail::kAccessesPerGroupB0;
copy_tiles_and_advance_0(iterator_A0, iterator_B0, group_start_iteration_A0,
group_start_iteration_B0);
// Inserts a memory fence between stages of cp.async instructions.
cutlass::arch::cp_async_fence();
// Waits until kStages-2 stages have committed.
arch::cp_async_wait<Base::kStages - 2>();
__syncthreads();
// Move to the next stage
iterator_A0.add_tile_offset({0, 1});
iterator_B0.add_tile_offset({1, 0});
this->smem_iterator_A0_.add_tile_offset({0, 1});
this->smem_iterator_B0_.add_tile_offset({1, 0});
// Add negative offsets to return iterators to the 'start' of the
// circular buffer in shared memory
if (smem_write_stage_idx == (Base::kStages - 1)) {
this->smem_iterator_A0_.add_tile_offset({0, -Base::kStages});
this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0});
smem_write_stage_idx = 0;
} else {
++smem_write_stage_idx;
}
if (smem_read_stage_idx == (Base::kStages - 1)) {
this->warp_tile_iterator_A0_.add_tile_offset(
{0, -Base::kStages * Policy0::kPartitionsK *
Base::kWarpGemmIterations0});
this->warp_tile_iterator_B0_.add_tile_offset(
{-Base::kStages * Policy0::kPartitionsK *
Base::kWarpGemmIterations0,
0});
smem_read_stage_idx = 0;
} else {
++smem_read_stage_idx;
}
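// Example with an assumed kStages = 3: the write stage index cycles
// 2 -> 0 -> 1 -> 2 and the read stage index cycles 0 -> 1 -> 2 -> 0, keeping
// the producer kStages - 1 stages ahead of the consumer inside the same
// shared-memory ring of kStages tile buffers.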
--gemm_k_iterations_0;
iterator_A0.clear_mask(gemm_k_iterations_0 == 0);
iterator_B0.clear_mask(gemm_k_iterations_0 == 0);
}
// Do any conversions feeding the first stage at the end of the loop so
// we can start right away on mma instructions
if (warp_mma_k + 1 == Base::kWarpGemmIterations0)
warp_mma0.transform(warp_transformed_frag_A0[(warp_mma_k + 1) % 2],
warp_transformed_frag_B0[(warp_mma_k + 1) % 2],
warp_loaded_frag_A0[(warp_mma_k + 1) % 2],
warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
}
}
/// Epilogue for the first Implicit Gemm
Epilogue0 epilogue0;
epilogue0(output_op_0, smem_iterator_D0_, accum0, iterator_accum0_scale, iterator_accum0_bias);
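// The first GEMM's accumulators, scaled and biased by output_op_0, now reside
// in shared memory via smem_iterator_D0_; the second GEMM below consumes them
// as its A1 operand through warp_tile_iterator_A1_ rather than round-tripping
// through global memory.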
__syncthreads();
// 2nd Gemm
//
// Prologue
//
int gemm_k_iterations_1 = Shape0::kN / Shape1::kK;
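// The second GEMM's K extent equals the first GEMM's N extent. With assumed
// shapes Shape0::kN = 128 and Shape1::kK = 32 (illustrative only), this yields
// gemm_k_iterations_1 = 4 threadblock-level K steps over the staged accumulator.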
// Issue several complete stages
CUTLASS_PRAGMA_UNROLL
for (int stage = 0; stage < Base::kStages - 1;
++stage, --gemm_k_iterations_1) {
iterator_B1.clear_mask(gemm_k_iterations_1 == 0);
iterator_B1.set_iteration_index(0);
this->smem_iterator_B1_.set_iteration_index(0);
// cp.async for operand B
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::TBLoadIterationsB1; ++j) {
typename IteratorB1::AccessType *dst_ptr =
reinterpret_cast<typename IteratorB1::AccessType *>(
this->smem_iterator_B1_.get());
CUTLASS_PRAGMA_UNROLL
for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
int const kSrcBytes =
sizeof_bits<typename IteratorB1::Element>::value *
IteratorB1::ThreadMap::kElementsPerAccess /
IteratorB1::kAccessesPerVector / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
dst_ptr + v, iterator_B1.get(), iterator_B1.valid());
++iterator_B1;
}
++this->smem_iterator_B1_;
}
// Move to the next stage
iterator_B1.add_tile_offset({1, 0});
this->smem_iterator_B1_.add_tile_offset({1, 0});
// Defines the boundary of a stage of cp.async.
cutlass::arch::cp_async_fence();
}
// DEPBAR+SYNC
cutlass::arch::cp_async_wait<Base::kStages - 2>();
__syncthreads();
// Pair of fragments used to overlap shared memory loads and math
// instructions
WarpLoadedFragmentA1 warp_loaded_frag_A1[2];
WarpLoadedFragmentB1 warp_loaded_frag_B1[2];
WarpTransformedFragmentA1 warp_transformed_frag_A1[2];
WarpTransformedFragmentB1 warp_transformed_frag_B1[2];
Operator1 warp_mma1;
warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0]);
++warp_tile_iterator_A1_;
this->warp_tile_iterator_B1_.set_kgroup_index(0);
this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[0]);
++this->warp_tile_iterator_B1_;
iterator_B1.clear_mask(gemm_k_iterations_1 == 0);
smem_write_stage_idx = Base::kStages - 1;
smem_read_stage_idx = 0;
warp_mma1.transform(warp_transformed_frag_A1[0], warp_transformed_frag_B1[0],
warp_loaded_frag_A1[0], warp_loaded_frag_B1[0]);
//
// Mainloop
//
CUTLASS_PRAGMA_UNROLL
for ( gemm_k_iterations_1 = Shape0::kN / Shape1::kK - (Base::kStages - 1);
gemm_k_iterations_1 > (-Base::kStages + 1); gemm_k_iterations_1--) {
//
// Loop over GEMM K dimension
//
// Computes a warp-level GEMM on data held in shared memory
// Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
CUTLASS_PRAGMA_UNROLL
for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1;
++warp_mma_k) {
// Load warp-level tile from accumulator fragment
// skip warp tile loading for the last kgroup
if(gemm_k_iterations_1 > (-Base::kStages + 2) || warp_mma_k < Base::kWarpGemmIterations1 - 1) {
warp_tile_iterator_A1_.load(warp_loaded_frag_A1[(warp_mma_k + 1) % 2]);
}
++warp_tile_iterator_A1_;
// Load warp-level tiles from shared memory, wrapping to k offset if
// this is the last group as the case may be.
this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
++this->warp_tile_iterator_B1_;
if (warp_mma_k > 0)
warp_mma1.transform(warp_transformed_frag_A1[warp_mma_k % 2],
warp_transformed_frag_B1[warp_mma_k % 2],
warp_loaded_frag_A1[warp_mma_k % 2],
warp_loaded_frag_B1[warp_mma_k % 2]);
warp_mma1(
accum,
warp_transformed_frag_A1[warp_mma_k % 2],
warp_transformed_frag_B1[warp_mma_k % 2],
accum
);
// Issue global->shared copies for this stage
if (warp_mma_k < Base::kWarpGemmIterations1 - 1) {
int group_start_iteration_B1;
group_start_iteration_B1 = warp_mma_k * Detail::kAccessesPerGroupB1;
copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1);
}
if (warp_mma_k + 2 == Base::kWarpGemmIterations1) {
int group_start_iteration_B1;
group_start_iteration_B1 =
(warp_mma_k + 1) * Detail::kAccessesPerGroupB1;
copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1);
// Inserts a memory fence between stages of cp.async instructions.
cutlass::arch::cp_async_fence();
// Waits until kStages-2 stages have committed.
arch::cp_async_wait<Base::kStages - 2>();
__syncthreads();
// Move to the next stage
iterator_B1.add_tile_offset({1, 0});
this->smem_iterator_B1_.add_tile_offset({1, 0});
// Add negative offsets to return iterators to the 'start' of the
// circular buffer in shared memory
if (smem_write_stage_idx == (Base::kStages - 1)) {
this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
smem_write_stage_idx = 0;
} else {
++smem_write_stage_idx;
}
if (smem_read_stage_idx == (Base::kStages - 1)) {
this->warp_tile_iterator_B1_.add_tile_offset(
{-Base::kStages * Policy1::kPartitionsK *
Base::kWarpGemmIterations1,
0});
smem_read_stage_idx = 0;
} else {
++smem_read_stage_idx;
}
iterator_B1.clear_mask(gemm_k_iterations_1 == 1);
}
// Do any conversions feeding the first stage at the end of the loop so
// we can start right away on mma instructions
if (warp_mma_k + 1 == Base::kWarpGemmIterations1)
warp_mma1.transform(warp_transformed_frag_A1[(warp_mma_k + 1) % 2],
warp_transformed_frag_B1[(warp_mma_k + 1) % 2],
warp_loaded_frag_A1[(warp_mma_k + 1) % 2],
warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
}
}
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace gemm
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
@ -48,10 +54,6 @@ namespace gemm {
namespace threadblock {
////////////////////////////////////////////////////////////////////////////////////////////////
template<int a>
struct chk_val {
static_assert(a==0, "check value");
};
/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
template <
@ -74,6 +76,11 @@ template <
/// Iterates over the intermediate accumulator tile
// (concept::MmaTensorOpFragmentIterator)
typename FragmentIteratorA1_,
/// Iterates over vectors of scale and bias vector in global memory
// (concept: VectorIterator)
typename IteratorAccumulatorScaleBias_,
/// FragmentIterator to load Scale or Bias vector from threadblock fragment
typename FragmentIteratorA1ScaleBias_,
/// Iterates over tiles of B operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
typename IteratorB1_,
@ -110,7 +117,8 @@ template <
/// Used for partial specialization
typename Enable = bool
>
class B2bMmaPipelined : public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, 2> {
class B2bMmaPipelined :
public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, 2> {
public:
///< Base class
@ -126,6 +134,9 @@ public:
using Shape1 = Shape1_; ///< Size of the Gemm problem - concept: gemm::GemmShape<>
using FragmentIteratorA1 = FragmentIteratorA1_; ///< Iterates over intermediate accumulator tile
using IteratorAccumulatorScaleBias = IteratorAccumulatorScaleBias_; ///< Iterates over tiles of the scale and bias vectors in global memory
using FragmentIteratorA1ScaleBias =
FragmentIteratorA1ScaleBias_; ///< WarpIterator to load Scale or Bias vector from the threadblock fragment
using IteratorB1 = IteratorB1_; ///< Iterates over tiles of B operand in global memory
using Policy1 = Policy1_; ///< Policy describing tuning details
@ -134,9 +145,12 @@ public:
using ElementC = ElementC_; ///< Data type of accumulator matrix
using LayoutC = LayoutC_; ///< Layout of accumulator matrix
using OutputOp = OutputOp_; ///< Epilogue after 1st Gemm
static const bool PerChannelScale = (OutputOp::kScale ==
epilogue::thread::ScaleType::OnlyAlphaPerChannelScaling);
using TransformA0 = TransformA0_;
using TransformB0 = TransformB0_;
using TransformB1 = TransformB1_;
@ -156,6 +170,9 @@ public:
/// Warp-level Mma
using Operator0 = typename Policy0::Operator;
/// Fragment of Scale and Bias loaded from global memory
using FragmentA1ScaleBias = typename IteratorAccumulatorScaleBias::Fragment;
/// Fragment of operand B loaded from global memory
using FragmentB1 = typename IteratorB1::Fragment;
@ -165,7 +182,7 @@ public:
/// Warp-level Mma
using Operator1 = typename Policy1::Operator;
/// Obtain the arch tag from the warp-level operator
using ArchTag = typename Policy0::Operator::ArchTag;
@ -178,7 +195,7 @@ public:
/// Complex transform on B1 operand
static ComplexTransform const kTransformB1 = Operator1::kTransformB;
// statically assert kStages for MmaPipelined is two (Double-buffered pipeline)
/// statically assert kStages for MmaPipelined is two (Double-buffered pipeline)
static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");
private:
@ -187,6 +204,9 @@ private:
using WarpFragmentB0 = typename Operator0::FragmentB;
/// Warp Fragment of operand A1 loaded from accumulator tile
using WarpFragmentA1 = typename FragmentIteratorA1::Fragment;
/// Warp Fragment of operand A1 scale and bias loaded from threadblock fragment
using WarpFragmentA1ScaleBias =
typename FragmentIteratorA1ScaleBias::Fragment;
using WarpFragmentB1 = typename Operator1::FragmentB;
protected:
@ -208,12 +228,13 @@ public:
typename Base::B2bMmaSharedStorage &shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM
int thread_idx, ///< ID within the threadblock
int warp_idx, ///< ID of warp
int lane_idx ///< ID of each thread within a warp
int lane_idx, ///< ID of each thread within a warp
int problem_size_0_n ///< GEMM0 N is used for accumulator extent
):
Base(shared_storage, thread_idx, warp_idx, lane_idx),
smem_iterator_A_(shared_storage.sharedStorage0.operand_A_ref(), thread_idx),
smem_iterator_B0_(shared_storage.sharedStorage0.operand_B_ref(), thread_idx),
smem_iterator_B1_(shared_storage.sharedStorage1.operand_B_ref(), thread_idx) {
smem_iterator_A_(shared_storage.shared_storage0.operand_A_ref(), thread_idx),
smem_iterator_B0_(shared_storage.shared_storage0.operand_B_ref(), thread_idx),
smem_iterator_B1_(shared_storage.shared_storage1.operand_B_ref(), thread_idx) {
// Compute warp location within threadblock tile by mapping the warp_id to three coordinates:
@ -241,16 +262,18 @@ public:
/// Perform a threadblock-scoped matrix multiply-accumulate
CUTLASS_DEVICE
void operator()(
int gemm_k_iterations_0, ///< number of iterations of the mainloop
FragmentC1 &accum, ///< destination accumulator tile
IteratorA0 iterator_A, ///< iterator over A operand in global memory
IteratorB0 iterator_B0, ///< iterator over B0 operand in global memory
IteratorB1 iterator_B1, ///< iterator over B1 operand in global memory
FragmentC0 const &src_accum, ///< source accumulator tile
OutputOp output_op_0, ///< epilogue operation after 1st Gemm
int gemm_k_iterations_0, ///< number of iterations of the mainloop
FragmentC1 &accum, ///< destination accumulator tile
IteratorA0 iterator_A, ///< iterator over A operand in global memory
IteratorB0 iterator_B0, ///< iterator over B0 operand in global memory
IteratorAccumulatorScaleBias iterator_A1_scale, ///< iterator over A1 operand scale vectors in global memory
IteratorAccumulatorScaleBias iterator_A1_bias, ///< iterator over A1 operand bias vectors in global memory
IteratorB1 iterator_B1, ///< iterator over B1 operand in global memory
FragmentC0 const &src_accum, ///< source accumulator tile
OutputOp output_op_0, ///< epilogue operation after 1st Gemm
TransformA0 transform_A0 = TransformA0(), ///< transformation applied to A0 fragment
TransformB0 transform_B0 = TransformB0(), ///< transformation applied to B0 fragment
TransformB1 transform_B1 = TransformB1()) { ///< transformation applied to B1 fragment
TransformB0 transform_B0 = TransformB0(), ///< transformation applied to B0 fragment
TransformB1 transform_B1 = TransformB1()) { ///< transformation applied to B1 fragment
//
// Prologue
@ -272,8 +295,8 @@ public:
++iterator_A;
++iterator_B0;
this->smem_iterator_A_.store(tb_frag_A);
this->smem_iterator_B0_.store(tb_frag_B0);
this->smem_iterator_A_.store(transform_A0(tb_frag_A));
this->smem_iterator_B0_.store(transform_B0(tb_frag_B0));
++this->smem_iterator_A_;
++this->smem_iterator_B0_;
@ -298,23 +321,19 @@ public:
int smem_write_stage_idx = 1;
// Avoid reading out of bounds
if (gemm_k_iterations_0 <= 1) {
iterator_A.clear_mask();
iterator_B0.clear_mask();
}
iterator_A.clear_mask(gemm_k_iterations_0 <= 1);
iterator_B0.clear_mask(gemm_k_iterations_0 <= 1);
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
// shared memory loads (which have the tightest latency requirement).
iterator_A.load(tb_frag_A);
//
// Mainloop
//
// Note: The main loop does not support Base::WarpGemmIterations == 2.
// Note: The main loop does not support Base::kWarpGemmIterations == 2.
CUTLASS_GEMM_LOOP
for (; gemm_k_iterations_0 > 0; --gemm_k_iterations_0) {
//
// Loop over GEMM K dimension
//
@ -328,19 +347,14 @@ public:
if (warp_mma_k == Base::kWarpGemmIterations0 - 1) {
// Write fragments to shared memory
this->smem_iterator_A_.store(tb_frag_A);
this->smem_iterator_A_.store(transform_A0(tb_frag_A));
this->smem_iterator_B0_.store(tb_frag_B0);
this->smem_iterator_B0_.store(transform_B0(tb_frag_B0));
__syncthreads();
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
// shared memory loads (which have the tightest latency requirement).
iterator_A.load(tb_frag_A);
++this->smem_iterator_B0_;
++this->smem_iterator_A_;
++this->smem_iterator_B0_;
// Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
if (smem_write_stage_idx == 1) {
@ -369,19 +383,18 @@ public:
if (warp_mma_k == 0) {
iterator_A.load(tb_frag_A);
iterator_B0.load(tb_frag_B0);
++iterator_A;
++iterator_B0;
// Avoid reading out of bounds if this was the last loop iteration
if (gemm_k_iterations_0 <= 2) {
iterator_A.clear_mask();
iterator_B0.clear_mask();
}
iterator_A.clear_mask(gemm_k_iterations_0 <= 2);
iterator_B0.clear_mask(gemm_k_iterations_0 <= 2);
}
warp_mma0(accum0, warp_frag_A0[warp_mma_k % 2], warp_frag_B0[warp_mma_k % 2], accum0);
warp_mma0(accum0, warp_frag_A0[warp_mma_k % 2],
warp_frag_B0[warp_mma_k % 2], accum0);
}
}
@ -394,32 +407,53 @@ public:
// Prologue
//
FragmentA1ScaleBias tb_frag_A1_scale;
FragmentA1ScaleBias tb_frag_A1_bias;
FragmentIteratorA1ScaleBias warp_tile_iterator_A1_scale_(tb_frag_A1_scale);
FragmentIteratorA1ScaleBias warp_tile_iterator_A1_bias_(tb_frag_A1_bias);
FragmentB1 tb_frag_B1;
if(PerChannelScale)
tb_frag_A1_scale.clear();
tb_frag_A1_bias.clear();
tb_frag_B1.clear();
// The last kblock is loaded in the prologue
if(PerChannelScale)
iterator_A1_scale.load(tb_frag_A1_scale);
iterator_A1_bias.load(tb_frag_A1_bias);
iterator_B1.load(tb_frag_B1);
if(PerChannelScale)
++iterator_A1_scale;
++iterator_A1_bias;
++iterator_B1;
this->smem_iterator_B1_.store(tb_frag_B1);
this->smem_iterator_B1_.store(transform_B1(tb_frag_B1));
++this->smem_iterator_B1_;
__syncthreads();
// Pair of fragments used to overlap shared memory loads and math instructions
WarpFragmentA1ScaleBias warp_frag_A1_scale[2];
WarpFragmentA1ScaleBias warp_frag_A1_bias[2];
WarpFragmentA1 warp_frag_A1[2];
WarpFragmentB1 warp_frag_B1[2];
//warp_tile_iterator_A1_.set_kgroup_index(0);
this->warp_tile_iterator_B1_.set_kgroup_index(0);
warp_tile_iterator_A1_.load(warp_frag_A1[0], output_op_0);
if(PerChannelScale)
warp_tile_iterator_A1_scale_.load(warp_frag_A1_scale[0]);
warp_tile_iterator_A1_bias_.load(warp_frag_A1_bias[0]);
warp_tile_iterator_A1_.load(warp_frag_A1[0], warp_frag_A1_scale[0],
warp_frag_A1_bias[0], output_op_0);
this->warp_tile_iterator_B1_.load(warp_frag_B1[0]);
++warp_tile_iterator_A1_;
if(PerChannelScale)
++warp_tile_iterator_A1_scale_;
++warp_tile_iterator_A1_bias_;
++this->warp_tile_iterator_B1_;
Operator1 warp_mma1;
@ -429,9 +463,7 @@ public:
int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1;
// Avoid reading out of bounds
if (gemm_k_iterations_1 <= 1) {
iterator_B1.clear_mask();
}
iterator_B1.clear_mask(gemm_k_iterations_1 <= 1);
//
// Mainloop
@ -454,15 +486,14 @@ public:
if (warp_mma_k == Base::kWarpGemmIterations1 - 1) {
// Write fragments to shared memory
this->smem_iterator_B1_.store(tb_frag_B1);
this->smem_iterator_B1_.store(transform_B1(tb_frag_B1));
__syncthreads();
++smem_iterator_B1_;
++this->smem_iterator_B1_;
// Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
if (smem_write_stage_idx == 1) {
smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
}
else {
this->warp_tile_iterator_B1_.add_tile_offset(
@ -472,14 +503,31 @@ public:
}
smem_write_stage_idx ^= 1;
if(PerChannelScale) {
tb_frag_A1_scale.clear();
iterator_A1_scale.load(tb_frag_A1_scale);
++iterator_A1_scale;
}
tb_frag_A1_bias.clear();
iterator_A1_bias.load(tb_frag_A1_bias);
++iterator_A1_bias;
}
this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
warp_tile_iterator_A1_.load(warp_frag_A1[(warp_mma_k + 1) % 2], output_op_0);
if(PerChannelScale)
warp_tile_iterator_A1_scale_.load(warp_frag_A1_scale[(warp_mma_k + 1) % 2]);
warp_tile_iterator_A1_bias_.load(warp_frag_A1_bias[(warp_mma_k + 1) % 2]);
warp_tile_iterator_A1_.load(warp_frag_A1[(warp_mma_k + 1) % 2],
warp_frag_A1_scale[(warp_mma_k + 1) % 2],
warp_frag_A1_bias[(warp_mma_k + 1) % 2],
output_op_0);
this->warp_tile_iterator_B1_.load(warp_frag_B1[(warp_mma_k + 1) % 2]);
if(PerChannelScale)
++warp_tile_iterator_A1_scale_;
++warp_tile_iterator_A1_bias_;
++warp_tile_iterator_A1_;
++this->warp_tile_iterator_B1_;
@ -488,17 +536,14 @@ public:
iterator_B1.load(tb_frag_B1);
++iterator_B1;
// Avoid reading out of bounds if this was the last loop iteration
if (gemm_k_iterations_1 <= 2) {
iterator_B1.clear_mask();
}
iterator_B1.clear_mask(gemm_k_iterations_1 <= 2);
}
warp_mma1(accum, warp_frag_A1[warp_mma_k % 2], warp_frag_B1[warp_mma_k % 2], accum);
warp_mma1(accum, warp_frag_A1[warp_mma_k % 2],
warp_frag_B1[warp_mma_k % 2], accum);
}
}
}
};

@ -0,0 +1,544 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Template for a double-buffered threadblock-scoped Back-to-back fused GEMM kernel.
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/array.h"
#include "cutlass/aligned_buffer.h"
#include "cutlass/numeric_conversion.h"
#include "cutlass/numeric_types.h"
#include "cutlass/matrix_shape.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
#include "threadblock/b2b_mma_base_smem_accumulator.h"
#include "cutlass/epilogue/threadblock/epilogue_smem_accumulator.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace gemm {
namespace threadblock {
////////////////////////////////////////////////////////////////////////////////////////////////
/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
template <
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape0_,
/// Iterates over tiles of A operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
typename IteratorA0_,
/// Iterates over tiles of A operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorA0_,
/// Iterates over tiles of B operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
typename IteratorB0_,
/// Iterates over tiles of B operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorB0_,
/// Iterates over vectors of scale and bias vector in global memory
// (concept: VectorIterator)
typename IteratorAccumulatorScaleBias_,
/// Iterates over accumulator tile
typename FragmentIteratorAccumulator_,
/// Iterates over accumulator tile in shared memory
typename SmemIteratorD0_,
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape1_,
/// Iterates over the intermediate accumulator tile in shared memory
typename WarpIteratorA1_,
/// Iterates over tiles of B operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
typename IteratorB1_,
/// Iterates over tiles of B operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorB1_,
/// Data type of accumulator matrix
typename ElementC_,
/// Data type of accumulator matrix
typename LayoutC_,
/// Output operator for 1st Gemm(concept: epilogue::thread::LinearCombinationClamp, etc...)
typename OutputOp_,
/// Policy describing tuning details (concept: MmaPipelinedPolicy)
typename Policy0_,
/// Policy describing tuning details (concept: MmaPipelinedPolicy)
typename Policy1_,
/// Transformation applied to A0 operand
typename TransformA0_ = NumericArrayConverter<
typename SmemIteratorA0_::Element,
typename IteratorA0_::Element,
IteratorA0_::Fragment::kElements>,
///
/// Transformation applied to B0 operand
typename TransformB0_ = NumericArrayConverter<
typename SmemIteratorB0_::Element,
typename IteratorB0_::Element,
IteratorB0_::Fragment::kElements>,
///
/// Transformation applied to B1 operand
typename TransformB1_ = NumericArrayConverter<
typename SmemIteratorB1_::Element,
typename IteratorB1_::Element,
IteratorB1_::Fragment::kElements>,
/// Used for partial specialization
typename Enable = bool
>
class B2bMmaPipelinedSmemAccumulator :
public B2bMmaBaseSmemAccumulator<Shape0_, Shape1_, Policy0_, Policy1_, SmemIteratorD0_, 2> {
public:
///< Base class
using Base = B2bMmaBaseSmemAccumulator<Shape0_, Shape1_, Policy0_, Policy1_, SmemIteratorD0_, 2>;
using Shape0 = Shape0_; ///< Size of the Gemm problem - concept: gemm::GemmShape<>
using IteratorA0 = IteratorA0_; ///< Iterates over tiles of A operand in global memory
using IteratorB0 = IteratorB0_; ///< Iterates over tiles of B operand in global memory
using IteratorAccumulatorScaleBias = IteratorAccumulatorScaleBias_; ///< Iterates over tiles of the scale and bias vectors in global memory
using Policy0 = Policy0_; ///< Policy0 describing tuning details
using SmemIteratorA0 = SmemIteratorA0_;
using SmemIteratorB0 = SmemIteratorB0_;
using SmemIteratorD0 = SmemIteratorD0_; ///< Iterates over accumulator tile in shared memory
using FragmentIteratorAccumulator = FragmentIteratorAccumulator_; ///< Iterates over accumulator tile
using Shape1 = Shape1_; ///< Size of the Gemm problem - concept: gemm::GemmShape<>
using IteratorB1 = IteratorB1_; ///< Iterates over tiles of B operand in global memory
using Policy1 = Policy1_; ///< Policy1 describing tuning details
using SmemIteratorB1 = SmemIteratorB1_;
using WarpIteratorA1 = WarpIteratorA1_; ///< Iterates over the intermediate accumulator tile in shared memory
using ElementC = ElementC_; ///< Data type of accumulator matrix
using LayoutC = LayoutC_; ///< Layout of accumulator matrix
using OutputOp = OutputOp_; ///< Epilogue after 1st Gemm
using TransformA0 = TransformA0_;
using TransformB0 = TransformB0_;
using TransformB1 = TransformB1_;
//
// Dependent types
//
/// Fragment of operand A loaded from global memory
using FragmentA0 = typename IteratorA0::Fragment;
/// Fragment of operand B loaded from global memory
using FragmentB0 = typename IteratorB0::Fragment;
/// Fragment of accumulator tile
using FragmentC0 = typename Policy0::Operator::FragmentC;
/// Warp-level Mma
using Operator0 = typename Policy0::Operator;
/// Fragment of operand B loaded from global memory
using FragmentB1 = typename IteratorB1::Fragment;
/// Fragment of accumulator tile
using FragmentC1 = typename Policy1::Operator::FragmentC;
/// Warp-level Mma
using Operator1 = typename Policy1::Operator;
/// Obtain the arch tag from the warp-level operator
using ArchTag = typename Policy0::Operator::ArchTag;
/// Complex transform on A0 operand
static ComplexTransform const kTransformA0 = Operator0::kTransformA;
/// Complex transform on B0 operand
static ComplexTransform const kTransformB0 = Operator0::kTransformB;
/// Complex transform on B1 operand
static ComplexTransform const kTransformB1 = Operator1::kTransformB;
/// statically assert kStages for MmaPipelined is two (Double-buffered pipeline)
static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");
/// Epilogue in shared memory
using Epilogue0 = epilogue::threadblock::EpilogueSmemAccumulator<
SmemIteratorD0, ///< SmemTileIterator
FragmentIteratorAccumulator, ///< AccumulatorFragmentIterator
IteratorAccumulatorScaleBias, ///< ScaleBiasIterator
OutputOp>; ///< Output operator
private:
using WarpFragmentA0 = typename Operator0::FragmentA;
using WarpFragmentB0 = typename Operator0::FragmentB;
using WarpFragmentA1 = typename Operator1::FragmentA;
using WarpFragmentB1 = typename Operator1::FragmentB;
protected:
/// Iterator to write threadblock-scoped tile of A operand to shared memory
SmemIteratorA0 smem_iterator_A_;
/// Iterator to write threadblock-scoped tile of B0 operand to shared memory
SmemIteratorB0 smem_iterator_B0_;
/// Shared Memory Iterator to store accumulator tile
SmemIteratorD0 smem_iterator_D0_;
/// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile
WarpIteratorA1 warp_tile_iterator_A1_;
/// Iterator to write threadblock-scoped tile of B1 operand to shared memory
SmemIteratorB1 smem_iterator_B1_;
public:
/// Construct from tensor references
CUTLASS_DEVICE
B2bMmaPipelinedSmemAccumulator(
typename Base::B2bMmaSharedStorage &shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM
int thread_idx, ///< ID within the threadblock
int warp_idx, ///< ID of warp
int lane_idx, ///< ID of each thread within a warp
int problem_size_0_n ///< GEMM0 N is used for accumulator extent
):
Base(shared_storage, thread_idx, warp_idx, lane_idx),
smem_iterator_A_(shared_storage.b2b_mma_shared_storage.shared_storage0.operand_A_ref(), thread_idx),
smem_iterator_B0_(shared_storage.b2b_mma_shared_storage.shared_storage0.operand_B_ref(), thread_idx),
smem_iterator_D0_(shared_storage.accumulator_shared_storage0.accum_ref(), lane_idx),
warp_tile_iterator_A1_(shared_storage.accumulator_shared_storage0.accum_ref(), {Base::WarpGemm1::kM, problem_size_0_n}, lane_idx),
smem_iterator_B1_(shared_storage.b2b_mma_shared_storage.shared_storage1.operand_B_ref(), thread_idx) {
// Compute warp location within threadblock tile by mapping the warp_id to
// three coordinates:
// _m: the warp's position within the threadblock along the M dimension
// _n: the warp's position within the threadblock along the N dimension
// _k: the warp's position within the threadblock along the K dimension
int warp_idx_mn_0 = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN);
int warp_idx_k_0 = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN);
int warp_idx_m_0 = warp_idx_mn_0 % Base::WarpCount0::kM;
int warp_idx_n_0 = warp_idx_mn_0 / Base::WarpCount0::kM;
int tile_offset_k_0 = Base::kWarpGemmIterations0 * warp_idx_k_0;
int warp_idx_mn_1 = warp_idx % (Base::WarpCount1::kM * Base::WarpCount1::kN);
int warp_idx_k_1 = warp_idx / (Base::WarpCount1::kM * Base::WarpCount1::kN);
int warp_idx_m_1 = warp_idx_mn_1 % Base::WarpCount1::kM;
int warp_idx_n_1 = warp_idx_mn_1 / Base::WarpCount1::kM;
int tile_offset_k_1 = Base::kWarpGemmIterations1 * warp_idx_k_1;
// Add per-warp offsets in units of warp-level tiles
this->warp_tile_iterator_A0_.add_tile_offset({warp_idx_m_0, tile_offset_k_0});
this->warp_tile_iterator_B0_.add_tile_offset({tile_offset_k_0, warp_idx_n_0});
warp_tile_iterator_A1_.add_tile_offset({warp_idx_m_1, tile_offset_k_1});
this->warp_tile_iterator_B1_.add_tile_offset({tile_offset_k_1, warp_idx_n_1});
// Add smem accumulator iterator warp offset
smem_iterator_D0_.add_tile_offset({ warp_idx_m_0 * SmemIteratorD0::TileIterations::kRow,
warp_idx_n_0 * SmemIteratorD0::TileIterations::kColumn});
}
/// Perform a threadblock-scoped matrix multiply-accumulate
CUTLASS_DEVICE
void operator()(
int gemm_k_iterations_0, ///< number of iterations of the mainloop
FragmentC1 &accum, ///< destination accumulator tile
IteratorA0 iterator_A, ///< iterator over A operand in global memory
IteratorB0 iterator_B0, ///< iterator over B0 operand in global memory
IteratorAccumulatorScaleBias iterator_accum0_scale, ///< iterator over D0 scale vector in global memory
IteratorAccumulatorScaleBias iterator_accum0_bias, ///< iterator over D0 bias vector in global memory
IteratorB1 iterator_B1, ///< iterator over B1 operand in global memory
FragmentC0 const &src_accum, ///< source accumulator tile
OutputOp output_op_0, ///< epilogue operation after 1st Gemm
TransformA0 transform_A0 = TransformA0(), ///< transformation applied to A0 fragment
TransformB0 transform_B0 = TransformB0(), ///< transformation applied to B0 fragment
TransformB1 transform_B1 = TransformB1()) { ///< transformation applied to B1 fragment
//
// Prologue
//
// Perform accumulation in the 'd' output operand
FragmentC0 accum0 = src_accum;
FragmentA0 tb_frag_A;
FragmentB0 tb_frag_B0;
tb_frag_A.clear();
tb_frag_B0.clear();
// The last kblock is loaded in the prologue
iterator_A.load(tb_frag_A);
iterator_B0.load(tb_frag_B0);
++iterator_A;
++iterator_B0;
this->smem_iterator_A_.store(transform_A0(tb_frag_A));
this->smem_iterator_B0_.store(transform_B0(tb_frag_B0));
++this->smem_iterator_A_;
++this->smem_iterator_B0_;
__syncthreads();
// Pair of fragments used to overlap shared memory loads and math instructions
WarpFragmentA0 warp_frag_A0[2];
WarpFragmentB0 warp_frag_B0[2];
this->warp_tile_iterator_A0_.set_kgroup_index(0);
this->warp_tile_iterator_B0_.set_kgroup_index(0);
this->warp_tile_iterator_A0_.load(warp_frag_A0[0]);
this->warp_tile_iterator_B0_.load(warp_frag_B0[0]);
++this->warp_tile_iterator_A0_;
++this->warp_tile_iterator_B0_;
Operator0 warp_mma0;
int smem_write_stage_idx = 1;
// Avoid reading out of bounds
iterator_A.clear_mask(gemm_k_iterations_0 <= 1);
iterator_B0.clear_mask(gemm_k_iterations_0 <= 1);
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
// shared memory loads (which have the tightest latency requirement).
//
// Mainloop
//
// Note: The main loop does not support Base::kWarpGemmIterations == 2.
CUTLASS_GEMM_LOOP
for (; gemm_k_iterations_0 > 0; --gemm_k_iterations_0) {
//
// Loop over GEMM K dimension
//
CUTLASS_PRAGMA_UNROLL
for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0; ++warp_mma_k) {
// Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
// as the case may be.
if (warp_mma_k == Base::kWarpGemmIterations0 - 1) {
// Write fragments to shared memory
this->smem_iterator_A_.store(transform_A0(tb_frag_A));
this->smem_iterator_B0_.store(transform_B0(tb_frag_B0));
__syncthreads();
++this->smem_iterator_A_;
++this->smem_iterator_B0_;
// Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
if (smem_write_stage_idx == 1) {
this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0});
}
else {
this->warp_tile_iterator_A0_.add_tile_offset(
{0, -Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0});
this->warp_tile_iterator_B0_.add_tile_offset(
{-Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0,
0});
}
smem_write_stage_idx ^= 1;
}
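// In this pipelined (kStages == 2) variant the stage index simply toggles
// 1 -> 0 -> 1 via XOR: one shared-memory buffer is written while the other is
// consumed, and the roles swap at the end of every threadblock K iteration.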
this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
this->warp_tile_iterator_A0_.load(warp_frag_A0[(warp_mma_k + 1) % 2]);
this->warp_tile_iterator_B0_.load(warp_frag_B0[(warp_mma_k + 1) % 2]);
++this->warp_tile_iterator_A0_;
++this->warp_tile_iterator_B0_;
if (warp_mma_k == 0) {
iterator_A.load(tb_frag_A);
iterator_B0.load(tb_frag_B0);
++iterator_A;
++iterator_B0;
// Avoid reading out of bounds if this was the last loop iteration
iterator_A.clear_mask(gemm_k_iterations_0 <= 2);
iterator_B0.clear_mask(gemm_k_iterations_0 <= 2);
}
warp_mma0(accum0, warp_frag_A0[warp_mma_k % 2],
warp_frag_B0[warp_mma_k % 2], accum0);
}
}
/// Epilogue for the first Implicit Gemm
Epilogue0 epilogue0;
epilogue0(output_op_0, smem_iterator_D0_, accum0, iterator_accum0_scale, iterator_accum0_bias);
__syncthreads();
// 2nd Gemm
//
// Prologue
//
FragmentB1 tb_frag_B1;
tb_frag_B1.clear();
// The last kblock is loaded in the prologue
iterator_B1.load(tb_frag_B1);
++iterator_B1;
this->smem_iterator_B1_.store(transform_B1(tb_frag_B1));
++this->smem_iterator_B1_;
__syncthreads();
// Pair of fragments used to overlap shared memory loads and math instructions
WarpFragmentA1 warp_frag_A1[2];
WarpFragmentB1 warp_frag_B1[2];
this->warp_tile_iterator_B1_.set_kgroup_index(0);
warp_tile_iterator_A1_.load(warp_frag_A1[0]);
this->warp_tile_iterator_B1_.load(warp_frag_B1[0]);
++warp_tile_iterator_A1_;
++this->warp_tile_iterator_B1_;
Operator1 warp_mma1;
smem_write_stage_idx = 1;
int gemm_k_iterations_1 = Shape0::kN / Shape1::kK;
// Avoid reading out of bounds
iterator_B1.clear_mask(gemm_k_iterations_1 <= 1);
//
// Mainloop
//
// Note: The main loop does not support Base::kWarpGemmIterations == 2.
CUTLASS_PRAGMA_UNROLL
for (; gemm_k_iterations_1 > 0; --gemm_k_iterations_1) {
//
// Loop over GEMM K dimension
//
CUTLASS_PRAGMA_UNROLL
for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1; ++warp_mma_k) {
// Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
// as the case may be.
if (warp_mma_k == Base::kWarpGemmIterations1 - 1) {
// Write fragments to shared memory
this->smem_iterator_B1_.store(transform_B1(tb_frag_B1));
__syncthreads();
++this->smem_iterator_B1_;
// Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
if (smem_write_stage_idx == 1) {
this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
}
else {
this->warp_tile_iterator_B1_.add_tile_offset(
{-Base::kStages * Policy1::kPartitionsK *
Base::kWarpGemmIterations1,
0});
}
smem_write_stage_idx ^= 1;
}
this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
// skip warp tile loading for the last kgroup
if(gemm_k_iterations_1 > 1 || warp_mma_k < Base::kWarpGemmIterations1 - 1)
warp_tile_iterator_A1_.load(warp_frag_A1[(warp_mma_k + 1) % 2]);
this->warp_tile_iterator_B1_.load(warp_frag_B1[(warp_mma_k + 1) % 2]);
++warp_tile_iterator_A1_;
++this->warp_tile_iterator_B1_;
if (warp_mma_k == 0) {
iterator_B1.load(tb_frag_B1);
++iterator_B1;
// Avoid reading out of bounds if this was the last loop iteration
iterator_B1.clear_mask(gemm_k_iterations_1 <= 2);
}
warp_mma1(accum, warp_frag_A1[warp_mma_k % 2],
warp_frag_B1[warp_mma_k % 2], accum);
}
}
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace gemm
} // namespace cutlass

@ -0,0 +1,584 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
#include "cutlass/arch/arch.h"
#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
#include "cutlass/transform/threadblock/predicated_vector_access_iterator.h"
#include "cutlass/transform/threadblock/vector_iterator.h"
#include "cutlass/transform/warp/vector_fragment_iterator.h"
#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
#include "threadblock/b2b_mma_pipelined.h"
#include "threadblock/b2b_mma_multistage.h"
////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace gemm {
namespace threadblock {
////////////////////////////////////////////////////////////////////////////////
template <
/// Element type for A matrix operand
typename ElementA_,
/// Layout type for A matrix operand
typename LayoutA_,
/// Access granularity of A matrix in units of elements
int kAlignmentA,
/// Element type for B matrix operand
typename ElementB_,
/// Layout type for B matrix operand
typename LayoutB_,
/// Access granularity of B matrix in units of elements
int kAlignmentB,
/// Element type for internal accumulation
typename ElementAccumulator_,
/// Layout type for C and D matrix operands
typename LayoutC_,
/// Operator class tag
typename OperatorClass_,
/// Tag indicating architecture to tune for
typename ArchTag_,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape0_,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape1_,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape0_,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape1_,
/// Instruction-level tile size (concept: GemmShape)
typename InstructionShape_,
/// Number of stages used in the pipelined mainloop
int Stages,
/// Operation performed by GEMM
typename Operator,
/// Epilogue output operator
typename EpilogueOutputOp,
/// Store the accumulators in row major or column major. Row major is used
/// when output layout is interleaved.
bool AccumulatorsInRowMajor = false,
/// Staging the accumulators in shared memory.
bool SmemAccumulator = false>
struct DefaultB2bMma;
////////////////////////////////////////////////////////////////////////////////
/// Specialization for row-major output with 2-stage pipeline
template <
/// Element type for A matrix operand
typename ElementA,
/// Layout type for A matrix operand
typename LayoutA,
/// Access granularity of A matrix in units of elements
int kAlignmentA,
/// Element type for B matrix operand
typename ElementB,
/// Layout type for B matrix operand
typename LayoutB,
/// Access granularity of B matrix in units of elements
int kAlignmentB,
/// Element type for internal accumulation
typename ElementAccumulator,
/// Tag indicating architecture to tune for
typename ArchTag,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape0,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape1,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape0,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape1,
/// Instruction-level tile size (concept: GemmShape)
typename InstructionShape,
/// Operation performed by GEMM
typename Operator,
/// Epilogue output operator
typename EpilogueOutputOp>
struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
kAlignmentB, ElementAccumulator, layout::RowMajor,
arch::OpClassTensorOp, ArchTag,
ThreadblockShape0, ThreadblockShape1,
WarpShape0, WarpShape1,
InstructionShape, 2, Operator, EpilogueOutputOp, false> {
// Define the MmaCore components
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
arch::OpClassTensorOp, 2, Operator>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
arch::OpClassTensorOp, 2, Operator>;
// Define iterators over tiles from the A operand
using IteratorA0 =
cutlass::transform::threadblock::PredicatedTileIterator<
cutlass::MatrixShape<MmaCore0::Shape::kM, MmaCore0::Shape::kK>,
ElementA, LayoutA, 1, typename MmaCore0::IteratorThreadMapA, kAlignmentA>;
// Define iterators over tiles from the B operand
using IteratorB0 =
cutlass::transform::threadblock::PredicatedTileIterator<
cutlass::MatrixShape<MmaCore0::Shape::kK, MmaCore0::Shape::kN>,
ElementB, LayoutB, 0, typename MmaCore0::IteratorThreadMapB, kAlignmentB>;
// Use fragment iterator for A operand
using AccumulatorLayout = cutlass::layout::ColumnMajor;
using FragmentIteratorA1 =
cutlass::gemm::warp::MmaTensorOpFragmentIterator<
cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
MmaCore1::Shape::kK, //kBlocksColumn
ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp>;
using ElementScaleBias = typename EpilogueOutputOp::ElementCompute;
using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
static int const kElementsPerAccess = 2;
using IteratorAccumulatorScaleBias =
cutlass::transform::threadblock::VectorIterator<
cutlass::transform::threadblock::PredicatedVectorAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
>;
// Warp-level iterators to load scale and bias vectors
using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
LayoutScaleBias, InstructionShape, kElementsPerAccess>;
// Define iterators over tiles from the B operand
using IteratorB1 =
cutlass::transform::threadblock::PredicatedTileIterator<
cutlass::MatrixShape<MmaCore1::Shape::kK, MmaCore1::Shape::kN>,
ElementB, LayoutB, 0, typename MmaCore1::IteratorThreadMapB, kAlignmentB>;
// Define the threadblock-scoped pipelined matrix multiply
using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaPipelined<
typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA,
IteratorB0, typename MmaCore0::SmemIteratorB,
typename MmaCore1::Shape, FragmentIteratorA1,
IteratorAccumulatorScaleBias, FragmentIteratorA1ScaleBias,
IteratorB1, typename MmaCore1::SmemIteratorB,
ElementAccumulator, layout::RowMajor,
EpilogueOutputOp,
typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy>;
};
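
For orientation, here is a minimal sketch of how this 2-stage specialization might be instantiated. The element types, alignments, tile shapes, and epilogue below are illustrative assumptions (loosely modeled on the fused B2B examples), not values taken from this file:

```cpp
// Hypothetical instantiation of the 2-stage Tensor Core specialization above.
// All concrete choices (types, shapes, alignments, epilogue) are assumptions.
#include "cutlass/arch/arch.h"
#include "cutlass/arch/mma.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/numeric_types.h"
#include "cutlass/epilogue/thread/linear_combination_relu.h"
#include "threadblock/default_b2b_mma.h"

using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementAccumulator = float;

// Tile shapes for the two fused GEMMs (threadblock / warp / instruction level).
// Warp tiles partition only the M dimension here (warp N == threadblock N),
// as in the fused B2B examples.
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
using WarpShape0        = cutlass::gemm::GemmShape<32, 64, 32>;
using WarpShape1        = cutlass::gemm::GemmShape<32, 128, 32>;
using InstructionShape  = cutlass::gemm::GemmShape<16, 8, 8>;

// Scale/bias + ReLU applied to the first GEMM's accumulators before they feed
// the second GEMM.
using EpilogueOutputOp0 = cutlass::epilogue::thread::LinearCombinationRelu<
    ElementA, InstructionShape::kM * InstructionShape::kN / 32,
    ElementAccumulator, ElementAccumulator>;

// Resolve the threadblock-scoped back-to-back MMA (SM75, 2 stages).
using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<
    ElementA, cutlass::layout::RowMajor, 8,
    ElementB, cutlass::layout::ColumnMajor, 8,
    ElementAccumulator, cutlass::layout::RowMajor,
    cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
    ThreadblockShape0, ThreadblockShape1,
    WarpShape0, WarpShape1,
    InstructionShape, /*Stages=*/2, cutlass::arch::OpMultiplyAdd,
    EpilogueOutputOp0>::ThreadblockB2bMma;
```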
////////////////////////////////////////////////////////////////////////////////
/// Specialization for row-major output with multi-stage pipeline
template <
/// Element type for A matrix operand
typename ElementA,
/// Layout type for A matrix operand
typename LayoutA,
/// Access granularity of A matrix in units of elements
int kAlignmentA,
/// Element type for B matrix operand
typename ElementB,
/// Layout type for B matrix operand
typename LayoutB,
/// Access granularity of B matrix in units of elements
int kAlignmentB,
/// Element type for internal accumulation
typename ElementAccumulator,
/// Tag indicating architecture to tune for
typename ArchTag,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape0,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape1,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape0,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape1,
/// Instruction-level tile size (concept: GemmShape)
typename InstructionShape,
/// Number of stages used in the multistage mainloop
int Stages,
/// Operation performed by GEMM
typename Operator,
/// Epilogue output operator
typename EpilogueOutputOp>
struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
kAlignmentB, ElementAccumulator, layout::RowMajor,
arch::OpClassTensorOp, ArchTag,
ThreadblockShape0, ThreadblockShape1,
WarpShape0, WarpShape1,
InstructionShape, Stages, Operator, EpilogueOutputOp, false> {
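// Choose the global-memory cache operation for each operand: cache-global when
// every access covers a full 128-bit line, cache at all levels otherwise.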
static cutlass::arch::CacheOperation::Kind const CacheOpA =
((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
? cutlass::arch::CacheOperation::Global
: cutlass::arch::CacheOperation::Always;
static cutlass::arch::CacheOperation::Kind const CacheOpB =
((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
? cutlass::arch::CacheOperation::Global
: cutlass::arch::CacheOperation::Always;
// Define the MmaCore components
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
Stages, Operator, false, CacheOpA, CacheOpB>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
Stages, Operator, false, CacheOpA, CacheOpB>;
// Define iterators over tiles from the A operand
using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
using AccessTypeA0 = cutlass::Array<ElementA, kAlignmentA>;
using IteratorA0 =
cutlass::transform::threadblock::PredicatedTileAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
ElementA, LayoutA, 1, ThreadMapA0, AccessTypeA0>;
// Define iterators over tiles from the B operand
using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
using AccessTypeB0 = cutlass::Array<ElementB, kAlignmentB>;
using IteratorB0 =
cutlass::transform::threadblock::PredicatedTileAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
ElementB, LayoutB, 0, ThreadMapB0, AccessTypeB0>;
// Use fragment iterator for A operand
using AccumulatorLayout = cutlass::layout::ColumnMajor;
using FragmentIteratorA1 =
cutlass::gemm::warp::MmaTensorOpFragmentIterator<
cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
MmaCore1::Shape::kK, //kBlocksColumn
ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp>;
/// Define iterators over tiles from scale/bias vectors
using ElementScaleBias = typename EpilogueOutputOp::ElementCompute;
using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
static int const kElementsPerAccess = 2;
using IteratorAccumulatorScaleBias =
cutlass::transform::threadblock::VectorIterator<
cutlass::transform::threadblock::PredicatedVectorAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
>;
// Warp-level iterators to load scale and bias vectors
using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
LayoutScaleBias, InstructionShape, kElementsPerAccess>;
// Define iterators over tiles from the B operand
using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
using AccessTypeB1 = cutlass::Array<ElementB, kAlignmentB>;
using IteratorB1 =
cutlass::transform::threadblock::PredicatedTileAccessIterator<
cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
ElementB, LayoutB, 0, ThreadMapB1, AccessTypeB1>;
// Define the threadblock-scoped pipelined matrix multiply
using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaMultistage<
typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA,
MmaCore0::kCacheOpA,
IteratorB0, typename MmaCore0::SmemIteratorB, MmaCore0::kCacheOpB,
typename MmaCore1::Shape, FragmentIteratorA1,
IteratorAccumulatorScaleBias, FragmentIteratorA1ScaleBias,
IteratorB1, typename MmaCore1::SmemIteratorB, MmaCore1::kCacheOpB,
ElementAccumulator, layout::RowMajor,
EpilogueOutputOp,
typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy, Stages>;
};
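
A minimal follow-on sketch, reusing the illustrative aliases from the previous example: once Stages exceeds 2, the same trait resolves through this multi-stage specialization. The SM80 target, 3-stage count, and m16n8k16 instruction shape below are likewise assumptions:

```cpp
// Hypothetical SM80 instantiation resolving to the multi-stage specialization
// above (reuses the illustrative aliases from the previous sketch).
using InstructionShapeSm80 = cutlass::gemm::GemmShape<16, 8, 16>;

using B2bMmaSm80 = typename cutlass::gemm::threadblock::DefaultB2bMma<
    ElementA, cutlass::layout::RowMajor, 8,
    ElementB, cutlass::layout::ColumnMajor, 8,
    ElementAccumulator, cutlass::layout::RowMajor,
    cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
    ThreadblockShape0, ThreadblockShape1,
    WarpShape0, WarpShape1,
    InstructionShapeSm80, /*Stages=*/3, cutlass::arch::OpMultiplyAdd,
    EpilogueOutputOp0>::ThreadblockB2bMma;  // B2bMmaMultistage under the hood
```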
////////////////////////////////////////////////////////////////////////////////
/// Specialization for column-major-interleaved output with 2-stage pipeline
template <
/// Element type for A matrix operand
typename ElementA,
/// Layout type for A matrix operand
typename LayoutA,
/// Access granularity of A matrix in units of elements
int kAlignmentA,
/// Element type for B matrix operand
typename ElementB,
/// Layout type for B matrix operand
typename LayoutB,
/// Access granularity of B matrix in units of elements
int kAlignmentB,
/// Element type for internal accumulation
typename ElementAccumulator,
/// Tag indicating architecture to tune for
typename OperatorClass,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape0,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape1,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape0,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape1,
/// Instruction-level tile size (concept: GemmShape)
typename InstructionShape,
/// Operation performed by GEMM
typename Operator,
/// Epilogue output operator
typename EpilogueOutputOp,
/// Number of Interleaved K
int InterleavedK>
struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
kAlignmentB, ElementAccumulator,
layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, arch::Sm75,
ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
InstructionShape, 2, Operator, EpilogueOutputOp, true> {
// Define the MmaCore components
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementAccumulator,
layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, 2, Operator,
true>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementAccumulator,
layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, 2, Operator,
true>;
static_assert(kAlignmentA == 128 / sizeof_bits<ElementA>::value,
"Alignment must match thread data map's vector length");
static_assert(kAlignmentB == 128 / sizeof_bits<ElementB>::value,
"Alignment must match thread data map's vector length");
// Define iterators over tiles from the A operand
using IteratorA0 = cutlass::transform::threadblock::PredicatedTileIterator<
cutlass::MatrixShape<MmaCore0::Shape::kM, MmaCore0::Shape::kK>, ElementA,
LayoutA, 1, typename MmaCore0::IteratorThreadMapA>;
// Define iterators over tiles from the B operand
using IteratorB0 = cutlass::transform::threadblock::PredicatedTileIterator<
cutlass::MatrixShape<MmaCore0::Shape::kK, MmaCore0::Shape::kN>, ElementB,
LayoutB, 0, typename MmaCore0::IteratorThreadMapB>;
// Use fragment iterator for A1 operand
using AccumulatorLayout = cutlass::layout::RowMajor; //AccumulatorsInRowMajor = true
using FragmentIteratorA1 =
cutlass::gemm::warp::MmaTensorOpFragmentIterator<
cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
MmaCore1::Shape::kK, //kBlocksColumn
ElementAccumulator, ElementA, AccumulatorLayout,
InstructionShape, EpilogueOutputOp>;
using ElementScaleBias = typename EpilogueOutputOp::ElementCompute;
using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
static int const kElementsPerAccess = 4;
using IteratorAccumulatorScaleBias =
cutlass::transform::threadblock::VectorIterator<
cutlass::transform::threadblock::PredicatedVectorAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
>;
// Warp-level iterators to load scale and bias vectors
using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
LayoutScaleBias, InstructionShape, kElementsPerAccess>;
// Define iterators over tiles from the B operand
using IteratorB1 =
cutlass::transform::threadblock::PredicatedTileIterator<
cutlass::MatrixShape<MmaCore1::Shape::kK, MmaCore1::Shape::kN>,
ElementB, LayoutB, 0, typename MmaCore1::IteratorThreadMapB>;
// Define the threadblock-scoped pipelined matrix multiply
using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaPipelined<
typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA,
IteratorB0, typename MmaCore0::SmemIteratorB,
typename MmaCore1::Shape, FragmentIteratorA1,
IteratorAccumulatorScaleBias, FragmentIteratorA1ScaleBias,
IteratorB1, typename MmaCore1::SmemIteratorB,
ElementAccumulator, layout::ColumnMajorInterleaved<InterleavedK>,
EpilogueOutputOp,
typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy>;
};
////////////////////////////////////////////////////////////////////////////////
/// Specialization for column-major-interleaved output with multi-stage
template <
/// Element type for A matrix operand
typename ElementA,
/// Layout type for A matrix operand
typename LayoutA,
/// Access granularity of A matrix in units of elements
int kAlignmentA,
/// Element type for B matrix operand
typename ElementB,
/// Layout type for B matrix operand
typename LayoutB,
/// Access granularity of B matrix in units of elements
int kAlignmentB,
/// Element type for internal accumulation
typename ElementAccumulator,
/// Operator class tag
typename OperatorClass,
/// Tag indicating architecture to tune for
typename ArchTag,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape0,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape1,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape0,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape1,
/// Instruction-level tile size (concept: GemmShape)
typename InstructionShape,
/// Number of stages used in the multistage mainloop
int Stages,
/// Operation performed by GEMM
typename Operator,
/// Epilogue output operator
typename EpilogueOutputOp,
/// Number of Interleaved K
int InterleavedK>
struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
kAlignmentB, ElementAccumulator,
layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, ArchTag,
ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
InstructionShape, Stages, Operator, EpilogueOutputOp, true> {
// Define the MmaCore components
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementAccumulator,
layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, Stages,
Operator, true>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementAccumulator,
layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, Stages,
Operator, true>;
// Define iterators over tiles from the A operand
using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
using IteratorA0 =
cutlass::transform::threadblock::PredicatedTileAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
ElementA, LayoutA, 1, ThreadMapA0, AccessTypeA>;
// Define iterators over tiles from the B operand
using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
using IteratorB0 =
cutlass::transform::threadblock::PredicatedTileAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
ElementB, LayoutB, 0, ThreadMapB0, AccessTypeB>;
// Use fragment iterator for A1 operand
using AccumulatorLayout = cutlass::layout::RowMajor; //AccumulatorsInRowMajor = true
using FragmentIteratorA1 =
cutlass::gemm::warp::MmaTensorOpFragmentIterator<
cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
MmaCore1::Shape::kK, //kBlocksColumn
ElementAccumulator, ElementA, AccumulatorLayout,
InstructionShape, EpilogueOutputOp>;
/// Define iterators over tiles from scale/bias vectors
using ElementScaleBias = typename EpilogueOutputOp::ElementCompute;
using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
static int const kElementsPerAccess = 4;
using IteratorAccumulatorScaleBias =
cutlass::transform::threadblock::VectorIterator<
cutlass::transform::threadblock::PredicatedVectorAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
>;
// Warp-level iterators to load scale and bias vectors
using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
LayoutScaleBias, InstructionShape, kElementsPerAccess>;
// Define iterators over tiles from the B operand
using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
using IteratorB1 =
cutlass::transform::threadblock::PredicatedTileAccessIterator<
cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
ElementB, LayoutB, 0, ThreadMapB1, AccessTypeB>;
// Define the threadblock-scoped multistage matrix multiply
using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaMultistage<
typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA,
MmaCore0::kCacheOpA,
IteratorB0, typename MmaCore0::SmemIteratorB, MmaCore0::kCacheOpB,
typename MmaCore1::Shape, FragmentIteratorA1,
IteratorAccumulatorScaleBias, FragmentIteratorA1ScaleBias,
IteratorB1, typename MmaCore1::SmemIteratorB, MmaCore1::kCacheOpB,
ElementAccumulator, layout::ColumnMajorInterleaved<InterleavedK>,
EpilogueOutputOp,
typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy, Stages>;
};
////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace gemm
} // namespace cutlass
////////////////////////////////////////////////////////////////////////////////

Some files were not shown because too many files have changed in this diff.