@ -0,0 +1,274 @@
|
||||
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1767700736 2104699940 3506659864 557648934
|
||||
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1539314507 3971227455 1976927351 1642148785
|
||||
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 276489656 653235219 3147305346 880610205
|
||||
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 272457724 2178229139 2786201726 4170295839
|
||||
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 242235041 2149454506 784935854 682531065
|
||||
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 3478189705 1667216236 1437761176
|
||||
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 379326961 1780379994 3740415776
|
||||
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 924848818 3533854396 2683779476
|
||||
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2870331951 359232443 2147867990 1653277018
|
||||
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2870331951 3784314846 2644315999 4224154526
|
||||
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3787448414 3562991793 535073859 2563373454
|
||||
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 426169840 2464808416 864648234 461884698
|
||||
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2564934525 3910792915 3577331017 827498183
|
||||
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 28479234 867695528 1947311971 83328334
|
||||
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4192922822 4244595864 2296602326 2349214706
|
||||
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 274678245 3464152269 1682550229 3446204619
|
||||
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3993280136 828543035 1319748516 956044554
|
||||
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 832003025 3799813757 4030292245 457791957
|
||||
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1444316594 4129865888 93616503 412257611
|
||||
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2931873718 1841508064 1497852219 36703874
|
||||
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2931873718 1841508064 1497852219 1842147148
|
||||
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1612565294 109894479 1782187316 3370789453
|
||||
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 841569299 1010785577 1158956167 3261208135
|
||||
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1893352157 48149942 3544807462 446577726
|
||||
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 3585320147 2150950452 1625817025 3964129474
|
||||
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 289918791 2624928614 3423533117 3186342135
|
||||
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 2732296888 1838622641 4203745561
|
||||
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 3456572634 893492926 1966259884
|
||||
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 671982235 4014726279 4027869577 1510990157
|
||||
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 798317794 4140605332 3580988556 3425909428
|
||||
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1721270411 2106553169 835800311 3417471222
|
||||
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 860217059 166776702 1109666471
|
||||
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2128738105 855244826 2670006594 3857976152
|
||||
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1931093565 3079461262 3579256638 2926210806
|
||||
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2472246681 2952423142 2045838875 3445165841
|
||||
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2956871200 2133381336 2601441527 2035094220
|
||||
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 365467186 1700915522 2515933441 406719240
|
||||
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 3347784734 156533442 1012781676 688128904
|
||||
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 927718585 3117803557 1370701307 1462167731
|
||||
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 973422497 1926250028 3440543762
|
||||
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 2892862516 3649300762 1521470286
|
||||
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2075083065 3181416651 1733426984 872275640
|
||||
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4005590448 1639170045 388151578 4186957447
|
||||
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 181075276 1433744686 860506550 3475157408
|
||||
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1747719409 877465841 2345541783
|
||||
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 856324887 2307248012 337386755 3363072703
|
||||
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1906605830 722034901 2562804622 2508759317
|
||||
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 805717279 2196645331 3235235362 1518334120
|
||||
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3168796339 72559978 778918419 1260968000
|
||||
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 261954979 2634885882 451986822 3792829599
|
||||
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3747142491 2426759809 2622222681 371723930
|
||||
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2056905385 3612826298 2531545294 476754549
|
||||
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 2391975923 197605094 3409942185
|
||||
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 3071904063 408984565 2378809888
|
||||
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3414629540 3067676760 1540919649 2008865071
|
||||
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4100326666 1085505037 2778215386 230227569
|
||||
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3662895757 2731079464 3570839563 3483629877
|
||||
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 408419601 3415600242 2106927195
|
||||
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2154102133 3606099389 4034802752 3200055633
|
||||
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2609259399 3910244699 1319285699 2229775542
|
||||
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2948772873 2780071616 2703730845 3090625734
|
||||
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 752289976 4278696824 360883914 3802692600
|
||||
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3723912751 653419877 359675571 283806385
|
||||
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 2027599472 1075980921 3101013494 2025203940
|
||||
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 991402150 1393431534 1148212814 1350914659
|
||||
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 4283492776 419570292 1210341563
|
||||
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4178596783 3828059710 2735749436 2671012171
|
||||
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 924522595 563724475 3750778972 4152580670
|
||||
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1021044158 1686067905 3765040166 4102272733
|
||||
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 2674994719 635224486 2759329777
|
||||
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 4201252830 2920298728 304256151
|
||||
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 70289262 646435722 4137562540
|
||||
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1317457392 1288095320 2132879813 656196754
|
||||
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1317457392 2202157489 2326567490 2475188414
|
||||
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2476454437 1857118302 4164386062 239840568
|
||||
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2767650699 3514840131 590439733 3879821123
|
||||
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3896287283 3112762669 2515107934 2106635937
|
||||
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1903067870 1021832870 3003938078 2751931686
|
||||
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3489785028 2466126497 1374078692 2737628040
|
||||
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2051350923 263676708 3639860119 1370886256
|
||||
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 719099834 1474713672 204857540 2768940347
|
||||
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3441724486 3162593831 421721594 3097845598
|
||||
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2034354027 1249407570 2567025479 1441082595
|
||||
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 941893937 3608468045 635631428 2369653089
|
||||
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 941893937 3608468045 635631428 1218705038
|
||||
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 172579142 319546523 718795680 1453661415
|
||||
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2823351660 1326352711 1110204809 1155441703
|
||||
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3238446487 2572503545 686287700 1559476701
|
||||
conv2d fprop_1x8x8x1_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 991402150 1883874274 1180207512 3934800419
|
||||
conv2d fprop_1x16x16x1_8x8_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 24290453 4230587034 4117433929 2540623821
|
||||
conv2d fprop_1x16x16x1_12x12_16x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 24290453 3802993432 1563447158 515257167
|
||||
conv2d fprop_1x224x224x1_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 2583340103 3928463259 1564251818
|
||||
conv2d fprop_1x224x224x1_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 2966178620 3457283045 1726663817
|
||||
conv2d fprop_1x224x224x1_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 1794561978 3101289788 3492498648
|
||||
conv2d fprop_1x224x224x1_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 1794561978 498358130 4111289929
|
||||
conv2d fprop_1x8x8x2_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2693144988 3876248534 3038023830 1910263513
|
||||
conv2d fprop_1x16x16x2_8x8_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 3355193355 319259163 535683577
|
||||
conv2d fprop_1x16x16x2_12x12_16x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 1548147432 3385829172 2741952709
|
||||
conv2d fprop_1x224x224x2_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 2686562907 3948710179 3669872932
|
||||
conv2d fprop_1x224x224x2_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 576815792 2317227037 1211532666
|
||||
conv2d fprop_1x224x224x2_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 27596985 555460201 895685163
|
||||
conv2d fprop_1x224x224x2_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 27596985 1465341652 2228916523
|
||||
conv2d fprop_1x8x8x4_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 24290453 137535877 1436667267 1395660627
|
||||
conv2d fprop_1x224x224x4_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 2226159049 4051661898 209529384
|
||||
conv2d fprop_1x224x224x4_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 3541851870 2271016226 2671623385
|
||||
conv2d fprop_1x224x224x4_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 982184919 2007343215 3362992769
|
||||
conv2d fprop_1x224x224x4_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 982184919 20610297 1086800078
|
||||
conv2d fprop_1x8x8x8_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 3117444553 1497663382 3561001103
|
||||
conv2d fprop_1x224x224x8_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 1414143072 827338392 2827855918
|
||||
conv2d fprop_1x224x224x8_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 3886996022 26545788 3407771964
|
||||
conv2d fprop_1x224x224x8_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 380272816 2374613655 3601677176
|
||||
conv2d fprop_1x224x224x8_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 380272816 778374730 2110111988
|
||||
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1736512560 49406874 846358010 3314905564
|
||||
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1848484956 1432417472 1903569827 3750799351
|
||||
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4236427320 3696009469 69852620 201921851
|
||||
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 109006944 450017448 1793784844 903209915
|
||||
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 813367872 2397796503 1928191746 3210229460
|
||||
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 1307184141 46021356 1674017987
|
||||
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 1212511562 3331767121 2446286369
|
||||
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 2013675943 1681111033 1469213228
|
||||
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1703349794 500298386 3218034344 4159283207
|
||||
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1703349794 1123534155 145385311 4273847179
|
||||
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3862659311 349459322 1503631520 1404971956
|
||||
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1623686755 961217371 552550209 3980749384
|
||||
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3554927580 1131648083 4149599295 3119557776
|
||||
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1767639287 3350675774 128324027 1059816532
|
||||
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3986143536 17411088 40173029 1694092310
|
||||
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1157793540 3513299281 48848814 1435528367
|
||||
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 988962069 4292634763 388976034 2674929544
|
||||
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4202383208 3529769234 1046186503 3368902675
|
||||
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 856448884 3057259762 2063087558 1995545427
|
||||
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2281940872 144496548 2455451862 400986166
|
||||
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2281940872 144496548 2455451862 1082696406
|
||||
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2702905851 1992889713 731289041 608504198
|
||||
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2742293143 4197915274 606840 3671124731
|
||||
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 149434841 2288560511 2994968424 2881838300
|
||||
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 2226824643 327135318 3718671210 2121176659
|
||||
conv2d fprop_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3254575292 1119957081 672831271
|
||||
conv2d fprop_1x4x4x14_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3115523958 3622905002 4020453928 3853387318
|
||||
conv2d fprop_1x23x56x98_10x22_128x3x3_pad_h4w5_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1702870033 1876930844 1190400523 3937287850
|
||||
conv2d fprop_1x4x4x28_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 2587856937 2021107274 2789519899
|
||||
conv2d fprop_1x23x56x100_10x22_128x3x3_pad_h4w5_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2368669977 1353376771 744357395 786349633
|
||||
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 991402150 1393431534 2496492611 3901723984
|
||||
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4208297221 4283492776 3148637036 258220505
|
||||
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4178596783 3828059710 281106520 1103939403
|
||||
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 924522595 563724475 1938163814 2197809394
|
||||
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1021044158 1686067905 350851834 3999808950
|
||||
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3335547 2674994719 1034822169 1611033520
|
||||
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3335547 4201252830 1597212204 2181492560
|
||||
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3335547 70289262 3001492060 1379239000
|
||||
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1317457392 1288095320 4211138051 2804617605
|
||||
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1317457392 2202157489 1043108884 2923122465
|
||||
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2476454437 1857118302 3877008798 1206012078
|
||||
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2767650699 3514840131 2946529611 3907056932
|
||||
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3896287283 3112762669 1581171257 3959460786
|
||||
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1903067870 1021832870 1926804094 1756790353
|
||||
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3489785028 2466126497 1712378956 434322965
|
||||
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2051350923 263676708 355203300 821870356
|
||||
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 719099834 1474713672 2886387159 4086314983
|
||||
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3441724486 3162593831 1422796372 2049419539
|
||||
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2034354027 1249407570 1196036582 2684312264
|
||||
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 941893937 3608468045 2198911423 1060050551
|
||||
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 941893937 3608468045 2198911423 3361618746
|
||||
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 172579142 319546523 2332616929 543467298
|
||||
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2823351660 1326352711 3839068434 65031397
|
||||
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3238446487 2572503545 3604065639 2111204111
|
||||
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2149247508 1775375365 2663631601 1249487679
|
||||
conv2d fprop_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 403997062 1679063623 4062928786
|
||||
conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 3464637181 1623218578 436154205
|
||||
conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1479940693 3253144559 3883419107
|
||||
conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 1871463331 2425320272 74566211
|
||||
conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3484040069 664160900 3610888033 22347127
|
||||
conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1924855848 1382111427 2541177413
|
||||
conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 868180534 1764715518 3070473696 2392864704
|
||||
conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3437976747 666906244 3401957738 2050602745
|
||||
conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4195072693 1575210381 781892324 2848949054
|
||||
conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3457330201 2316839359 1539389419 4293781748
|
||||
conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 2469024119 2885305868 2693098375
|
||||
conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 2469024119 2885305868 1969608051
|
||||
conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1690216859 554790212 2885143346 780489333
|
||||
conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 835105643 3337423971 3866137775
|
||||
conv2d dgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2956180805 1092015789 3160693693 1526395881
|
||||
conv2d dgrad_1x56x56x12_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 1941683430 2236679600 3168985259
|
||||
conv2d dgrad_1x55x55x12_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 1941683430 3784328837 471971363
|
||||
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 289918791 1266976707 942688231 3457364823
|
||||
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 1027662440 2005082293 2235558527
|
||||
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 3380032042 1370040310 1348846927
|
||||
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 671982235 1423304149 2107662762 1234913781
|
||||
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 798317794 1709026638 2421185623 3308071321
|
||||
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1721270411 2519327328 2541413264 3185574975
|
||||
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 2070174510 1364436192 3531942595
|
||||
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2128738105 2056902987 3079166829 2329433528
|
||||
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3857917762 3227877956 645422556
|
||||
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3857917762 3817218800 985231315
|
||||
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 1398036015 3630062764 2492522537
|
||||
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2784049299 643733019 3649549642 2637869234
|
||||
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 2332160299 302086821 3303132343
|
||||
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1931093565 2458714707 2919710256 2311575036
|
||||
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2472246681 2260022344 500095455 2760458995
|
||||
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1530672622 3635363851 2402907878 4131497953
|
||||
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1500864134 2536338700 2459524764 2504484273
|
||||
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3344871528 2667385029 2714805835 3487838445
|
||||
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 966721255 1547169349 3198573835 302049294
|
||||
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 2440004820 1576818970 1317923157
|
||||
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 2440004820 1576818970 3186679687
|
||||
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4028893260 4220759192 2236533218 3731336532
|
||||
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2956871200 1591352238 1756650151 1262787222
|
||||
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 365467186 892422645 1334708242 1372556938
|
||||
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 3347784734 150035460 2897171548 3701081496
|
||||
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 927718585 4106152802 2634710231 744755886
|
||||
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 3464637181 2709881923 2407415563
|
||||
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 3723472741 3733128758 3129111191
|
||||
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2075083065 2042513140 253288229 404121198
|
||||
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4005590448 1116254439 525487530 3284739065
|
||||
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 181075276 1743485155 91136873 2508716910
|
||||
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 386662952 1127709182 4026285141
|
||||
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 856324887 3954249564 2591894666 2655687700
|
||||
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1300426008 1263618595 1313664339
|
||||
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1300426008 1756414462 2995557277
|
||||
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 447261065 121940906 1497499264
|
||||
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3484040069 2966693627 1423016429 341928547
|
||||
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1759979610 2761559427 68093525
|
||||
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1906605830 2980501720 1650970502 3258883197
|
||||
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 805717279 3502822733 3985958544 2568949300
|
||||
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 868180534 3289288595 385631111 328914986
|
||||
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3437976747 3391080565 1513955316 1521294163
|
||||
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4195072693 1669352457 2608107448 4284090805
|
||||
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3457330201 1126870455 106232038 3054809396
|
||||
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 1723074453 1186911503 4239438967
|
||||
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 1723074453 1186911503 2113601884
|
||||
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1690216859 2413490039 36034283 1112346965
|
||||
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3168796339 1601750164 14375779 2894970748
|
||||
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 261954979 1300976652 4259930640 305685205
|
||||
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3747142491 1747587481 4137156526 1174257270
|
||||
conv2d wgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2956180805 1086820986 1644914756 2013471312
|
||||
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2056905385 447674669 724481645 1457430910
|
||||
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 1227883689 3401425854 3897766524
|
||||
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 3749787834 3350064812 1136116240
|
||||
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3414629540 820341033 770836461 2451581199
|
||||
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4100326666 2581696511 1088458082 1521190911
|
||||
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3662895757 2885454895 935600441 2615245898
|
||||
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 3831334389 3506139121 814982501
|
||||
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2154102133 737968461 1291834254 2665225480
|
||||
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3573498719 1809195644 1765637461
|
||||
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3573498719 3379808294 483095299
|
||||
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 4194153035 2863868771 1639389008
|
||||
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2624318208 157618421 1779474147 814087242
|
||||
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 2300180628 423968553 3890279569
|
||||
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2609259399 1848932917 522753581 1926508271
|
||||
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2948772873 3663040534 4014266327 1288646188
|
||||
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3271403719 1585195072 1487505772 3253374264
|
||||
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1419588777 451194147 3578359696 3659768981
|
||||
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 763924990 2780826684 2883769406 148530958
|
||||
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2578426561 3849874822 102765469 1305171059
|
||||
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 1995451256 2632815435 1516344656
|
||||
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 1995451256 2632815435 1586331550
|
||||
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2462511240 2274021368 1188866747 3178890497
|
||||
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 752289976 1226457131 4187777346 1400559240
|
||||
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3723912751 1585959358 3731079159 1498901684
|
||||
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 2027599472 2758666204 3287095476 4291916486
|
||||
conv2d wgrad_1x8x8x1_8x8_1x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1767700736 4278264698 2331753571 2554564568
|
||||
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 927718585 3117803557 1370701307 1462167731
|
||||
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 973422497 1926250028 3440543762
|
||||
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 2892862516 3649300762 1521470286
|
||||
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2075083065 3181416651 1733426984 872275640
|
||||
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4005590448 1639170045 388151578 4186957447
|
||||
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 181075276 1433744686 860506550 3475157408
|
||||
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1747719409 877465841 2345541783
|
||||
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 856324887 2307248012 337386755 3363072703
|
||||
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1906605830 722034901 2562804622 2508759317
|
||||
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 805717279 2196645331 3235235362 1518334120
|
||||
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3168796339 72559978 778918419 1260968000
|
||||
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 261954979 2634885882 451986822 3792829599
|
||||
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3747142491 2426759809 2622222681 371723930
|
||||
@ -0,0 +1,187 @@
|
||||
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
|
||||
from pycutlass.conv2d_operation import *
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
|
||||
class Conv2dDgradImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
|
||||
def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 64], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float16, stride_support=StrideSupport.Unity,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 64], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float16, stride_support=StrideSupport.Unity,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 64], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float16, stride_support=StrideSupport.Unity,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
problem_sizes = [
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 4, 4, 12),
|
||||
cutlass.Tensor4DCoord(8, 3, 3, 12),
|
||||
cutlass.Tensor4DCoord(0, 0, 0, 0),
|
||||
cutlass.MatrixCoord(3, 3),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
]
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation, problem_sizes))
|
||||
|
||||
def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 64], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float16, stride_support=StrideSupport.Unity,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
problem_sizes = [
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 4, 4, 12),
|
||||
cutlass.Tensor4DCoord(8, 3, 3, 12),
|
||||
cutlass.Tensor4DCoord(0, 0, 0, 0),
|
||||
cutlass.MatrixCoord(3, 3),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
]
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation, problem_sizes))
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**26, 2**26)
|
||||
unittest.main()
|
||||
@ -0,0 +1,162 @@
|
||||
# test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
class Conv2dDgradImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
|
||||
def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage3(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 32], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage4(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 32], stages=4,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage3_64(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 64], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage4_64(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 64], stages=4,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**26, 2**26)
|
||||
unittest.main()
|
||||
@ -0,0 +1,89 @@
|
||||
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass.conv2d_operation import *
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
class Conv2dDgradImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
|
||||
def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[1, 1, 1],
|
||||
element_a=cutlass.float32, element_b=cutlass.float32,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.Simt,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=1)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 8], stages=4,
|
||||
warp_count=[4, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[1, 1, 1],
|
||||
element_a=cutlass.float32, element_b=cutlass.float32,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.Simt,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=1)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 8], stages=4,
|
||||
warp_count=[2, 4, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**26, 2**26)
|
||||
unittest.main()
|
||||
@ -0,0 +1,86 @@
|
||||
# test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
class Conv2dDgradImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
|
||||
def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 8],
|
||||
element_a=cutlass.float32, element_b=cutlass.float32,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 16], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 8],
|
||||
element_a=cutlass.float32, element_b=cutlass.float32,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 16], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**26, 2**26)
|
||||
unittest.main()
|
||||
@ -0,0 +1,154 @@
|
||||
# test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
def conv2d_few_channel_problemsizes(channels):
|
||||
problem_sizes = [
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 8, 8, channels),
|
||||
cutlass.Tensor4DCoord(16, 3, 3, channels),
|
||||
cutlass.Tensor4DCoord(1, 1, 1, 1),
|
||||
cutlass.MatrixCoord(2, 2),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 16, 16, channels),
|
||||
cutlass.Tensor4DCoord(16, 3, 3, channels),
|
||||
cutlass.Tensor4DCoord(1, 1, 1, 1),
|
||||
cutlass.MatrixCoord(2, 2),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 16, 16, channels),
|
||||
cutlass.Tensor4DCoord(16, 7, 7, channels),
|
||||
cutlass.Tensor4DCoord(1, 1, 1, 1),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 224, 224, channels),
|
||||
cutlass.Tensor4DCoord(32, 7, 7, channels),
|
||||
cutlass.Tensor4DCoord(1, 1, 1, 1),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 224, 224, channels),
|
||||
cutlass.Tensor4DCoord(64, 7, 7, channels),
|
||||
cutlass.Tensor4DCoord(1, 1, 1, 1),
|
||||
cutlass.MatrixCoord(2, 2),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 224, 224, channels),
|
||||
cutlass.Tensor4DCoord(64, 5, 5, channels),
|
||||
cutlass.Tensor4DCoord(1, 1, 1, 1),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 224, 224, channels),
|
||||
cutlass.Tensor4DCoord(64, 5, 5, channels),
|
||||
cutlass.Tensor4DCoord(1, 1, 1, 1),
|
||||
cutlass.MatrixCoord(2, 2),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
]
|
||||
|
||||
return problem_sizes
|
||||
|
||||
class Conv2dFpropFewChannelsF16NHWCF16NHWCF16HNWCTensorOpF32SM80(unittest.TestCase):
|
||||
def test_SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=2)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=2)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 64], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.few_channels,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation, conv2d_few_channel_problemsizes(2)))
|
||||
|
||||
def test_SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_1(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 8],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=1)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=1)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 32], stages=2,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.few_channels,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation, conv2d_few_channel_problemsizes(1)))
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**26, 2**26)
|
||||
unittest.main()
|
||||
@ -0,0 +1,175 @@
|
||||
# test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
def conv2d_fixed_channel_problemsizes(channels):
|
||||
problem_sizes = [
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 8, 8, channels),
|
||||
cutlass.Tensor4DCoord(16, 3, 3, channels),
|
||||
cutlass.Tensor4DCoord(1, 1, 1, 1),
|
||||
cutlass.MatrixCoord(2, 2),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 224, 224, channels),
|
||||
cutlass.Tensor4DCoord(32, 7, 7, channels),
|
||||
cutlass.Tensor4DCoord(1, 1, 1, 1),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 224, 224, channels),
|
||||
cutlass.Tensor4DCoord(64, 7, 7, channels),
|
||||
cutlass.Tensor4DCoord(1, 1, 1, 1),
|
||||
cutlass.MatrixCoord(2, 2),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 224, 224, channels),
|
||||
cutlass.Tensor4DCoord(64, 5, 5, channels),
|
||||
cutlass.Tensor4DCoord(1, 1, 1, 1),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 224, 224, channels),
|
||||
cutlass.Tensor4DCoord(64, 5, 5, channels),
|
||||
cutlass.Tensor4DCoord(1, 1, 1, 1),
|
||||
cutlass.MatrixCoord(2, 2),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
]
|
||||
|
||||
return problem_sizes
|
||||
|
||||
class Conv2dFpropFixedChannelsF16NHWCF16NHWCF16HNWCTensorOpF32SM80(unittest.TestCase):
|
||||
def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_8(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 64], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.fixed_channels,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation, conv2d_fixed_channel_problemsizes(8)))
|
||||
|
||||
def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_4(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 64], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.fixed_channels,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation, conv2d_fixed_channel_problemsizes(4)))
|
||||
|
||||
def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=2)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=2)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 64], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.fixed_channels,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation, conv2d_fixed_channel_problemsizes(2)))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**26, 2**26)
|
||||
unittest.main()
|
||||
@ -0,0 +1,291 @@
|
||||
# test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
class Conv2dFpropImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
|
||||
def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 64], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float16, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 64], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float16, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=2)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=2)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 64], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float16, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
problem_sizes = [
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 4, 4, 12),
|
||||
cutlass.Tensor4DCoord(8, 3, 3, 12),
|
||||
cutlass.Tensor4DCoord(0, 0, 0, 0),
|
||||
cutlass.MatrixCoord(3, 3),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 4, 4, 14),
|
||||
cutlass.Tensor4DCoord(8, 3, 3, 14),
|
||||
cutlass.Tensor4DCoord(0, 0, 0, 0),
|
||||
cutlass.MatrixCoord(3, 3),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 23, 56, 98),
|
||||
cutlass.Tensor4DCoord(128, 3, 3, 98),
|
||||
cutlass.Tensor4DCoord(4, 0, 5, 0),
|
||||
cutlass.MatrixCoord(3, 3),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
]
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation, problem_sizes))
|
||||
|
||||
def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=2)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=2)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 64], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float16, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
problem_sizes = [
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 4, 4, 12),
|
||||
cutlass.Tensor4DCoord(8, 3, 3, 12),
|
||||
cutlass.Tensor4DCoord(0, 0, 0, 0),
|
||||
cutlass.MatrixCoord(3, 3),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 4, 4, 14),
|
||||
cutlass.Tensor4DCoord(8, 3, 3, 14),
|
||||
cutlass.Tensor4DCoord(0, 0, 0, 0),
|
||||
cutlass.MatrixCoord(3, 3),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 23, 56, 98),
|
||||
cutlass.Tensor4DCoord(128, 3, 3, 98),
|
||||
cutlass.Tensor4DCoord(4, 0, 5, 0),
|
||||
cutlass.MatrixCoord(3, 3),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
]
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation, problem_sizes))
|
||||
|
||||
def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 64], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float16, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
problem_sizes = [
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 4, 4, 12),
|
||||
cutlass.Tensor4DCoord(8, 3, 3, 12),
|
||||
cutlass.Tensor4DCoord(0, 0, 0, 0),
|
||||
cutlass.MatrixCoord(3, 3),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 4, 4, 28),
|
||||
cutlass.Tensor4DCoord(8, 3, 3, 28),
|
||||
cutlass.Tensor4DCoord(0, 0, 0, 0),
|
||||
cutlass.MatrixCoord(3, 3),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 23, 56, 100),
|
||||
cutlass.Tensor4DCoord(128, 3, 3, 100),
|
||||
cutlass.Tensor4DCoord(4, 0, 5, 0),
|
||||
cutlass.MatrixCoord(3, 3),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
]
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation, problem_sizes))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**26, 2**26)
|
||||
unittest.main()
|
||||
@ -0,0 +1,48 @@
|
||||
# test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
class Conv2dFpropImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
|
||||
def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 64], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**26, 2**26)
|
||||
unittest.main()
|
||||
@ -0,0 +1,87 @@
|
||||
# test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass.conv2d_operation import *
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
class Conv2dFpropImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
|
||||
def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[1, 1, 1],
|
||||
element_a=cutlass.float32, element_b=cutlass.float32,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.Simt,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=1)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 8], stages=4,
|
||||
warp_count=[4, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle2
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[1, 1, 1],
|
||||
element_a=cutlass.float32, element_b=cutlass.float32,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.Simt,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=1)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 8], stages=4,
|
||||
warp_count=[2, 4, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**26, 2**26)
|
||||
unittest.main()
|
||||
@ -0,0 +1,98 @@
|
||||
# test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
class Conv2dFpropImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
|
||||
def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 8],
|
||||
element_a=cutlass.float32, element_b=cutlass.float32,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 16], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_align2(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 8],
|
||||
element_a=cutlass.float32, element_b=cutlass.float32,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=2)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=2)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 16], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
problem_sizes = [
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 4, 4, 12),
|
||||
cutlass.Tensor4DCoord(8, 3, 3, 12),
|
||||
cutlass.Tensor4DCoord(0, 0, 0, 0),
|
||||
cutlass.MatrixCoord(3, 3),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
)
|
||||
]
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation, problem_sizes))
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**26, 2**26)
|
||||
unittest.main()
|
||||
@ -0,0 +1,235 @@
|
||||
# test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
class Conv2dStridedDgradImplicitGemmF16NHWCF16NHWCF32NHWCTensorOpF32SM80(unittest.TestCase):
|
||||
def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 32], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.StridedDgradIdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x256_64x3_64x64x64(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 256, 64], stages=3,
|
||||
warp_count=[2, 4, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.StridedDgradIdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4_128x128_32x3_64x64x32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 32], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.StridedDgradIdentitySwizzle1
|
||||
)
|
||||
|
||||
problem_sizes = [
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 4, 4, 12),
|
||||
cutlass.Tensor4DCoord(8, 3, 3, 12),
|
||||
cutlass.Tensor4DCoord(0, 0, 0, 0),
|
||||
cutlass.MatrixCoord(3, 3),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
]
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation, problem_sizes))
|
||||
|
||||
def test_SM80_Device_Conv2d_Strided_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 32], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.StridedDgradIdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_SM80_Device_Conv2d_Strided_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32_align4(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 32], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.StridedDgradIdentitySwizzle1
|
||||
)
|
||||
|
||||
problem_sizes = [
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 56, 56, 12),
|
||||
cutlass.Tensor4DCoord(8, 1, 1, 12),
|
||||
cutlass.Tensor4DCoord(0, 0, 0, 0),
|
||||
cutlass.MatrixCoord(2, 2),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 55, 55, 12),
|
||||
cutlass.Tensor4DCoord(8, 1, 1, 12),
|
||||
cutlass.Tensor4DCoord(0, 0, 0, 0),
|
||||
cutlass.MatrixCoord(2, 2),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
]
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation, problem_sizes))
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**26, 2**26)
|
||||
unittest.main()
|
||||
@ -0,0 +1,86 @@
|
||||
# test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
class Conv2dWgradImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
|
||||
def test_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 64], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.wgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float16, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 64], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.wgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float16, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**26, 2**26)
|
||||
unittest.main()
|
||||
@ -0,0 +1,224 @@
|
||||
# test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
class Conv2dWgradImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
|
||||
def test_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 8],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 16], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.wgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 8],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 16], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.wgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_64x256_32x4_64x64x32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[64, 256, 32], stages=3,
|
||||
warp_count=[1, 4, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.wgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 8],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 16], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.wgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
problem_sizes = [
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 4, 4, 12),
|
||||
cutlass.Tensor4DCoord(8, 3, 3, 12),
|
||||
cutlass.Tensor4DCoord(0, 0, 0, 0),
|
||||
cutlass.MatrixCoord(3, 3),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
]
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation, problem_sizes))
|
||||
|
||||
def test_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 8],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 16], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.wgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
problem_sizes = [
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 4, 4, 12),
|
||||
cutlass.Tensor4DCoord(8, 3, 3, 12),
|
||||
cutlass.Tensor4DCoord(0, 0, 0, 0),
|
||||
cutlass.MatrixCoord(3, 3),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
]
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation, problem_sizes))
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**26, 2**26)
|
||||
unittest.main()
|
||||
@ -0,0 +1,87 @@
|
||||
# test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass.conv2d_operation import *
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
class Conv2dWgradImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
|
||||
def test_SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[1, 1, 1],
|
||||
element_a=cutlass.float32, element_b=cutlass.float32,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.Simt,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=1)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 8], stages=4,
|
||||
warp_count=[2, 4, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.wgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[1, 1, 1],
|
||||
element_a=cutlass.float32, element_b=cutlass.float32,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.Simt,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=1)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 8], stages=4,
|
||||
warp_count=[2, 4, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.wgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**26, 2**26)
|
||||
unittest.main()
|
||||
@ -0,0 +1,98 @@
|
||||
# test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
class Conv2dWgradImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
|
||||
def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 8],
|
||||
element_a=cutlass.float32, element_b=cutlass.float32,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 16], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.wgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_align1(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 8],
|
||||
element_a=cutlass.float32, element_b=cutlass.float32,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=1)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=1)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 32], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.wgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
problem_sizes = [
|
||||
cutlass.conv.Conv2dProblemSize(
|
||||
cutlass.Tensor4DCoord(1, 8, 8, 1),
|
||||
cutlass.Tensor4DCoord(1, 3, 3, 1),
|
||||
cutlass.Tensor4DCoord(1, 1, 1, 1),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.MatrixCoord(1, 1),
|
||||
cutlass.conv.Mode.cross_correlation,
|
||||
1, 1
|
||||
),
|
||||
]
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation, problem_sizes))
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**26, 2**26)
|
||||
unittest.main()
|
||||
tools/library/scripts/pycutlass/test/conv/run_all_tests.py (new file, 10 lines)
@ -0,0 +1,10 @@
import pycutlass
import unittest
from pycutlass.memory_manager import *

if __name__ == '__main__':
    pycutlass.get_memory_pool(2**32, 2**32)
    loader = unittest.TestLoader()
    tests = loader.discover('./', 'conv2d_*.py')
    testRunner = unittest.runner.TextTestRunner()
    testRunner.run(tests)
@ -0,0 +1 @@
CUPY_CACHE_DIR=./ python test_frontend.py
tools/library/scripts/pycutlass/test/frontend/test_frontend.py (new file, 136 lines)
@ -0,0 +1,136 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
## Test cases for the PyTorch and CuPy frontends
|
||||
import pycutlass
|
||||
import unittest
|
||||
from pycutlass import *
|
||||
import torch
|
||||
import cupy as cp
|
||||
|
||||
|
||||
class Test_Frontend(unittest.TestCase):
|
||||
def setUp(self) -> None:
|
||||
#
|
||||
# define the cutlass operator
|
||||
#
|
||||
math_inst = MathInstruction(
|
||||
[1, 1, 1], cutlass.float32, cutlass.float32, cutlass.float32,
|
||||
cutlass.OpClass.Simt, MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
[128, 128, 8], 4, [2, 4, 1],
|
||||
math_inst, 80, 80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
cutlass.float32, cutlass.RowMajor, 1
|
||||
)
|
||||
|
||||
B = TensorDescription(
|
||||
cutlass.float32, cutlass.RowMajor, 1
|
||||
)
|
||||
|
||||
C = TensorDescription(
|
||||
cutlass.float32, cutlass.RowMajor, 1
|
||||
)
|
||||
|
||||
self.operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=cutlass.float32,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1
|
||||
)
|
||||
|
||||
pycutlass.compiler.add_module([self.operation,])
|
||||
|
||||
|
||||
def test_torch_frontend(self):
|
||||
problem_size = cutlass.gemm.GemmCoord(512, 256, 128)
|
||||
|
||||
tensor_A = torch.ceil(torch.empty(size=(problem_size.m(), problem_size.k()), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5))
|
||||
tensor_B = torch.ceil(torch.empty(size=(problem_size.k(), problem_size.n()), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5))
|
||||
tensor_C = torch.ceil(torch.empty(size=(problem_size.m(), problem_size.n()), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5))
|
||||
tensor_D = torch.empty_like(tensor_C)
|
||||
|
||||
|
||||
alpha = 1.0
|
||||
beta = 0.0
|
||||
|
||||
arguments = GemmArguments(
|
||||
operation=self.operation, problem_size=problem_size,
|
||||
A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D,
|
||||
output_op=LinearCombinationFunctorArguments(alpha, beta),
|
||||
gemm_mode=cutlass.gemm.Mode.Gemm, split_k_splices=1
|
||||
)
|
||||
|
||||
self.operation.run(arguments)
|
||||
|
||||
arguments.sync()
|
||||
|
||||
tensor_D_ref = alpha * tensor_A @ tensor_B + beta * tensor_C
|
||||
|
||||
self.assertTrue(torch.equal(tensor_D, tensor_D_ref))
|
||||
|
||||
def test_cupy_frontend(self):
|
||||
cp.cuda.set_allocator(rmm.rmm_cupy_allocator)
|
||||
|
||||
problem_size = cutlass.gemm.GemmCoord(512, 256, 128)
|
||||
|
||||
tensor_A = cp.ceil(cp.random.uniform(low=-8.5, high=7.5, size=(problem_size.m(), problem_size.k()), dtype=cp.float32))
|
||||
tensor_B = cp.ceil(cp.random.uniform(low=-8.5, high=7.5, size=(problem_size.k(), problem_size.n()), dtype=cp.float32))
|
||||
tensor_C = cp.ceil(cp.random.uniform(low=-8.5, high=7.5, size=(problem_size.m(), problem_size.n()), dtype=cp.float32))
|
||||
tensor_D = cp.ones_like(tensor_C)
|
||||
|
||||
alpha = 1.0
|
||||
beta = 1.0
|
||||
|
||||
tensor_D_ref = alpha * tensor_A @ tensor_B + beta * tensor_C
|
||||
|
||||
arguments = GemmArguments(
|
||||
operation=self.operation, problem_size=problem_size,
|
||||
A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D,
|
||||
output_op=LinearCombinationFunctorArguments(alpha, beta),
|
||||
gemm_mode=cutlass.gemm.Mode.Gemm, split_k_splices=1
|
||||
)
|
||||
|
||||
self.operation.run(arguments)
|
||||
|
||||
arguments.sync()
|
||||
|
||||
self.assertTrue(cp.array_equal(tensor_D, tensor_D_ref))
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**32, 2**32)
|
||||
unittest.main()
|
||||
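The torch and cupy cases above exercise two device-array frontends against the same compiled GemmOperationUniversal. As an editorial illustration only (not part of the original diff), a host-numpy variant might look like the sketch below. It assumes that GemmArguments also accepts flattened host numpy arrays (as the pycutlass GEMM examples do) and that arguments.sync() copies D back to the host; the function name run_numpy_gemm and the flat-array shapes are assumptions for illustration, and the pycutlass names come from the `from pycutlass import *` in test_frontend.py above.

# Editorial sketch, not part of the original change. Assumes 'operation' is the
# f32 SIMT GemmOperationUniversal built in Test_Frontend.setUp() and already
# compiled via pycutlass.compiler.add_module().
import numpy as np

def run_numpy_gemm(operation):
    problem_size = cutlass.gemm.GemmCoord(512, 256, 128)

    # flattened host arrays, mirroring the value ranges used by the torch/cupy tests
    tensor_A = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.m() * problem_size.k(),))).astype(np.float32)
    tensor_B = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.k() * problem_size.n(),))).astype(np.float32)
    tensor_C = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.m() * problem_size.n(),))).astype(np.float32)
    tensor_D = np.zeros_like(tensor_C)

    alpha = 1.0
    beta = 0.0

    # same argument structure as the torch/cupy tests above
    arguments = GemmArguments(
        operation=operation, problem_size=problem_size,
        A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D,
        output_op=LinearCombinationFunctorArguments(alpha, beta),
        gemm_mode=cutlass.gemm.Mode.Gemm, split_k_splices=1
    )

    operation.run(arguments)
    arguments.sync()

    # host-side reference; A, B, C are row-major in this testbed
    A = tensor_A.reshape(problem_size.m(), problem_size.k())
    B = tensor_B.reshape(problem_size.k(), problem_size.n())
    C = tensor_C.reshape(problem_size.m(), problem_size.n())
    tensor_D_ref = alpha * A @ B + beta * C

    return np.array_equal(tensor_D.reshape(problem_size.m(), problem_size.n()), tensor_D_ref)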
tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm80.py (new file, 93 lines)
@ -0,0 +1,93 @@
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
from pycutlass.test.gemm_testbed import test_all_gemm
|
||||
|
||||
class GemmBF16TensorOpSm80(unittest.TestCase):
|
||||
def SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32_64x128x64_32x64x64(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.bfloat16, element_b=cutlass.bfloat16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[64, 128, 64],
|
||||
stages=4, warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.bfloat16, layout=cutlass.ColumnMajor,
|
||||
alignment=8
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.bfloat16, layout=cutlass.ColumnMajor,
|
||||
alignment=8
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32, layout=cutlass.RowMajor,
|
||||
alignment=4
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float32
|
||||
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination
|
||||
|
||||
swizzling_functor = cutlass.IdentitySwizzle1
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "universal"))
|
||||
|
||||
def test_SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32_128x256x64_64x64x64(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.bfloat16, element_b=cutlass.bfloat16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[64, 128, 32],
|
||||
stages=6, warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.bfloat16, layout=cutlass.RowMajor,
|
||||
alignment=8
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.bfloat16, layout=cutlass.RowMajor,
|
||||
alignment=8
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.bfloat16, layout=cutlass.RowMajor,
|
||||
alignment=8
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float32
|
||||
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination
|
||||
|
||||
swizzling_functor = cutlass.IdentitySwizzle1
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "multistage"))
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**24, 2**24)
|
||||
unittest.main()
|
||||
tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm80.py (new file, 425 lines)
@ -0,0 +1,425 @@
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
from pycutlass.test.gemm_testbed import test_all_gemm
|
||||
|
||||
|
||||
class GemmF16Sm80(unittest.TestCase):
|
||||
def test_SM80_Device_Gemm_f32t_f32n_f32t_tensor_op_bf16_f32_128x128x32_64x64x32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 32],
|
||||
stages=3, warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.ColumnMajor,
|
||||
alignment=8
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.RowMajor,
|
||||
alignment=8
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32, layout=cutlass.ColumnMajor,
|
||||
alignment=4
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float32
|
||||
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination
|
||||
|
||||
swizzling_functor = cutlass.BatchedIdentitySwizzle
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor,
|
||||
direct_store=True
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "universal"))
|
||||
|
||||
def test_SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32_128x128x64_64x64x64(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 64],
|
||||
stages=3, warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.ColumnMajor,
|
||||
alignment=8
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.ColumnMajor,
|
||||
alignment=8
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.RowMajor,
|
||||
alignment=8
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float32
|
||||
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination
|
||||
|
||||
swizzling_functor = cutlass.IdentitySwizzle1
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "universal"))
|
||||
|
||||
def test_SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32_128x256x64_64x64x64(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 256, 64],
|
||||
stages=3, warp_count=[2, 4, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.ColumnMajor,
|
||||
alignment=8
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.ColumnMajor,
|
||||
alignment=8
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32, layout=cutlass.ColumnMajor,
|
||||
alignment=4
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float32
|
||||
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination
|
||||
|
||||
swizzling_functor = cutlass.IdentitySwizzle1
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "universal"))
|
||||
|
||||
def test_SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32_256x128x64_64x64x64(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[256, 128, 64],
|
||||
stages=3, warp_count=[4, 2, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.ColumnMajor,
|
||||
alignment=8
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.ColumnMajor,
|
||||
alignment=8
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32, layout=cutlass.RowMajor,
|
||||
alignment=4
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float32
|
||||
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination
|
||||
|
||||
swizzling_functor = cutlass.IdentitySwizzle1
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "universal"))
|
||||
|
||||
def test_SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16_sliced_k_128x64x64_64x64x32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 64, 64],
|
||||
stages=3, warp_count=[2, 1, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.ColumnMajor,
|
||||
alignment=8
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.RowMajor,
|
||||
alignment=8
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.RowMajor,
|
||||
alignment=4
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float16
|
||||
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination
|
||||
|
||||
swizzling_functor = cutlass.IdentitySwizzle1
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "universal"))
|
||||
|
||||
def test_SM80_Device_GemmUniversal_f16n_f16t_f32t_tensor_op_f32_64x64x32_32x32x32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[64, 64, 32],
|
||||
stages=10, warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.ColumnMajor,
|
||||
alignment=8
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.RowMajor,
|
||||
alignment=8
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.RowMajor,
|
||||
alignment=4
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float16
|
||||
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination
|
||||
|
||||
swizzling_functor = cutlass.IdentitySwizzle1
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "universal"))
|
||||
|
||||
def test_SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32_256x128x64_64x64x64(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[256, 128, 64],
|
||||
stages=3, warp_count=[4, 2, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.ColumnMajor,
|
||||
alignment=8
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.RowMajor,
|
||||
alignment=8
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.RowMajor,
|
||||
alignment=8
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float32
|
||||
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination
|
||||
|
||||
swizzling_functor = cutlass.IdentitySwizzle1
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "universal"))
|
||||
|
||||
def test_test_SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16_sliced_k_128x64x64_64x64x32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 64, 64],
|
||||
stages=3, warp_count=[2, 1, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.RowMajor,
|
||||
alignment=8
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.ColumnMajor,
|
||||
alignment=8
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.RowMajor,
|
||||
alignment=4
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float32
|
||||
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination
|
||||
|
||||
swizzling_functor = cutlass.IdentitySwizzle1
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "universal"))
|
||||
|
||||
def test_SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32_128x256x64_64x64x64(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 256, 64],
|
||||
stages=3, warp_count=[2, 4, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.RowMajor,
|
||||
alignment=8
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.RowMajor,
|
||||
alignment=8
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.ColumnMajor,
|
||||
alignment=8
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float32
|
||||
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination
|
||||
|
||||
swizzling_functor = cutlass.IdentitySwizzle1
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "universal"))
|
||||
|
||||
def test_SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32_128x256x64_64x64x64(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 256, 64],
|
||||
stages=3, warp_count=[2, 4, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.ColumnMajor,
|
||||
alignment=8
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.ColumnMajor,
|
||||
alignment=8
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32, layout=cutlass.ColumnMajor,
|
||||
alignment=4
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float32
|
||||
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination
|
||||
|
||||
swizzling_functor = cutlass.IdentitySwizzle1
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "universal"))
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**24, 2**24)
|
||||
unittest.main()
|
||||
tools/library/scripts/pycutlass/test/gemm/gemm_f32_sm80.py (new file, 138 lines)
@ -0,0 +1,138 @@
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.memory_manager import get_allocated_size
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
from pycutlass.test.gemm_testbed import test_all_gemm
|
||||
|
||||
|
||||
class GemmF32nF32nF32nTensorOpF32Sm80(unittest.TestCase):
|
||||
def test_SM80_Device_Gemm_f32t_f32n_f32t_tensor_op_bf16_f32_128x128x32_64x64x32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 8],
|
||||
element_a=cutlass.float32, element_b=cutlass.float32,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add_fast_bf16
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 32],
|
||||
stages=3, warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.float32, layout=cutlass.RowMajor,
|
||||
alignment=4
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.float32, layout=cutlass.ColumnMajor,
|
||||
alignment=4
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32, layout=cutlass.RowMajor,
|
||||
alignment=4
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float32
|
||||
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination
|
||||
|
||||
swizzling_functor = cutlass.IdentitySwizzle1
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "universal"))
|
||||
|
||||
|
||||
def test_SM80_Device_Gemm_f32n_f32n_f32t_tensor_op_f32_128x128x32_64x64x32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 8],
|
||||
element_a=cutlass.float32, element_b=cutlass.float32,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 32],
|
||||
stages=3, warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.float32, layout=cutlass.ColumnMajor,
|
||||
alignment=4
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.float32, layout=cutlass.ColumnMajor,
|
||||
alignment=4
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32, layout=cutlass.RowMajor,
|
||||
alignment=4
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float32
|
||||
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination
|
||||
|
||||
swizzling_functor = cutlass.IdentitySwizzle1
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "universal"))
|
||||
|
||||
def test_SM80_Device_Gemm_f32n_f32n_f32t_tensor_op_fast_accurate_f32_64x64x32_32x32x32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 8],
|
||||
element_a=cutlass.float32, element_b=cutlass.float32,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add_fast_f32
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[64, 64, 32],
|
||||
stages=3, warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.float32, layout=cutlass.ColumnMajor,
|
||||
alignment=4
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.float32, layout=cutlass.ColumnMajor,
|
||||
alignment=4
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32, layout=cutlass.RowMajor,
|
||||
alignment=4
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float32
|
||||
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination
|
||||
|
||||
swizzling_functor = cutlass.IdentitySwizzle1
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "universal"))
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**24, 2**24)
|
||||
pycutlass.compiler.load_from_cache()
|
||||
unittest.main()
|
||||
tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm80.py (new file, 95 lines)
@ -0,0 +1,95 @@
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
from pycutlass.test.gemm_testbed import test_all_gemm
|
||||
|
||||
class GemmF64TensorOpSm80(unittest.TestCase):
|
||||
def test_SM80_Device_Gemm_f64n_f64t_f64t_tensor_op_f64_32x32x16_16x16x16(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[8, 8, 4],
|
||||
element_a=cutlass.float64, element_b=cutlass.float64,
|
||||
element_accumulator=cutlass.float64, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[32, 32, 16],
|
||||
stages=4, warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
# alignment 1 restricted for double
|
||||
A = TensorDescription(
|
||||
element=cutlass.float64, layout=cutlass.ColumnMajor,
|
||||
alignment=1
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.float64, layout=cutlass.RowMajor,
|
||||
alignment=1
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float64, layout=cutlass.RowMajor,
|
||||
alignment=1
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float64
|
||||
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination
|
||||
|
||||
swizzling_functor = cutlass.IdentitySwizzle1
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "universal"))
|
||||
|
||||
def test_SM80_Device_Gemm_f64t_f64n_f64t_tensor_op_f64_64x64x16_32x32x16(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[8, 8, 4],
|
||||
element_a=cutlass.float64, element_b=cutlass.float64,
|
||||
element_accumulator=cutlass.float64, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[64, 64, 16],
|
||||
stages=4, warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
# alignment 1 restricted for double
|
||||
A = TensorDescription(
|
||||
element=cutlass.float64, layout=cutlass.RowMajor,
|
||||
alignment=1
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.float64, layout=cutlass.ColumnMajor,
|
||||
alignment=1
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float64, layout=cutlass.RowMajor,
|
||||
alignment=1
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float64
|
||||
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination
|
||||
|
||||
swizzling_functor = cutlass.IdentitySwizzle1
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "universal"))
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**24, 2**24)
|
||||
unittest.main()
|
||||
tools/library/scripts/pycutlass/test/gemm/gemm_grouped_sm80.py (new file, 197 lines)
@ -0,0 +1,197 @@
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
from pycutlass.test.gemm_grouped_testbed import TestbedGrouped
|
||||
|
||||
|
||||
class GemmGroupedSm80(unittest.TestCase):
|
||||
def test_SM80_Device_GemmGrouped_f16n_f16t_f32n_tensor_op_f32_128x128x32_64x64x32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16], element_a=cutlass.float16,
|
||||
element_b=cutlass.float16, element_accumulator=cutlass.float32,
|
||||
opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 32],
|
||||
stages=3, warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.ColumnMajor,
|
||||
alignment=8
|
||||
)
|
||||
|
||||
B = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.ColumnMajor,
|
||||
alignment=8
|
||||
)
|
||||
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32, layout=cutlass.ColumnMajor,
|
||||
alignment=4
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float32
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination
|
||||
swizzling_functor = cutlass.BatchedIdentitySwizzle
|
||||
|
||||
for precompute_mode in [SchedulerMode.Device, SchedulerMode.Host]:
|
||||
operation = GemmOperationGrouped(
|
||||
tile_description.minimum_compute_capability,
|
||||
tile_description, A, B, C,
|
||||
element_epilogue,
|
||||
epilogue_functor, swizzling_functor,
|
||||
precompute_mode=precompute_mode
|
||||
)
|
||||
|
||||
testbed = TestbedGrouped(operation=operation)
|
||||
|
||||
self.assertTrue(testbed.run(24))
|
||||
|
||||
def test_SM80_Device_GemmGrouped_f64t_f64t_f64n_tensor_op_f64_64x64x16_32x32x16(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[8, 8, 4], element_a=cutlass.float64,
|
||||
element_b=cutlass.float64, element_accumulator=cutlass.float64,
|
||||
opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[64, 64, 16],
|
||||
stages=4, warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.float64, layout=cutlass.RowMajor,
|
||||
alignment=1
|
||||
)
|
||||
|
||||
B = TensorDescription(
|
||||
element=cutlass.float64, layout=cutlass.RowMajor,
|
||||
alignment=1
|
||||
)
|
||||
|
||||
C = TensorDescription(
|
||||
element=cutlass.float64, layout=cutlass.ColumnMajor,
|
||||
alignment=1
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float64
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination
|
||||
swizzling_functor = cutlass.BatchedIdentitySwizzle
|
||||
|
||||
for precompute_mode in [SchedulerMode.Device, SchedulerMode.Host]:
|
||||
operation = GemmOperationGrouped(
|
||||
tile_description.minimum_compute_capability,
|
||||
tile_description, A, B, C,
|
||||
element_epilogue,
|
||||
epilogue_functor, swizzling_functor,
|
||||
precompute_mode=precompute_mode
|
||||
)
|
||||
|
||||
testbed = TestbedGrouped(operation=operation)
|
||||
|
||||
self.assertTrue(testbed.run(24))
|
||||
|
||||
def test_SM80_Device_GemmGrouped_f32t_f32t_f32t_simt_f32_128x64x8_64x32x1(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[1, 1, 1], element_a=cutlass.float32,
|
||||
element_b=cutlass.float32, element_accumulator=cutlass.float32,
|
||||
opcode_class=cutlass.OpClass.Simt,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 64, 8],
|
||||
stages=4, warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.float32, layout=cutlass.RowMajor,
|
||||
alignment=1
|
||||
)
|
||||
|
||||
B = TensorDescription(
|
||||
element=cutlass.float32, layout=cutlass.RowMajor,
|
||||
alignment=1
|
||||
)
|
||||
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32, layout=cutlass.RowMajor,
|
||||
alignment=1
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float32
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination
|
||||
swizzling_functor = cutlass.BatchedIdentitySwizzle
|
||||
|
||||
for precompute_mode in [SchedulerMode.Device, SchedulerMode.Host]:
|
||||
operation = GemmOperationGrouped(
|
||||
tile_description.minimum_compute_capability,
|
||||
tile_description, A, B, C,
|
||||
element_epilogue,
|
||||
epilogue_functor, swizzling_functor,
|
||||
precompute_mode=precompute_mode
|
||||
)
|
||||
|
||||
testbed = TestbedGrouped(operation=operation)
|
||||
|
||||
self.assertTrue(testbed.run(27))
|
||||
|
||||
def test_SM80_Device_GemmGrouped_f16n_f16t_f32n_tensor_op_f32_128x128x32_64x64x32_cache(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16], element_a=cutlass.float16,
|
||||
element_b=cutlass.float16, element_accumulator=cutlass.float32,
|
||||
opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 32],
|
||||
stages=3, warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.ColumnMajor,
|
||||
alignment=8
|
||||
)
|
||||
|
||||
B = TensorDescription(
|
||||
element=cutlass.float16, layout=cutlass.ColumnMajor,
|
||||
alignment=8
|
||||
)
|
||||
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32, layout=cutlass.ColumnMajor,
|
||||
alignment=4
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float32
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination
|
||||
swizzling_functor = cutlass.BatchedIdentitySwizzle
|
||||
|
||||
for precompute_mode in [SchedulerMode.Device, SchedulerMode.Host]:
|
||||
operation = GemmOperationGrouped(
|
||||
tile_description.minimum_compute_capability,
|
||||
tile_description, A, B, C,
|
||||
element_epilogue,
|
||||
epilogue_functor, swizzling_functor,
|
||||
precompute_mode=precompute_mode
|
||||
)
|
||||
|
||||
testbed = TestbedGrouped(operation=operation)
|
||||
|
||||
self.assertTrue(testbed.run(5))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**26, 2**26)
|
||||
unittest.main()
|
||||
tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm80.py (new file, 219 lines)
@ -0,0 +1,219 @@
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
from pycutlass.test.gemm_testbed import test_all_gemm
|
||||
|
||||
class GemmS8TensorOpF32Sm80(unittest.TestCase):
|
||||
def test_SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32_64x64x64_32x32x64(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 32],
|
||||
element_a=cutlass.int8, element_b=cutlass.int8,
|
||||
element_accumulator=cutlass.int32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add_saturate
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[64, 64, 64],
|
||||
stages=6, warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.int8, layout=cutlass.ColumnMajorInterleaved32,
|
||||
alignment=16
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.int8, layout=cutlass.RowMajorInterleaved32,
|
||||
alignment=16
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.int8, layout=cutlass.ColumnMajorInterleaved32,
|
||||
alignment=8
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float32
|
||||
|
||||
epilogue_functor = EpilogueFunctor.FastLinearCombinationClamp
|
||||
|
||||
swizzling_functor = cutlass.IdentitySwizzle1
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "interleaved"))
|
||||
|
||||
def test_SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32_256x128x128_64x64x128(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 32],
|
||||
element_a=cutlass.int8, element_b=cutlass.int8,
|
||||
element_accumulator=cutlass.int32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 128],
|
||||
stages=3, warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.int8, layout=cutlass.RowMajor,
|
||||
alignment=16
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.int8, layout=cutlass.ColumnMajor,
|
||||
alignment=16
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.int8, layout=cutlass.RowMajor,
|
||||
alignment=16
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float32
|
||||
|
||||
epilogue_functor = EpilogueFunctor.FastLinearCombinationClamp
|
||||
|
||||
swizzling_functor = cutlass.IdentitySwizzle1
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "multistage"))
|
||||
|
||||
def test_SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32_128x128x128_64x64x128(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 32],
|
||||
element_a=cutlass.int8, element_b=cutlass.int8,
|
||||
element_accumulator=cutlass.int32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 128],
|
||||
stages=3, warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.int8, layout=cutlass.RowMajor,
|
||||
alignment=16
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.int8, layout=cutlass.ColumnMajor,
|
||||
alignment=16
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.int8, layout=cutlass.ColumnMajor,
|
||||
alignment=16
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.float32
|
||||
|
||||
epilogue_functor = EpilogueFunctor.FastLinearCombinationClamp
|
||||
|
||||
swizzling_functor = cutlass.IdentitySwizzle1
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "multistage"))
|
||||
|
||||
def test_SM80_Device_Gemm_s8t_s8n_s32n_tensor_op_s32_128x128x128_64x64x128(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 32],
|
||||
element_a=cutlass.int8, element_b=cutlass.int8,
|
||||
element_accumulator=cutlass.int32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 128],
|
||||
stages=3, warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.int8, layout=cutlass.RowMajor,
|
||||
alignment=16
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.int8, layout=cutlass.ColumnMajor,
|
||||
alignment=16
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.int32, layout=cutlass.ColumnMajor,
|
||||
alignment=4
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.int32
|
||||
|
||||
epilogue_functor = EpilogueFunctor.LinearCombinationClamp
|
||||
|
||||
swizzling_functor = cutlass.IdentitySwizzle1
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "multistage"))
|
||||
|
||||
def test_SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32_128x128x128_64x64x128(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 32],
|
||||
element_a=cutlass.int8, element_b=cutlass.int8,
|
||||
element_accumulator=cutlass.int32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 128],
|
||||
stages=3, warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst, min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=cutlass.int8, layout=cutlass.RowMajor,
|
||||
alignment=16
|
||||
)
|
||||
B = TensorDescription(
|
||||
element=cutlass.int8, layout=cutlass.ColumnMajor,
|
||||
alignment=16
|
||||
)
|
||||
C = TensorDescription(
|
||||
element=cutlass.int32, layout=cutlass.RowMajor,
|
||||
alignment=4
|
||||
)
|
||||
|
||||
element_epilogue = cutlass.int32
|
||||
|
||||
epilogue_functor = EpilogueFunctor.LinearCombinationClamp
|
||||
|
||||
swizzling_functor = cutlass.IdentitySwizzle1
|
||||
|
||||
operation = GemmOperationUniversal(
|
||||
arch=80, tile_description=tile_description,
|
||||
A=A, B=B, C=C, element_epilogue=element_epilogue,
|
||||
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_gemm(operation, "multistage"))
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pycutlass.get_memory_pool(2**24, 2**24)
|
||||
unittest.main()
|
||||
@ -0,0 +1,9 @@
import pycutlass
import unittest

if __name__ == '__main__':
    pycutlass.get_memory_pool(2**26, 2**26)
    loader = unittest.TestLoader()
    tests = loader.discover('./', 'gemm_*.py')
    testRunner = unittest.runner.TextTestRunner()
    testRunner.run(tests)
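The conv and gemm suites above each ship their own run_all_tests.py. A combined single-process runner is not part of this change; the following is a minimal sketch, assuming it is launched from tools/library/scripts/pycutlass/test/ on a machine with a CUDA device, the usual pycutlass environment set up, and enough device memory for the shared pool.

# Editorial sketch of a combined runner, modeled on the two run_all_tests.py scripts above.
# Assumes the conv/ and gemm/ subdirectories are discoverable from the launch directory.
import pycutlass
import unittest

if __name__ == '__main__':
    pycutlass.get_memory_pool(2**32, 2**32)
    loader = unittest.TestLoader()
    suite = unittest.TestSuite()
    for start_dir, pattern in [('./conv', 'conv2d_*.py'), ('./gemm', 'gemm_*.py')]:
        suite.addTests(loader.discover(start_dir, pattern))
    unittest.runner.TextTestRunner().run(suite)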
@ -0,0 +1,350 @@
|
||||
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1767700736 2104699940 3506659864 557648934
|
||||
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1539314507 3971227455 1976927351 1642148785
|
||||
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 276489656 653235219 3147305346 880610205
|
||||
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 272457724 2178229139 2786201726 4170295839
|
||||
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 242235041 2149454506 784935854 682531065
|
||||
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 3478189705 1667216236 1437761176
|
||||
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 379326961 1780379994 3740415776
|
||||
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 924848818 3533854396 2683779476
|
||||
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2870331951 359232443 2147867990 1653277018
|
||||
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2870331951 3784314846 2644315999 4224154526
|
||||
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3787448414 3562991793 535073859 2563373454
|
||||
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 426169840 2464808416 864648234 461884698
|
||||
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2564934525 3910792915 3577331017 827498183
|
||||
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 28479234 867695528 1947311971 83328334
|
||||
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4192922822 4244595864 2296602326 2349214706
|
||||
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 274678245 3464152269 1682550229 3446204619
|
||||
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3993280136 828543035 1319748516 956044554
|
||||
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 832003025 3799813757 4030292245 457791957
|
||||
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1444316594 4129865888 93616503 412257611
|
||||
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2931873718 1841508064 1497852219 36703874
|
||||
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2931873718 1841508064 1497852219 1842147148
|
||||
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1612565294 109894479 1782187316 3370789453
|
||||
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 841569299 1010785577 1158956167 3261208135
|
||||
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1893352157 48149942 3544807462 446577726
|
||||
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 3585320147 2150950452 1625817025 3964129474
|
||||
conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 1227883689 3016005301 4142905842
|
||||
conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3337296764 4183699161 3654176452
|
||||
conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 3852963969 864006170 920352568
|
||||
conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2624318208 2750240096 2120184232 2600672872
|
||||
conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 3224082300 2084034673 3588056946
|
||||
conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3271403719 3033073939 304048758 1882633089
|
||||
conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1419588777 610026473 447427404 2639856195
|
||||
conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 763924990 2818680871 58428273 3332443900
|
||||
conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2578426561 1891702153 103393067 2558647731
|
||||
conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 162127134 3567670201 3173514764
|
||||
conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 162127134 3567670201 363897018
|
||||
conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2462511240 1350938697 1696306119 1005311005
|
||||
conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3884703009 3552725366 1975514757 1210310496
|
||||
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2056905385 447674669 724481645 1457430910
|
||||
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 1227883689 3401425854 3897766524
|
||||
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 3749787834 3350064812 1136116240
|
||||
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3414629540 820341033 770836461 2451581199
|
||||
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4100326666 2581696511 1088458082 1521190911
|
||||
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3662895757 2885454895 935600441 2615245898
|
||||
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 3831334389 3506139121 814982501
|
||||
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2154102133 737968461 1291834254 2665225480
|
||||
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3573498719 1809195644 1765637461
|
||||
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3573498719 3379808294 483095299
|
||||
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 4194153035 2863868771 1639389008
|
||||
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2624318208 157618421 1779474147 814087242
|
||||
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 2300180628 423968553 3890279569
|
||||
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2609259399 1848932917 522753581 1926508271
|
||||
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2948772873 3663040534 4014266327 1288646188
|
||||
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3271403719 1585195072 1487505772 3253374264
|
||||
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1419588777 451194147 3578359696 3659768981
|
||||
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 763924990 2780826684 2883769406 148530958
|
||||
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2578426561 3849874822 102765469 1305171059
|
||||
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 1995451256 2632815435 1516344656
|
||||
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 1995451256 2632815435 1586331550
|
||||
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2462511240 2274021368 1188866747 3178890497
|
||||
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 752289976 1226457131 4187777346 1400559240
|
||||
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3723912751 1585959358 3731079159 1498901684
|
||||
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 2027599472 2758666204 3287095476 4291916486
|
||||
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3393706648 3519979618 1149261202 799742106
|
||||
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3409586999 409840186 1724648597 2642018980
|
||||
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1815685330 1398622058 2431638856 1016967269
|
||||
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2555706782 3271563943 1020153035 299097281
|
||||
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 4173830187 736684125 472021975 2064613035
|
||||
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3010335403 2751224679 2250540122 3725638844
|
||||
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3010335403 1583610315 3287895411 2394340435
|
||||
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3010335403 2356047354 7055632 915702611
|
||||
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2748205217 2539405983 1217377670 2011175578
|
||||
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2748205217 2114448427 249997769 2711364520
|
||||
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1528321643 1532777511 3597171412 296622236
|
||||
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1326617037 3415095747 847196866 1481554158
|
||||
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1122706355 2841974626 2791878604 632900093
|
||||
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1728385278 2462678309 3066040807 1334515660
|
||||
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2175275779 1117731224 857614711 2096711962
|
||||
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 4140401170 3710340185 1683575469 317397427
|
||||
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3552249008 2918315307 2290683130 536859016
|
||||
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2869959072 2516947012 3328285094 2393284712
|
||||
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1349264322 1823945068 400087667 2893025864
|
||||
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3321662203 426084311 4233055093 4078572279
|
||||
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3321662203 426084311 4233055093 3044377475
|
||||
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 803041205 2521863610 3206942690 127091020
|
||||
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 4083508736 37801570 240515127 2234797539
|
||||
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2207374588 535059558 2268619394 1489214085
|
||||
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 dnhwc_dnhwc_dnhwc_d_d 3614026280 1721563676 2979825951 1104908081
|
||||
conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 810746104 2226238626 2053372396 2462697514
|
||||
conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 810746104 235646718 1374133172 3696289981
|
||||
conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2536722089 184705847 3148323124 84213385
|
||||
conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2264868815 1724845245 3498302256 4094034457
|
||||
conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1621735632 233390337 1801952602 3532884734
|
||||
conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3048346885 2306163504 642074123 4083120683
|
||||
conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2798030672 683783039 3025345160 1890891136
|
||||
conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1731071506 1844675436 2292509333 4006304179
|
||||
conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 132147677 604503886 143348844 3037223953
|
||||
conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1269799445 1678940393 3405733837 1820114523
|
||||
conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1269799445 1678940393 3405733837 467254076
|
||||
conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1794301352 2320042028 2134048179 508141072
|
||||
conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 561590023 3382154048 4154621995 517057927
|
||||
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 593915463 2360210889 2685491481 2265099675
|
||||
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 810746104 2226238626 1155815529 558646991
|
||||
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2536722089 1876429398 4216128545 1754596046
|
||||
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 348523586 2609019785 3938405680 2601133907
|
||||
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1984146316 1475870285 1157657800 1143965395
|
||||
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2971058593 1478256319 503014742 3930504182
|
||||
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1621735632 1214508920 1537003531 3830217225
|
||||
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2031518387 2695641559 933408074 4026827730
|
||||
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 810746104 517276344 1158854831 3123629043
|
||||
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 810746104 517276344 1448394173 1864626308
|
||||
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2536722089 711164468 2465036841 2993377049
|
||||
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2264868815 3003481795 333430991 3094857755
|
||||
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1621735632 1126010692 3313703859 637497110
|
||||
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1130094757 2605103293 2477101661 1276123281
|
||||
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 4286533436 1302900889 2613245986 2523724148
|
||||
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3048346885 923365529 1681226722 417509256
|
||||
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2798030672 3441819646 1293178065 188472807
|
||||
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1731071506 1117530547 2706270359 502156742
|
||||
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 132147677 2029225588 3851064913 3164530726
|
||||
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1269799445 2337137106 3312954197 2466682688
|
||||
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1269799445 2337137106 3312954197 2684544683
|
||||
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1794301352 72938921 2354994612 1463501392
|
||||
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 252570564 2903451081 3619280116 1448586411
|
||||
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2037991187 1665743881 241585763 103256264
|
||||
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 dnhwc_dnhwc_dnhwc_d_d 2653975581 3337638999 1440125233 2448165745
|
||||
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 991402150 1393431534 1148212814 1350914659
|
||||
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 4283492776 419570292 1210341563
|
||||
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4178596783 3828059710 2735749436 2671012171
|
||||
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 924522595 563724475 3750778972 4152580670
|
||||
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1021044158 1686067905 3765040166 4102272733
|
||||
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 2674994719 635224486 2759329777
|
||||
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 4201252830 2920298728 304256151
|
||||
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 70289262 646435722 4137562540
|
||||
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1317457392 1288095320 2132879813 656196754
|
||||
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1317457392 2202157489 2326567490 2475188414
|
||||
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2476454437 1857118302 4164386062 239840568
|
||||
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2767650699 3514840131 590439733 3879821123
|
||||
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3896287283 3112762669 2515107934 2106635937
|
||||
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1903067870 1021832870 3003938078 2751931686
|
||||
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3489785028 2466126497 1374078692 2737628040
|
||||
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2051350923 263676708 3639860119 1370886256
|
||||
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 719099834 1474713672 204857540 2768940347
|
||||
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3441724486 3162593831 421721594 3097845598
|
||||
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2034354027 1249407570 2567025479 1441082595
|
||||
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 941893937 3608468045 635631428 2369653089
|
||||
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 941893937 3608468045 635631428 1218705038
|
||||
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 172579142 319546523 718795680 1453661415
|
||||
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2823351660 1326352711 1110204809 1155441703
|
||||
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3238446487 2572503545 686287700 1559476701
|
||||
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_f_f 2149247508 1775375365 3317647029 2497607448
|
||||
conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 3464637181 1623218578 436154205
|
||||
conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1479940693 3253144559 3883419107
|
||||
conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 1871463331 2425320272 74566211
|
||||
conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3484040069 664160900 3610888033 22347127
|
||||
conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1924855848 1382111427 2541177413
|
||||
conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 868180534 1764715518 3070473696 2392864704
|
||||
conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3437976747 666906244 3401957738 2050602745
|
||||
conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4195072693 1575210381 781892324 2848949054
|
||||
conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3457330201 2316839359 1539389419 4293781748
|
||||
conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 2469024119 2885305868 2693098375
|
||||
conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 2469024119 2885305868 1969608051
|
||||
conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1690216859 554790212 2885143346 780489333
|
||||
conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 835105643 3337423971 3866137775
|
||||
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 927718585 4106152802 720400339 3989318043
|
||||
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4110991321 3464637181 4051957661 126285749
|
||||
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 832653836 3723472741 2044236350 2463899842
|
||||
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2075083065 2042513140 3691286135 322550345
|
||||
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4005590448 1116254439 2328237343 1918824440
|
||||
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 181075276 1743485155 3526891198 1979405632
|
||||
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1513864544 386662952 4057300775 1456746562
|
||||
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 856324887 3954249564 2340393915 4127188930
|
||||
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4110991321 1300426008 2921497047 4145791960
|
||||
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4110991321 1300426008 4080981223 3076991942
|
||||
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 832653836 447261065 3823545045 392205236
|
||||
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3484040069 2966693627 3900095420 919511892
|
||||
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1513864544 1759979610 4272621682 1029257940
|
||||
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1906605830 2980501720 978889789 3136018973
|
||||
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 805717279 3502822733 1810065278 1387739380
|
||||
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 868180534 3289288595 209477462 4142168174
|
||||
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3437976747 3391080565 97275649 4063718293
|
||||
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4195072693 1669352457 2182133559 2494741804
|
||||
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3457330201 1126870455 319272291 3811977088
|
||||
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 754609939 1723074453 1660326213 3902884425
|
||||
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 754609939 1723074453 1660326213 423159249
|
||||
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1690216859 2413490039 223529410 3303697952
|
||||
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3168796339 1601750164 1428743330 403295189
|
||||
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 261954979 1300976652 2749562370 3058142403
|
||||
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_f_f 3747142491 1747587481 3143977827 835130482
|
||||
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1736512560 49406874 846358010 3314905564
|
||||
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1848484956 1432417472 1903569827 3750799351
|
||||
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4236427320 3696009469 69852620 201921851
|
||||
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 109006944 450017448 1793784844 903209915
|
||||
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 813367872 2397796503 1928191746 3210229460
|
||||
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 1307184141 46021356 1674017987
|
||||
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 1212511562 3331767121 2446286369
|
||||
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 2013675943 1681111033 1469213228
|
||||
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1703349794 500298386 3218034344 4159283207
|
||||
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1703349794 1123534155 145385311 4273847179
|
||||
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3862659311 349459322 1503631520 1404971956
|
||||
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1623686755 961217371 552550209 3980749384
|
||||
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3554927580 1131648083 4149599295 3119557776
|
||||
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1767639287 3350675774 128324027 1059816532
|
||||
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3986143536 17411088 40173029 1694092310
|
||||
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1157793540 3513299281 48848814 1435528367
|
||||
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 988962069 4292634763 388976034 2674929544
|
||||
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4202383208 3529769234 1046186503 3368902675
|
||||
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 856448884 3057259762 2063087558 1995545427
|
||||
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2281940872 144496548 2455451862 400986166
|
||||
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2281940872 144496548 2455451862 1082696406
|
||||
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2702905851 1992889713 731289041 608504198
|
||||
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2742293143 4197915274 606840 3671124731
|
||||
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 149434841 2288560511 2994968424 2881838300
|
||||
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 2226824643 327135318 3718671210 2121176659
|
||||
conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 1027662440 4172720592 446082987
|
||||
conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 1101653138 3727072529 875733988
|
||||
conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 3906526127 655926291 939844058
|
||||
conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2784049299 2031878085 1709408312 1277173429
|
||||
conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 22652410 1700696921 2175632852
|
||||
conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1530672622 436588210 470857851 284463232
|
||||
conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1500864134 59350507 969037229 1510558485
|
||||
conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3344871528 856797938 2030818524 4231831552
|
||||
conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 966721255 2885833872 2829967135 3441569557
|
||||
conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 4148824382 2827420298 378131261
|
||||
conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 4148824382 2827420298 2955292920
|
||||
conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4028893260 1474248671 1302526250 4182204885
|
||||
conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1569788048 162506176 819639712 763595635
|
||||
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 289918791 1266976707 942688231 3457364823
|
||||
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 1027662440 2005082293 2235558527
|
||||
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 3380032042 1370040310 1348846927
|
||||
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 671982235 1423304149 2107662762 1234913781
|
||||
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 798317794 1709026638 2421185623 3308071321
|
||||
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1721270411 2519327328 2541413264 3185574975
|
||||
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 2070174510 1364436192 3531942595
|
||||
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2128738105 2056902987 3079166829 2329433528
|
||||
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3857917762 3227877956 645422556
|
||||
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3857917762 3817218800 985231315
|
||||
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 1398036015 3630062764 2492522537
|
||||
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2784049299 643733019 3649549642 2637869234
|
||||
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 2332160299 302086821 3303132343
|
||||
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1931093565 2458714707 2919710256 2311575036
|
||||
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2472246681 2260022344 500095455 2760458995
|
||||
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1530672622 3635363851 2402907878 4131497953
|
||||
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1500864134 2536338700 2459524764 2504484273
|
||||
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3344871528 2667385029 2714805835 3487838445
|
||||
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 966721255 1547169349 3198573835 302049294
|
||||
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 2440004820 1576818970 1317923157
|
||||
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 2440004820 1576818970 3186679687
|
||||
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4028893260 4220759192 2236533218 3731336532
|
||||
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2956871200 1591352238 1756650151 1262787222
|
||||
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 365467186 892422645 1334708242 1372556938
|
||||
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 3347784734 150035460 2897171548 3701081496
|
||||
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 945660191 3750377696 2496492611 3515056508
|
||||
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2806300501 2591577756 3148637036 3845512743
|
||||
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2322444122 3525997046 281106520 3456307300
|
||||
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 327345109 1137297282 1938163814 2551101563
|
||||
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 797067973 481331945 350851834 2477733239
|
||||
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1316460560 2044204046 1034822169 3340281844
|
||||
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1316460560 4174274001 1597212204 1881272946
|
||||
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1316460560 1535088984 3001492060 2308505016
|
||||
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3190527989 3733991924 4211138051 3710311115
|
||||
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3190527989 3430768821 1043108884 4185640072
|
||||
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 943531303 1948306075 3877008798 2803592376
|
||||
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3262141476 4125717435 2946529611 2221512094
|
||||
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1599291337 3982786366 1581171257 1188352423
|
||||
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2237070215 3046262465 1926804094 1435916873
|
||||
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 721666814 2012769306 1712378956 1388990183
|
||||
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1596349869 3775131163 355203300 1126174452
|
||||
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1380587417 1208642645 2886387159 3113955983
|
||||
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1332573203 1417735573 1422796372 3309229181
|
||||
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2714027800 2106992819 1196036582 2095126659
|
||||
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1105097447 1992731268 2198911423 3378137735
|
||||
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1105097447 1992731268 2198911423 3868431311
|
||||
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2552471160 2218470296 2332616929 923645661
|
||||
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2231354584 4035702005 3839068434 8981294
|
||||
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 4019719318 3985307916 3604065639 277096636
|
||||
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 bf16nhwc_bf16nhwc_fnhwc_f_f 258381429 3482776077 2663631601 593179089
|
||||
conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1613915345 188810648 1623218578 2585892217
|
||||
conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1613915345 691990354 3253144559 2988350639
|
||||
conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2788041828 1670375523 2425320272 2553108650
|
||||
conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1049321188 1865889553 3610888033 1459693945
|
||||
conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3820648800 3236781482 1382111427 1986396315
|
||||
conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 463742721 2524037630 3070473696 210045128
|
||||
conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 738614177 4071452982 3401957738 2920893800
|
||||
conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2479111539 2662555669 781892324 2338234282
|
||||
conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2089076160 260434096 1539389419 1219120658
|
||||
conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 14838294 3344412669 2885305868 1926445693
|
||||
conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 14838294 3344412669 2885305868 1478058549
|
||||
conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3945616248 4118489020 2885143346 1545684873
|
||||
conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 295760528 1685244361 3337423971 772814550
|
||||
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 623727338 942771643 2634710231 3063349371
|
||||
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1613915345 188810648 2709881923 3532383400
|
||||
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2788041828 3762161398 3733128758 3693097785
|
||||
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 139944998 3812563855 253288229 1359907535
|
||||
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 492562992 3677108443 525487530 445191233
|
||||
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 594197095 3773864559 91136873 4170763393
|
||||
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3820648800 1025574686 1127709182 677727764
|
||||
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1901075489 3296829308 2591894666 2932517926
|
||||
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1613915345 4223561525 1263618595 50680160
|
||||
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1613915345 4223561525 1756414462 3209752057
|
||||
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2788041828 1023542180 121940906 624551470
|
||||
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1049321188 296097075 1423016429 1058165639
|
||||
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3820648800 4160685370 2761559427 1788182893
|
||||
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1859384988 222880684 1650970502 1632078530
|
||||
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1704522433 2403392926 3985958544 1432584676
|
||||
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 463742721 3455033786 385631111 1683348880
|
||||
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 738614177 3199562330 1513955316 2131256035
|
||||
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2479111539 2702777753 2608107448 4014212857
|
||||
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2089076160 4042009058 106232038 1140762595
|
||||
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 14838294 2260768172 1186911503 3194129408
|
||||
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 14838294 2260768172 1186911503 1312312812
|
||||
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3945616248 2287161276 36034283 4262860382
|
||||
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2906914535 476297538 14375779 1340176713
|
||||
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 4292101959 3378414564 4259930640 1392755176
|
||||
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 bf16nhwc_bf16nhwc_fnhwc_f_f 3529371817 368260304 4137156526 122558013
|
||||
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 2948718568 2631391783 3260825675 4278587299
|
||||
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 1635109696 2835574424 4179385325 2803281440
|
||||
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 3344954627 1649157278 2032056735 1176638626
|
||||
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 61750237 3452849177 1697665310 3475459781
|
||||
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 1394759191 1571308277 898534533 4125341936
|
||||
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 3402206912 2433594404 1575577431 4106154211
|
||||
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 98638790 2735493952 346473870 1911666301
|
||||
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 98638790 2735493952 346473870 2124440208
|
||||
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 2934485636 3286257323 541566528 1113783492
|
||||
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 164942943 4259285988 1250700182 508419908
|
||||
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 3805460372 2607401558 3465030781 210641751
|
||||
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 4200926784 1001915027 387475271 3360115596
|
||||
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 331078659 469730619 2547196469 1620698703
|
||||
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 431968022 1614654085 903827412 1349891842
|
||||
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 3674369485 1055554271 3217013807 1356703347
|
||||
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 2609462247 3227824772 365527403 2720889763
|
||||
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 2609462247 2150996976 2899308770 2371758816
|
||||
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 2609462247 2124373651 2711906981 3194739760
|
||||
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 1070162100 2750964634 3090791018 3481982191
|
||||
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 1070162100 1563941622 767747438 3163252390
|
||||
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 884815233 3576251756 3216742798 3534462723
|
||||
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 3230717758 3192193994 1161445944 371179683
|
||||
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 2450454245 2905280248 910194866 839083662
|
||||
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 2948718568 2631391783 638794727 4292051282
|
||||
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 1635109696 2835574424 1855687620 130932480
|
||||
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 3344954627 1649157278 4191418350 958044197
|
||||
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 61750237 3452849177 3260472389 771128506
|
||||
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 1394759191 1571308277 4279538191 956191103
|
||||
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 3402206912 2433594404 2021112123 2983097553
|
||||
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 98638790 2735493952 3178839372 568554158
|
||||
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 98638790 2735493952 3178839372 18194802
|
||||
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 2934485636 3286257323 2559221535 2310182528
|
||||
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 164942943 4259285988 984016853 888753301
|
||||
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 2823094147 1681845497 4242738907 3244428635
|
||||
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 s8nhwc_s8nhwc_inhwc_i_i 4060010502 2881035321 3927119619 3311661122
tools/library/scripts/pycutlass/test/unit/test_sm80.py (new file, 440 lines)
@ -0,0 +1,440 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
## Test case generator for SM80
|
||||
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
#
|
||||
# Create GEMM operation
|
||||
#
|
||||
|
||||
def TestGemmOperator(gemm_kind, math_inst, layout, alignment, tiling, arch, mixed=False,
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.IdentitySwizzle1, **kwargs):
|
||||
"""
|
||||
Test GEMM Operation based on configuration
|
||||
"""
|
||||
|
||||
if "data_type" in kwargs.keys():
|
||||
data_type = kwargs["data_type"]
|
||||
else:
|
||||
if mixed or math_inst.element_a == cutlass.bfloat16:
|
||||
data_type = [
|
||||
math_inst.element_a,
|
||||
math_inst.element_b,
|
||||
math_inst.element_accumulator,
|
||||
math_inst.element_accumulator
|
||||
]
|
||||
else:
|
||||
data_type = [
|
||||
math_inst.element_a,
|
||||
math_inst.element_b,
|
||||
math_inst.element_a,
|
||||
math_inst.element_accumulator
|
||||
]
|
||||
|
||||
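    # tiling = (threadblock_shape, stage_count, warp_count)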
    tile_description = TileDescription(
        tiling[0], tiling[1], tiling[2],
        math_inst, arch, arch
    )

    A = TensorDescription(
        data_type[0], layout[0], alignment[0]
    )

    B = TensorDescription(
        data_type[1], layout[1], alignment[1]
    )

    C = TensorDescription(
        data_type[2], layout[2], alignment[2]
    )

    element_epilogue = data_type[3]

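    # Universal GEMMs run through test_all_gemm (with a dedicated mode for interleaved
    # layouts); grouped GEMMs run through the grouped testbed with 24 problem sizes.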
    if gemm_kind == GemmKind.Universal:
        operation = GemmOperationUniversal(
            arch=arch, tile_description=tile_description,
            A=A, B=B, C=C, element_epilogue=element_epilogue,
            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
        )
        if A.layout in [cutlass.ColumnMajorInterleaved32, cutlass.RowMajorInterleaved32]:
            return test_all_gemm(operation, "interleaved")
        else:
            return test_all_gemm(operation, "universal")

    elif gemm_kind == GemmKind.Grouped:
        operation = GemmOperationGrouped(
            arch, tile_description, A, B, C,
            element_epilogue, epilogue_functor, swizzling_functor,
            precompute_mode=kwargs["precompute_mode"]
        )
        testbed = TestbedGrouped(operation=operation)
        return testbed.run(24)
    else:
        raise NotImplementedError("The requested GEMM kind is not implemented")


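# Illustrative usage only (the MathInstruction bound to `math_inst` is hypothetical):
# a single universal GEMM configuration can be exercised directly, mirroring the calls
# made in Test_SM80 below, e.g.
#   TestGemmOperator(GemmKind.Universal, math_inst,
#                    (cutlass.RowMajor, cutlass.RowMajor, cutlass.RowMajor),
#                    (8, 8, 8), ([256, 128, 32], 3, [4, 2, 1]), 80)
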
def TestConv2dOperator(math_inst, alignment, tiling, arch,
                       stride_supports=[StrideSupport.Strided, StrideSupport.Strided, StrideSupport.Strided],
                       epilogue_functor=EpilogueFunctor.LinearCombination,
                       swizzling_functor=cutlass.IdentitySwizzle1, interleaved=False, **kwargs):
    """
    Test Conv2d operations built from the given configuration
    """

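    # Each position below corresponds to one conv kind: fprop, dgrad, wgrad.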
    mixeds = [False, True, False]
    conv_kinds = [cutlass.conv.Operator.fprop, cutlass.conv.Operator.dgrad, cutlass.conv.Operator.wgrad]

    results = []

    default_swizzling_functor = swizzling_functor

    if "layout" in kwargs.keys():
        layout = kwargs["layout"]
    else:
        layout = (cutlass.TensorNHWC, cutlass.TensorNHWC, cutlass.TensorNHWC)

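    # Run one fprop, one dgrad, and one wgrad test, pairing each conv kind with its
    # mixed-precision flag and its stride support mode.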
    for mixed, conv_kind, stride_support in zip(mixeds, conv_kinds, stride_supports):

        if "data_type" in kwargs.keys():
            data_type = kwargs["data_type"]
        else:
            if mixed or math_inst.element_a == cutlass.bfloat16:
                data_type = [
                    math_inst.element_a,
                    math_inst.element_b,
                    math_inst.element_accumulator,
                    math_inst.element_accumulator
                ]
            else:
                data_type = [
                    math_inst.element_a,
                    math_inst.element_b,
                    math_inst.element_a,
                    math_inst.element_accumulator
                ]
        # Skip int8 conv backward passes (dgrad and wgrad)
        if data_type[0] == cutlass.int8 and conv_kind in [cutlass.conv.Operator.dgrad, cutlass.conv.Operator.wgrad]:
            continue

        A = TensorDescription(
            element=data_type[0],
            layout=layout[0],
            alignment=alignment[0])
        B = TensorDescription(
            element=data_type[1],
            layout=layout[1],
            alignment=alignment[1])
        C = TensorDescription(
            element=data_type[2],
            layout=layout[2],
            alignment=alignment[2])

        tile_description = TileDescription(
            threadblock_shape=tiling[0], stages=tiling[1],
            warp_count=tiling[2],
            math_instruction=math_inst,
            min_compute=arch, max_compute=arch
        )

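        # Strided dgrad needs its dedicated swizzling functor; every other case uses the
        # functor supplied by the caller.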
        if conv_kind == cutlass.conv.Operator.dgrad and stride_support == StrideSupport.Strided:
            swizzling_functor = cutlass.StridedDgradIdentitySwizzle1
        else:
            swizzling_functor = default_swizzling_functor

        operation = Conv2dOperation(
            conv_kind=conv_kind, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=arch, tile_description=tile_description, A=A, B=B, C=C,
            element_epilogue=data_type[3], stride_support=stride_support,
            epilogue_functor=epilogue_functor,
            swizzling_functor=swizzling_functor
        )

        results.append(test_all_conv2d(operation, interleaved=interleaved))

    return results


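# Each test case below builds one or more SM80 kernel configurations and runs them through
# the pycutlass.test correctness testbeds (test_all_gemm, test_all_conv2d, TestbedGrouped).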
class Test_SM80(unittest.TestCase):
    def test_SM80_TensorOp_16816(self):
        math_instructions = [
            MathInstruction(
                [16, 8, 16], cutlass.float16, cutlass.float16, cutlass.float32,
                cutlass.OpClass.TensorOp, MathOperation.multiply_add
            ),
            MathInstruction(
                [16, 8, 16], cutlass.float16, cutlass.float16, cutlass.float16,
                cutlass.OpClass.TensorOp, MathOperation.multiply_add
            ),
            MathInstruction(
                [16, 8, 16], cutlass.bfloat16, cutlass.bfloat16, cutlass.float32,
                cutlass.OpClass.TensorOp, MathOperation.multiply_add
            )
        ]

        layouts = [
            (cutlass.RowMajor, cutlass.RowMajor, cutlass.RowMajor),
            (cutlass.ColumnMajor, cutlass.RowMajor, cutlass.RowMajor),
            (cutlass.RowMajor, cutlass.ColumnMajor, cutlass.RowMajor)
        ]

        alignments = [
            (8, 8, 8), (4, 8, 8), (8, 4, 8)
        ]

        tilings = [
            ([256, 128, 32], 3, [4, 2, 1]),
            ([64, 256, 32], 4, [1, 4, 1]),
            ([128, 64, 64], 3, [2, 2, 1])
        ]

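        # zip pairs each math instruction with one layout, alignment, and tiling; every
        # triple is exercised as a universal GEMM, a grouped GEMM, and a set of Conv2d runs.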
        for math_inst, layout, alignment, tiling in zip(math_instructions, layouts, alignments, tilings):
            self.assertTrue(TestGemmOperator(GemmKind.Universal, math_inst, layout, alignment, tiling, 80, False))
            self.assertTrue(TestGemmOperator(GemmKind.Grouped, math_inst, layout, alignment, tiling, 80, True, precompute_mode=SchedulerMode.Host))
            stride_supports = [StrideSupport.Strided, StrideSupport.Strided, StrideSupport.Strided]
            results = TestConv2dOperator(math_inst, alignment, tiling, 80, stride_supports=stride_supports)
            for res in results:
                self.assertTrue(res)

    def test_SM80_TensorOp_1688(self):
        # tf32 is not supported by most Python environments; skip the test.
        self.assertTrue(True)

    def test_SM80_TensorOp_1688_fast_math(self):
        math_instructions = [
            MathInstruction(
                [16, 8, 8], cutlass.tfloat32, cutlass.tfloat32, cutlass.float32,
                cutlass.OpClass.TensorOp, MathOperation.multiply_add
            ),
            MathInstruction(
                [16, 8, 8], cutlass.float16, cutlass.float16, cutlass.float32,
                cutlass.OpClass.TensorOp, MathOperation.multiply_add_fast_f16
            ),
            MathInstruction(
                [16, 8, 8], cutlass.bfloat16, cutlass.bfloat16, cutlass.float32,
                cutlass.OpClass.TensorOp, MathOperation.multiply_add_fast_bf16
            ),
            MathInstruction(
                [16, 8, 8], cutlass.float32, cutlass.float32, cutlass.float32,
                cutlass.OpClass.TensorOp, MathOperation.multiply_add_fast_f32
            )
        ]

        layouts = [
            (cutlass.RowMajor, cutlass.RowMajor, cutlass.ColumnMajor),
            (cutlass.RowMajor, cutlass.ColumnMajor, cutlass.ColumnMajor),
            (cutlass.ColumnMajor, cutlass.RowMajor, cutlass.ColumnMajor),
            (cutlass.ColumnMajor, cutlass.ColumnMajor, cutlass.RowMajor)
        ]
        alignments = [
            (4, 4, 4), (4, 2, 4), (2, 4, 4), (2, 2, 4)
        ]
        tilings = [
            ([128, 256, 16], 3, [4, 2, 1]),
            ([64, 256, 16], 4, [1, 4, 1]),
            ([128, 64, 32], 3, [2, 2, 1]),
            ([256, 64, 32], 3, [4, 2, 1])
        ]
        data_type = [
            cutlass.float32, cutlass.float32, cutlass.float32, cutlass.float32
        ]
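        # Every fast-math variant accumulates in fp32 and is driven with fp32 inputs and
        # outputs via the explicit data_type list above.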
        for math_inst, layout, alignment, tiling in zip(math_instructions, layouts, alignments, tilings):
            self.assertTrue(
                TestGemmOperator(
                    GemmKind.Universal, math_inst, layout,
                    alignment, tiling, 80, False, data_type=data_type))
            self.assertTrue(
                TestGemmOperator(
                    GemmKind.Grouped, math_inst, layout, alignment, tiling, 80,
                    True, precompute_mode=SchedulerMode.Device, data_type=data_type))
            stride_supports = [StrideSupport.Unity, StrideSupport.Strided, StrideSupport.Unity]
            results = TestConv2dOperator(math_inst, alignment, tiling, 80, stride_supports=stride_supports, data_type=data_type)
            for res in results:
                self.assertTrue(res)

    def test_SM80_TensorOp_884(self):
        math_inst = MathInstruction(
            [8, 8, 4], cutlass.float64, cutlass.float64, cutlass.float64,
            cutlass.OpClass.TensorOp, MathOperation.multiply_add
        )
        layout = (cutlass.ColumnMajor, cutlass.ColumnMajor, cutlass.ColumnMajor)
        alignment = (1, 1, 1)

        tiling = ([64, 256, 16], 3, [2, 4, 1])
        data_type = [cutlass.float64, cutlass.float64, cutlass.float64, cutlass.float64]
        self.assertTrue(TestGemmOperator(GemmKind.Universal, math_inst, layout, alignment, tiling, 80, False, data_type=data_type))
        self.assertTrue(TestGemmOperator(GemmKind.Grouped, math_inst, layout, alignment, tiling, 80, True, precompute_mode=SchedulerMode.Device, data_type=data_type))
        stride_supports = [StrideSupport.Unity, StrideSupport.Strided, StrideSupport.Unity]
        results = TestConv2dOperator(math_inst, alignment, tiling, 80, stride_supports=stride_supports, data_type=data_type)
        for res in results:
            self.assertTrue(res)

    def test_SM80_TensorOp_16832_TN(self):
        math_inst = MathInstruction(
            [16, 8, 32], cutlass.int8, cutlass.int8, cutlass.int32,
            cutlass.OpClass.TensorOp, MathOperation.multiply_add_saturate
        )
        layout = (cutlass.RowMajor, cutlass.ColumnMajor, cutlass.ColumnMajor)
        alignment = (16, 16, 4)
        alignment_mixed = (16, 16, 16)
        tiling = ([128, 256, 64], 3, [2, 4, 1])

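        # The grouped GEMM run stores int8 outputs (alignment 16) with an fp32 epilogue;
        # the universal GEMM and the conv tests keep int32 outputs.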
        data_type = [cutlass.int8, cutlass.int8, cutlass.int32, cutlass.int32]
        data_type_mixed = [cutlass.int8, cutlass.int8, cutlass.int8, cutlass.float32]

        self.assertTrue(TestGemmOperator(GemmKind.Universal, math_inst, layout, alignment, tiling, 80, False, data_type=data_type))
        self.assertTrue(TestGemmOperator(GemmKind.Grouped, math_inst, layout, alignment_mixed, tiling, 80, True, precompute_mode=SchedulerMode.Device, data_type=data_type_mixed))
        stride_supports = [StrideSupport.Strided, StrideSupport.Strided, StrideSupport.Strided]
        results = TestConv2dOperator(math_inst, alignment, tiling, 80, stride_supports=stride_supports, data_type=data_type)
        for res in results:
            self.assertTrue(res)

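    # The SIMT tests below use a 1x1x1 math instruction shape and alignment-1 operands.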
    def test_SM80_Simt_f32(self):
        math_inst = MathInstruction(
            [1, 1, 1], cutlass.float32, cutlass.float32, cutlass.float32,
            cutlass.OpClass.Simt, MathOperation.multiply_add
        )
        layout = (cutlass.RowMajor, cutlass.RowMajor, cutlass.RowMajor)
        alignment = (1, 1, 1)

        tiling = ([128, 256, 8], 4, [2, 4, 1])
        data_type = [cutlass.float32, cutlass.float32, cutlass.float32, cutlass.float32]
        self.assertTrue(TestGemmOperator(GemmKind.Universal, math_inst, layout, alignment, tiling, 80, False, data_type=data_type))
        self.assertTrue(TestGemmOperator(GemmKind.Grouped, math_inst, layout, alignment, tiling, 80, True, precompute_mode=SchedulerMode.Host, data_type=data_type))
        stride_supports = [StrideSupport.Strided, StrideSupport.Strided, StrideSupport.Strided]
        results = TestConv2dOperator(math_inst, alignment, tiling, 80, stride_supports=stride_supports, data_type=data_type)
        for res in results:
            self.assertTrue(res)

    def test_SM80_Simt_f64(self):
        math_inst = MathInstruction(
            [1, 1, 1], cutlass.float64, cutlass.float64, cutlass.float64,
            cutlass.OpClass.Simt, MathOperation.multiply_add
        )
        layout = (cutlass.RowMajor, cutlass.RowMajor, cutlass.ColumnMajor)
        alignment = (1, 1, 1)

        tiling = ([64, 128, 8], 5, [2, 2, 1])
        data_type = [cutlass.float64, cutlass.float64, cutlass.float64, cutlass.float64]
        self.assertTrue(TestGemmOperator(GemmKind.Universal, math_inst, layout, alignment, tiling, 80, False, data_type=data_type))
        self.assertTrue(TestGemmOperator(GemmKind.Grouped, math_inst, layout, alignment, tiling, 80, True, precompute_mode=SchedulerMode.Device, data_type=data_type))
        stride_supports = [StrideSupport.Strided, StrideSupport.Strided, StrideSupport.Strided]
        results = TestConv2dOperator(math_inst, alignment, tiling, 80, stride_supports=stride_supports, data_type=data_type)
        for res in results:
            self.assertTrue(res)

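    # The interleaved int8 test switches the conv path to NC32HW32/C32RSK32 tensor layouts
    # and the GEMM path to the FastLinearCombinationClamp epilogue.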
    def test_SM80_TensorOp_16832_Interleaved(self):
        math_inst = MathInstruction(
            [16, 8, 32], cutlass.int8, cutlass.int8, cutlass.int32,
            cutlass.OpClass.TensorOp, MathOperation.multiply_add_saturate
        )

        layout = (cutlass.ColumnMajorInterleaved32, cutlass.RowMajorInterleaved32, cutlass.ColumnMajorInterleaved32)
        alignment_mixed = (16, 16, 8)
        tiling = ([256, 64, 64], 4, [4, 1, 1])
        data_type_mixed = [cutlass.int8, cutlass.int8, cutlass.int8, cutlass.float32]

        self.assertTrue(TestGemmOperator(GemmKind.Universal, math_inst, layout, alignment_mixed, tiling, 80, False, data_type=data_type_mixed, epilogue_functor=EpilogueFunctor.FastLinearCombinationClamp))
        stride_supports = [StrideSupport.Strided, StrideSupport.Strided, StrideSupport.Strided]
        layout = [cutlass.TensorNC32HW32, cutlass.TensorC32RSK32, cutlass.TensorNC32HW32]
        results = TestConv2dOperator(math_inst, alignment_mixed, tiling, 80, stride_supports=stride_supports, data_type=data_type_mixed, layout=layout, interleaved=True)
        for res in results:
            self.assertTrue(res)

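    # The remaining entries are placeholders for configurations that are not exercised here.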
    def SM80_SparseTensorOp_16832(self):
        pass
    def test_SM80_PlanarComplexTensorOp_16816(self):
        pass
    def test_SM80_SparseTensorOp_16816_fast_math(self):
        pass
    def test_SM80_TensorOp_1688_complex(self):
        pass
    def test_SM80_TensorOp_1688_fast_fp32_math_complex(self):
        pass
    def test_SM80_TensorOp_1688_rank_k(self):
        pass
    def test_SM80_TensorOp_1688_rank_k_complex(self):
        pass
    def test_SM80_TensorOp_1688_trmm(self):
        pass
    def test_SM80_TensorOp_1688_trmm_complex(self):
        pass
    def test_SM80_TensorOp_1688_symm(self):
        pass
    def test_SM80_TensorOp_1688_symm_complex(self):
        pass
    def test_SM80_TensorOp_884_complex(self):
        pass
    def test_SM80_TensorOp_884_complex_gaussian(self):
        pass
    def test_SM80_TensorOp_884_rank_k(self):
        pass
    def test_SM80_TensorOp_884_rank_k_complex(self):
        pass
    def test_SM80_TensorOp_884_rank_k_complex_gaussian(self):
        pass
    def test_SM80_TensorOp_884_trmm(self):
        pass
    def test_SM80_TensorOp_884_trmm_complex(self):
        pass
    def test_SM80_TensorOp_884_trmm_complex_gaussian(self):
        pass
    def test_SM80_TensorOp_884_symm(self):
        pass
    def test_SM80_TensorOp_884_symm_complex(self):
        pass
    def test_SM80_TensorOp_884_symm_complex_gaussian(self):
        pass
    def test_SM80_SparseTensorOp_16864_TN(self):
        pass
    def test_SM80_TensorOp_16864_TN(self):
        pass
    def test_SM80_SparseTensorOp_168128_TN(self):
        pass
    def test_SM80_TensorOp_16864_Interleaved(self):
        pass
    def test_SM80_TensorOp_168256(self):
        pass
    def test_SM80_Simt_complex(self):
        pass


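# Presumably the two arguments set the initial and maximum device memory pool sizes in
# bytes; nvcc is selected as the compiler backend before the suite runs.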
if __name__ == '__main__':
    pycutlass.get_memory_pool(2**20, 2**34)
    pycutlass.compiler.nvcc()
    unittest.main()