@ -0,0 +1,274 @@
|
||||
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1767700736 2104699940 3506659864 557648934
|
||||
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1539314507 3971227455 1976927351 1642148785
|
||||
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 276489656 653235219 3147305346 880610205
|
||||
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 272457724 2178229139 2786201726 4170295839
|
||||
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 242235041 2149454506 784935854 682531065
|
||||
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 3478189705 1667216236 1437761176
|
||||
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 379326961 1780379994 3740415776
|
||||
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 924848818 3533854396 2683779476
|
||||
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2870331951 359232443 2147867990 1653277018
|
||||
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2870331951 3784314846 2644315999 4224154526
|
||||
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3787448414 3562991793 535073859 2563373454
|
||||
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 426169840 2464808416 864648234 461884698
|
||||
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2564934525 3910792915 3577331017 827498183
|
||||
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 28479234 867695528 1947311971 83328334
|
||||
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4192922822 4244595864 2296602326 2349214706
|
||||
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 274678245 3464152269 1682550229 3446204619
|
||||
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3993280136 828543035 1319748516 956044554
|
||||
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 832003025 3799813757 4030292245 457791957
|
||||
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1444316594 4129865888 93616503 412257611
|
||||
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2931873718 1841508064 1497852219 36703874
|
||||
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2931873718 1841508064 1497852219 1842147148
|
||||
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1612565294 109894479 1782187316 3370789453
|
||||
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 841569299 1010785577 1158956167 3261208135
|
||||
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1893352157 48149942 3544807462 446577726
|
||||
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 3585320147 2150950452 1625817025 3964129474
|
||||
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 289918791 2624928614 3423533117 3186342135
|
||||
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 2732296888 1838622641 4203745561
|
||||
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 3456572634 893492926 1966259884
|
||||
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 671982235 4014726279 4027869577 1510990157
|
||||
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 798317794 4140605332 3580988556 3425909428
|
||||
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1721270411 2106553169 835800311 3417471222
|
||||
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 860217059 166776702 1109666471
|
||||
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2128738105 855244826 2670006594 3857976152
|
||||
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1931093565 3079461262 3579256638 2926210806
|
||||
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2472246681 2952423142 2045838875 3445165841
|
||||
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2956871200 2133381336 2601441527 2035094220
|
||||
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 365467186 1700915522 2515933441 406719240
|
||||
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 3347784734 156533442 1012781676 688128904
|
||||
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 927718585 3117803557 1370701307 1462167731
|
||||
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 973422497 1926250028 3440543762
|
||||
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 2892862516 3649300762 1521470286
|
||||
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2075083065 3181416651 1733426984 872275640
|
||||
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4005590448 1639170045 388151578 4186957447
|
||||
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 181075276 1433744686 860506550 3475157408
|
||||
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1747719409 877465841 2345541783
|
||||
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 856324887 2307248012 337386755 3363072703
|
||||
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1906605830 722034901 2562804622 2508759317
|
||||
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 805717279 2196645331 3235235362 1518334120
|
||||
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3168796339 72559978 778918419 1260968000
|
||||
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 261954979 2634885882 451986822 3792829599
|
||||
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3747142491 2426759809 2622222681 371723930
|
||||
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2056905385 3612826298 2531545294 476754549
|
||||
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 2391975923 197605094 3409942185
|
||||
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 3071904063 408984565 2378809888
|
||||
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3414629540 3067676760 1540919649 2008865071
|
||||
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4100326666 1085505037 2778215386 230227569
|
||||
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3662895757 2731079464 3570839563 3483629877
|
||||
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 408419601 3415600242 2106927195
|
||||
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2154102133 3606099389 4034802752 3200055633
|
||||
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2609259399 3910244699 1319285699 2229775542
|
||||
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2948772873 2780071616 2703730845 3090625734
|
||||
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 752289976 4278696824 360883914 3802692600
|
||||
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3723912751 653419877 359675571 283806385
|
||||
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 2027599472 1075980921 3101013494 2025203940
|
||||
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 991402150 1393431534 1148212814 1350914659
|
||||
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 4283492776 419570292 1210341563
|
||||
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4178596783 3828059710 2735749436 2671012171
|
||||
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 924522595 563724475 3750778972 4152580670
|
||||
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1021044158 1686067905 3765040166 4102272733
|
||||
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 2674994719 635224486 2759329777
|
||||
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 4201252830 2920298728 304256151
|
||||
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 70289262 646435722 4137562540
|
||||
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1317457392 1288095320 2132879813 656196754
|
||||
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1317457392 2202157489 2326567490 2475188414
|
||||
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2476454437 1857118302 4164386062 239840568
|
||||
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2767650699 3514840131 590439733 3879821123
|
||||
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3896287283 3112762669 2515107934 2106635937
|
||||
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1903067870 1021832870 3003938078 2751931686
|
||||
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3489785028 2466126497 1374078692 2737628040
|
||||
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2051350923 263676708 3639860119 1370886256
|
||||
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 719099834 1474713672 204857540 2768940347
|
||||
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3441724486 3162593831 421721594 3097845598
|
||||
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2034354027 1249407570 2567025479 1441082595
|
||||
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 941893937 3608468045 635631428 2369653089
|
||||
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 941893937 3608468045 635631428 1218705038
|
||||
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 172579142 319546523 718795680 1453661415
|
||||
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2823351660 1326352711 1110204809 1155441703
|
||||
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3238446487 2572503545 686287700 1559476701
|
||||
conv2d fprop_1x8x8x1_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 991402150 1883874274 1180207512 3934800419
|
||||
conv2d fprop_1x16x16x1_8x8_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 24290453 4230587034 4117433929 2540623821
|
||||
conv2d fprop_1x16x16x1_12x12_16x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 24290453 3802993432 1563447158 515257167
|
||||
conv2d fprop_1x224x224x1_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 2583340103 3928463259 1564251818
|
||||
conv2d fprop_1x224x224x1_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 2966178620 3457283045 1726663817
|
||||
conv2d fprop_1x224x224x1_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 1794561978 3101289788 3492498648
|
||||
conv2d fprop_1x224x224x1_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 1794561978 498358130 4111289929
|
||||
conv2d fprop_1x8x8x2_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2693144988 3876248534 3038023830 1910263513
|
||||
conv2d fprop_1x16x16x2_8x8_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 3355193355 319259163 535683577
|
||||
conv2d fprop_1x16x16x2_12x12_16x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 1548147432 3385829172 2741952709
|
||||
conv2d fprop_1x224x224x2_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 2686562907 3948710179 3669872932
|
||||
conv2d fprop_1x224x224x2_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 576815792 2317227037 1211532666
|
||||
conv2d fprop_1x224x224x2_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 27596985 555460201 895685163
|
||||
conv2d fprop_1x224x224x2_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 27596985 1465341652 2228916523
|
||||
conv2d fprop_1x8x8x4_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 24290453 137535877 1436667267 1395660627
|
||||
conv2d fprop_1x224x224x4_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 2226159049 4051661898 209529384
|
||||
conv2d fprop_1x224x224x4_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 3541851870 2271016226 2671623385
|
||||
conv2d fprop_1x224x224x4_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 982184919 2007343215 3362992769
|
||||
conv2d fprop_1x224x224x4_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 982184919 20610297 1086800078
|
||||
conv2d fprop_1x8x8x8_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 3117444553 1497663382 3561001103
|
||||
conv2d fprop_1x224x224x8_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 1414143072 827338392 2827855918
|
||||
conv2d fprop_1x224x224x8_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 3886996022 26545788 3407771964
|
||||
conv2d fprop_1x224x224x8_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 380272816 2374613655 3601677176
|
||||
conv2d fprop_1x224x224x8_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 380272816 778374730 2110111988
|
||||
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1736512560 49406874 846358010 3314905564
|
||||
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1848484956 1432417472 1903569827 3750799351
|
||||
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4236427320 3696009469 69852620 201921851
|
||||
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 109006944 450017448 1793784844 903209915
|
||||
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 813367872 2397796503 1928191746 3210229460
|
||||
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 1307184141 46021356 1674017987
|
||||
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 1212511562 3331767121 2446286369
|
||||
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 2013675943 1681111033 1469213228
|
||||
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1703349794 500298386 3218034344 4159283207
|
||||
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1703349794 1123534155 145385311 4273847179
|
||||
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3862659311 349459322 1503631520 1404971956
|
||||
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1623686755 961217371 552550209 3980749384
|
||||
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3554927580 1131648083 4149599295 3119557776
|
||||
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1767639287 3350675774 128324027 1059816532
|
||||
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3986143536 17411088 40173029 1694092310
|
||||
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1157793540 3513299281 48848814 1435528367
|
||||
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 988962069 4292634763 388976034 2674929544
|
||||
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4202383208 3529769234 1046186503 3368902675
|
||||
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 856448884 3057259762 2063087558 1995545427
|
||||
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2281940872 144496548 2455451862 400986166
|
||||
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2281940872 144496548 2455451862 1082696406
|
||||
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2702905851 1992889713 731289041 608504198
|
||||
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2742293143 4197915274 606840 3671124731
|
||||
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 149434841 2288560511 2994968424 2881838300
|
||||
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 2226824643 327135318 3718671210 2121176659
|
||||
conv2d fprop_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3254575292 1119957081 672831271
|
||||
conv2d fprop_1x4x4x14_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3115523958 3622905002 4020453928 3853387318
|
||||
conv2d fprop_1x23x56x98_10x22_128x3x3_pad_h4w5_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1702870033 1876930844 1190400523 3937287850
|
||||
conv2d fprop_1x4x4x28_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 2587856937 2021107274 2789519899
|
||||
conv2d fprop_1x23x56x100_10x22_128x3x3_pad_h4w5_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2368669977 1353376771 744357395 786349633
|
||||
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 991402150 1393431534 2496492611 3901723984
|
||||
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4208297221 4283492776 3148637036 258220505
|
||||
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4178596783 3828059710 281106520 1103939403
|
||||
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 924522595 563724475 1938163814 2197809394
|
||||
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1021044158 1686067905 350851834 3999808950
|
||||
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3335547 2674994719 1034822169 1611033520
|
||||
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3335547 4201252830 1597212204 2181492560
|
||||
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3335547 70289262 3001492060 1379239000
|
||||
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1317457392 1288095320 4211138051 2804617605
|
||||
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1317457392 2202157489 1043108884 2923122465
|
||||
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2476454437 1857118302 3877008798 1206012078
|
||||
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2767650699 3514840131 2946529611 3907056932
|
||||
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3896287283 3112762669 1581171257 3959460786
|
||||
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1903067870 1021832870 1926804094 1756790353
|
||||
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3489785028 2466126497 1712378956 434322965
|
||||
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2051350923 263676708 355203300 821870356
|
||||
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 719099834 1474713672 2886387159 4086314983
|
||||
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3441724486 3162593831 1422796372 2049419539
|
||||
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2034354027 1249407570 1196036582 2684312264
|
||||
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 941893937 3608468045 2198911423 1060050551
|
||||
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 941893937 3608468045 2198911423 3361618746
|
||||
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 172579142 319546523 2332616929 543467298
|
||||
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2823351660 1326352711 3839068434 65031397
|
||||
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3238446487 2572503545 3604065639 2111204111
|
||||
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2149247508 1775375365 2663631601 1249487679
|
||||
conv2d fprop_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 403997062 1679063623 4062928786
|
||||
conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 3464637181 1623218578 436154205
|
||||
conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1479940693 3253144559 3883419107
|
||||
conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 1871463331 2425320272 74566211
|
||||
conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3484040069 664160900 3610888033 22347127
|
||||
conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1924855848 1382111427 2541177413
|
||||
conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 868180534 1764715518 3070473696 2392864704
|
||||
conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3437976747 666906244 3401957738 2050602745
|
||||
conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4195072693 1575210381 781892324 2848949054
|
||||
conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3457330201 2316839359 1539389419 4293781748
|
||||
conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 2469024119 2885305868 2693098375
|
||||
conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 2469024119 2885305868 1969608051
|
||||
conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1690216859 554790212 2885143346 780489333
|
||||
conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 835105643 3337423971 3866137775
|
||||
conv2d dgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2956180805 1092015789 3160693693 1526395881
|
||||
conv2d dgrad_1x56x56x12_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 1941683430 2236679600 3168985259
|
||||
conv2d dgrad_1x55x55x12_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 1941683430 3784328837 471971363
|
||||
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 289918791 1266976707 942688231 3457364823
|
||||
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 1027662440 2005082293 2235558527
|
||||
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 3380032042 1370040310 1348846927
|
||||
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 671982235 1423304149 2107662762 1234913781
|
||||
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 798317794 1709026638 2421185623 3308071321
|
||||
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1721270411 2519327328 2541413264 3185574975
|
||||
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 2070174510 1364436192 3531942595
|
||||
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2128738105 2056902987 3079166829 2329433528
|
||||
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3857917762 3227877956 645422556
|
||||
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3857917762 3817218800 985231315
|
||||
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 1398036015 3630062764 2492522537
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2784049299 643733019 3649549642 2637869234
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 2332160299 302086821 3303132343
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1931093565 2458714707 2919710256 2311575036
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2472246681 2260022344 500095455 2760458995
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1530672622 3635363851 2402907878 4131497953
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1500864134 2536338700 2459524764 2504484273
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3344871528 2667385029 2714805835 3487838445
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 966721255 1547169349 3198573835 302049294
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 2440004820 1576818970 1317923157
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 2440004820 1576818970 3186679687
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4028893260 4220759192 2236533218 3731336532
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2956871200 1591352238 1756650151 1262787222
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 365467186 892422645 1334708242 1372556938
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 3347784734 150035460 2897171548 3701081496
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 927718585 4106152802 2634710231 744755886
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 3464637181 2709881923 2407415563
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 3723472741 3733128758 3129111191
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2075083065 2042513140 253288229 404121198
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4005590448 1116254439 525487530 3284739065
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 181075276 1743485155 91136873 2508716910
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 386662952 1127709182 4026285141
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 856324887 3954249564 2591894666 2655687700
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1300426008 1263618595 1313664339
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1300426008 1756414462 2995557277
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 447261065 121940906 1497499264
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3484040069 2966693627 1423016429 341928547
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1759979610 2761559427 68093525
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1906605830 2980501720 1650970502 3258883197
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 805717279 3502822733 3985958544 2568949300
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 868180534 3289288595 385631111 328914986
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3437976747 3391080565 1513955316 1521294163
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4195072693 1669352457 2608107448 4284090805
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3457330201 1126870455 106232038 3054809396
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 1723074453 1186911503 4239438967
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 1723074453 1186911503 2113601884
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1690216859 2413490039 36034283 1112346965
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3168796339 1601750164 14375779 2894970748
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 261954979 1300976652 4259930640 305685205
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3747142491 1747587481 4137156526 1174257270
conv2d wgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2956180805 1086820986 1644914756 2013471312
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2056905385 447674669 724481645 1457430910
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 1227883689 3401425854 3897766524
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 3749787834 3350064812 1136116240
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3414629540 820341033 770836461 2451581199
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4100326666 2581696511 1088458082 1521190911
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3662895757 2885454895 935600441 2615245898
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 3831334389 3506139121 814982501
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2154102133 737968461 1291834254 2665225480
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3573498719 1809195644 1765637461
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3573498719 3379808294 483095299
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 4194153035 2863868771 1639389008
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2624318208 157618421 1779474147 814087242
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 2300180628 423968553 3890279569
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2609259399 1848932917 522753581 1926508271
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2948772873 3663040534 4014266327 1288646188
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3271403719 1585195072 1487505772 3253374264
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1419588777 451194147 3578359696 3659768981
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 763924990 2780826684 2883769406 148530958
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2578426561 3849874822 102765469 1305171059
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 1995451256 2632815435 1516344656
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 1995451256 2632815435 1586331550
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2462511240 2274021368 1188866747 3178890497
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 752289976 1226457131 4187777346 1400559240
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3723912751 1585959358 3731079159 1498901684
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 2027599472 2758666204 3287095476 4291916486
conv2d wgrad_1x8x8x1_8x8_1x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1767700736 4278264698 2331753571 2554564568
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 927718585 3117803557 1370701307 1462167731
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 973422497 1926250028 3440543762
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 2892862516 3649300762 1521470286
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2075083065 3181416651 1733426984 872275640
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4005590448 1639170045 388151578 4186957447
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 181075276 1433744686 860506550 3475157408
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1747719409 877465841 2345541783
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 856324887 2307248012 337386755 3363072703
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1906605830 722034901 2562804622 2508759317
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 805717279 2196645331 3235235362 1518334120
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3168796339 72559978 778918419 1260968000
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 261954979 2634885882 451986822 3792829599
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3747142491 2426759809 2622222681 371723930
# ---------------------------------------------------------------------------
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
# ---------------------------------------------------------------------------
# `import pycutlass` added: the __main__ guard below calls
# pycutlass.get_memory_pool(...), but star imports do not bind the
# package name itself, so the name was previously unbound.
import pycutlass
from pycutlass.conv2d_operation import *
from pycutlass import *
from pycutlass.test import *
import unittest
class Conv2dDgradImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
    """Conv2d dgrad tests: f16 NHWC operands/output, tensor-op f16 accumulation, SM80.

    Each test builds a Conv2dOperation and runs it through test_all_conv2d,
    which compares against the reference implementation.
    """

    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        """Analytic iterator algorithm, alignment 8, default problem sizes."""
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
            min_compute=80, max_compute=80
        )

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            element_epilogue=cutlass.float16, stride_support=StrideSupport.Unity,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        """Optimized iterator algorithm, alignment 8, default problem sizes."""
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
            min_compute=80, max_compute=80
        )

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            element_epilogue=cutlass.float16, stride_support=StrideSupport.Unity,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
        """Analytic iterator algorithm with alignment 4 and a channel count (12)
        that is divisible by 4 but not by 8."""
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=4)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
            min_compute=80, max_compute=80
        )

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            element_epilogue=cutlass.float16, stride_support=StrideSupport.Unity,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        # Single problem size: input (N,H,W,C)=(1,4,4,12), filter (K,R,S,C)=(8,3,3,12),
        # zero padding, stride (3,3), dilation (1,1), cross-correlation.
        problem_sizes = [
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, 12),
                cutlass.Tensor4DCoord(8, 3, 3, 12),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            ),
        ]

        self.assertTrue(test_all_conv2d(operation, problem_sizes))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
        """Optimized iterator algorithm with alignment 4 (same problem as the
        analytic align4 test)."""
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=4)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
            min_compute=80, max_compute=80
        )

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            element_epilogue=cutlass.float16, stride_support=StrideSupport.Unity,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        problem_sizes = [
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, 12),
                cutlass.Tensor4DCoord(8, 3, 3, 12),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            ),
        ]

        self.assertTrue(test_all_conv2d(operation, problem_sizes))
if __name__ == '__main__':
    # Local import: this section's top-level imports are star imports, which
    # do not bind the package name `pycutlass` itself.
    import pycutlass
    # Allocate 2**26-byte (64 MiB) memory pools before running the tests.
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
# ---------------------------------------------------------------------------
# test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
# NOTE(review): the reference path says "fprop" but the class below tests
# dgrad — confirm which .cu file this section was ported from.
# ---------------------------------------------------------------------------
import pycutlass
from pycutlass import *
from pycutlass.test import *
import unittest
class Conv2dDgradImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
    """Conv2d dgrad tests: f16 NHWC operands, f32 NHWC output, tensor-op f32
    accumulation, SM80. Variants cover stage counts {3, 4} and threadblock
    K-extent {32, 64}; all use the optimized iterator and unity stride."""

    def _run(self, threadblock_k, stages):
        """Build and exercise one dgrad operation for the given threadblock
        K-extent and pipeline stage count."""
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        # Output is f32, so 4-element alignment gives the same 16-byte access.
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=4)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, threadblock_k], stages=stages,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
            min_compute=80, max_compute=80
        )

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage3(self):
        self._run(threadblock_k=32, stages=3)

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage4(self):
        self._run(threadblock_k=32, stages=4)

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage3_64(self):
        self._run(threadblock_k=64, stages=3)

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage4_64(self):
        self._run(threadblock_k=64, stages=4)
if __name__ == '__main__':
    # Allocate 2**26-byte (64 MiB) memory pools before running the tests.
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
# ---------------------------------------------------------------------------
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
# ---------------------------------------------------------------------------
import pycutlass
from pycutlass.conv2d_operation import *
from pycutlass import *
from pycutlass.test import *
import unittest
class Conv2dDgradImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
    """Conv2d dgrad tests: f32 NHWC everywhere, SIMT (no tensor cores), SM80."""

    # NOTE(review): the method name says "Fprop" but the operation built below
    # is dgrad — looks like a copy-paste from the fprop suite; the name is kept
    # to preserve the public test id, but consider renaming.
    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        """Analytic iterator algorithm, SIMT f32 dgrad."""
        math_inst = MathInstruction(
            instruction_shape=[1, 1, 1],
            element_a=cutlass.float32, element_b=cutlass.float32,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.Simt,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=1)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 8], stages=4,
            warp_count=[4, 2, 1],
            math_instruction=math_inst,
            min_compute=80, max_compute=80
        )

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        """Optimized iterator algorithm, SIMT f32 dgrad (warp count differs
        from the analytic test: [2, 4, 1] instead of [4, 2, 1])."""
        math_inst = MathInstruction(
            instruction_shape=[1, 1, 1],
            element_a=cutlass.float32, element_b=cutlass.float32,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.Simt,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=1)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 8], stages=4,
            warp_count=[2, 4, 1],
            math_instruction=math_inst,
            min_compute=80, max_compute=80
        )

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))
if __name__ == '__main__':
    # Allocate 2**26-byte (64 MiB) memory pools before running the tests.
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
# ---------------------------------------------------------------------------
# test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
# NOTE(review): the reference path says "fprop" but the class below tests
# dgrad — confirm which .cu file this section was ported from.
# ---------------------------------------------------------------------------
import pycutlass
from pycutlass import *
from pycutlass.test import *
import unittest
class Conv2dDgradImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
    """Conv2d dgrad tests: tf32 (f32-declared) NHWC operands, f32 NHWC output,
    tensor-op f32 accumulation, SM80."""

    def _run(self, iterator_algorithm):
        """Build and exercise one dgrad operation with the given iterator
        algorithm; everything else is shared between the two tests."""
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 8],
            element_a=cutlass.float32, element_b=cutlass.float32,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 16], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
            min_compute=80, max_compute=80
        )

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=iterator_algorithm,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
        self._run(cutlass.conv.IteratorAlgorithm.analytic)

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
        self._run(cutlass.conv.IteratorAlgorithm.optimized)
if __name__ == '__main__':
    # Allocate 2**26-byte (64 MiB) memory pools before running the tests.
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
# ---------------------------------------------------------------------------
# test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
# NOTE(review): code below references `cutlass` and names like MathInstruction;
# presumably they arrive via `from pycutlass.test import *` — if not, this
# section is missing `from pycutlass import *`; verify.
# ---------------------------------------------------------------------------
import pycutlass
from pycutlass.test import *
import unittest
def conv2d_few_channel_problemsizes(channels):
    """Return the list of Conv2dProblemSize cases used by the few-channels
    fprop tests, parameterized by the input/filter channel count.

    Each entry is (activation NHWC, filter KRSC, padding, stride, dilation,
    cross-correlation mode, split-K slices = 1, groups = 1).
    """
    # (H, W, K, R=S, stride) per case; padding is always (1, 1, 1, 1) and
    # dilation (1, 1).
    case_params = [
        (8, 8, 16, 3, 2),
        (16, 16, 16, 3, 2),
        (16, 16, 16, 7, 1),
        (224, 224, 32, 7, 1),
        (224, 224, 64, 7, 2),
        (224, 224, 64, 5, 1),
        (224, 224, 64, 5, 2),
    ]

    problem_sizes = [
        cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(1, h, w, channels),
            cutlass.Tensor4DCoord(k, r, r, channels),
            cutlass.Tensor4DCoord(1, 1, 1, 1),
            cutlass.MatrixCoord(stride, stride),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1
        )
        for (h, w, k, r, stride) in case_params
    ]

    return problem_sizes
class Conv2dFpropFewChannelsF16NHWCF16NHWCF16HNWCTensorOpF32SM80(unittest.TestCase):
    """SM80 fprop conv2d tests for the few-channels iterator (f16 in/out, f32 accum).

    Exercises input-channel counts smaller than the natural vector access
    width, the case the `few_channels` iterator algorithm exists for.
    NOTE(review): "HNWC" in the class name looks like a typo for "NHWC";
    kept to avoid breaking external references — confirm upstream.
    """

    def _run_few_channels(self, channels, instruction_shape, threadblock_shape,
                          stages, alignment):
        """Build a few-channels fprop operation and run the shared conv2d harness.

        channels          -- input/filter channel count, also selects problem sizes
        instruction_shape -- tensor-op MMA shape [m, n, k]
        threadblock_shape -- CTA tile [M, N, K]
        stages            -- software pipeline depth
        alignment         -- vector access width (elements) for operands A and B
        """
        math_inst = MathInstruction(
            instruction_shape=instruction_shape,
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=alignment)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=alignment)
        # The output tensor always uses full 8-element vector accesses.
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=threadblock_shape, stages=stages,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
            min_compute=80, max_compute=80
        )

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop,
            iterator_algorithm=cutlass.conv.IteratorAlgorithm.few_channels,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(
            test_all_conv2d(operation, conv2d_few_channel_problemsizes(channels)))

    def test_SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2(self):
        self._run_few_channels(2, [16, 8, 16], [128, 128, 64], 3, 2)

    def test_SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_1(self):
        # channels == 1 uses the k=8 MMA shape and a shallower pipeline.
        self._run_few_channels(1, [16, 8, 8], [128, 128, 32], 2, 1)
|
||||
|
||||
if __name__ == '__main__':
    # Reserve 64 MiB pools (host and device) for the test tensors before
    # running any test case.
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
|
||||
@ -0,0 +1,175 @@
|
||||
# test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
def conv2d_fixed_channel_problemsizes(channels):
    """Return fprop Conv2dProblemSize objects with `channels` input channels.

    Every problem uses pad 1, dilation 1, cross-correlation mode, and
    split-K factor 1; only the activation extent, filter extent, and
    conv stride vary per entry.
    """
    # (input (N, H, W), filter (K, R, S), stride (h, w))
    specs = [
        ((1, 8, 8), (16, 3, 3), (2, 2)),
        ((1, 224, 224), (32, 7, 7), (1, 1)),
        ((1, 224, 224), (64, 7, 7), (2, 2)),
        ((1, 224, 224), (64, 5, 5), (1, 1)),
        ((1, 224, 224), (64, 5, 5), (2, 2)),
    ]

    problem_sizes = [
        cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(n, h, w, channels),
            cutlass.Tensor4DCoord(k, r, s, channels),
            cutlass.Tensor4DCoord(1, 1, 1, 1),
            cutlass.MatrixCoord(stride_h, stride_w),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1
        )
        for (n, h, w), (k, r, s), (stride_h, stride_w) in specs
    ]

    return problem_sizes
|
||||
|
||||
class Conv2dFpropFixedChannelsF16NHWCF16NHWCF16HNWCTensorOpF32SM80(unittest.TestCase):
    """SM80 fprop conv2d tests for the fixed-channels iterator (f16 in/out, f32 accum).

    NOTE(review): "HNWC" in the class name looks like a typo for "NHWC";
    kept to avoid breaking external references — confirm upstream.
    """

    def _run_fixed_channels(self, channels):
        """Build a fixed-channels fprop operation and run the shared harness.

        In every original test the A/B access alignment equals the channel
        count, so one parameter drives both.
        """
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=channels)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=channels)
        # The output tensor always uses full 8-element vector accesses.
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
            min_compute=80, max_compute=80
        )

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop,
            iterator_algorithm=cutlass.conv.IteratorAlgorithm.fixed_channels,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(
            test_all_conv2d(operation, conv2d_fixed_channel_problemsizes(channels)))

    def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_8(self):
        self._run_fixed_channels(8)

    def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_4(self):
        self._run_fixed_channels(4)

    def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2(self):
        self._run_fixed_channels(2)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Reserve 64 MiB pools (host and device) for the test tensors before
    # running any test case.
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
|
||||
@ -0,0 +1,291 @@
|
||||
# test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
class Conv2dFpropImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
    """SM80 fprop conv2d tests: f16 everywhere (operands, accumulator, output)."""

    @staticmethod
    def _problem_sizes(channels_a, channels_b, channels_large):
        """Three fprop problems: two tiny 4x4 activations plus one 23x56 case.

        The channel counts are chosen by the callers to be multiples of the
        operand alignment under test but not of the next power of two.
        """
        sizes = [
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, c),
                cutlass.Tensor4DCoord(8, 3, 3, c),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            )
            for c in (channels_a, channels_b)
        ]
        sizes.append(
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 23, 56, channels_large),
                cutlass.Tensor4DCoord(128, 3, 3, channels_large),
                cutlass.Tensor4DCoord(4, 0, 5, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            )
        )
        return sizes

    def _run(self, iterator_algorithm, alignment, problem_sizes=None):
        """Build an f16 fprop operation and run the shared conv2d harness.

        iterator_algorithm -- cutlass.conv.IteratorAlgorithm member
        alignment          -- vector access width (elements) for A and B
        problem_sizes      -- optional explicit problem list; default harness set otherwise
        """
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=alignment)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=alignment)
        # The output tensor always uses full 8-element vector accesses.
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
            min_compute=80, max_compute=80
        )

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop,
            iterator_algorithm=iterator_algorithm,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            element_epilogue=cutlass.float16, stride_support=StrideSupport.Strided,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        if problem_sizes is None:
            self.assertTrue(test_all_conv2d(operation))
        else:
            self.assertTrue(test_all_conv2d(operation, problem_sizes))

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        self._run(cutlass.conv.IteratorAlgorithm.analytic, 8)

    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        self._run(cutlass.conv.IteratorAlgorithm.optimized, 8)

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2(self):
        self._run(cutlass.conv.IteratorAlgorithm.analytic, 2,
                  self._problem_sizes(12, 14, 98))

    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2(self):
        self._run(cutlass.conv.IteratorAlgorithm.optimized, 2,
                  self._problem_sizes(12, 14, 98))

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
        # NOTE(review): the test name says "Analytic" but the original code
        # builds an `optimized`-iterator operation; behavior preserved here —
        # confirm the intended algorithm upstream.
        self._run(cutlass.conv.IteratorAlgorithm.optimized, 4,
                  self._problem_sizes(12, 28, 100))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Reserve 64 MiB pools (host and device) for the test tensors before
    # running any test case.
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
|
||||
@ -0,0 +1,48 @@
|
||||
# test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
class Conv2dFpropImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
    """SM80 fprop conv2d test: f16 operands, f32 output and accumulation."""

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
        """Analytic-iterator fprop over the harness's default problem set."""
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        # f32 output: 4-element vectors give the same 16-byte access width.
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=4)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
            min_compute=80, max_compute=80
        )

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop,
            iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
if __name__ == '__main__':
    # Reserve 64 MiB pools (host and device) for the test tensors before
    # running any test case.
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
|
||||
@ -0,0 +1,87 @@
|
||||
# test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass.conv2d_operation import *
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
class Conv2dFpropImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
    """SM80 fprop conv2d tests on the SIMT (non-tensor-op) path, f32 end to end."""

    def _run_simt(self, iterator_algorithm, warp_count, swizzling_functor):
        """Build an f32 SIMT fprop operation and run the shared conv2d harness.

        iterator_algorithm -- cutlass.conv.IteratorAlgorithm member
        warp_count         -- warp arrangement [M, N, K] within the CTA
        swizzling_functor  -- threadblock swizzle class
        """
        # [1, 1, 1] instruction shape selects scalar FMA (no tensor cores).
        math_inst = MathInstruction(
            instruction_shape=[1, 1, 1],
            element_a=cutlass.float32, element_b=cutlass.float32,
            element_accumulator=cutlass.float32,
            opcode_class=cutlass.OpClass.Simt,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=1)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 8], stages=4,
            warp_count=warp_count,
            math_instruction=math_inst,
            min_compute=80, max_compute=80
        )

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop,
            iterator_algorithm=iterator_algorithm,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=swizzling_functor
        )

        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        self._run_simt(cutlass.conv.IteratorAlgorithm.analytic, [4, 2, 1],
                       cutlass.IdentitySwizzle2)

    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        self._run_simt(cutlass.conv.IteratorAlgorithm.optimized, [2, 4, 1],
                       cutlass.IdentitySwizzle1)
|
||||
|
||||
if __name__ == '__main__':
    # Reserve 64 MiB pools (host and device) for the test tensors before
    # running any test case.
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
|
||||
@ -0,0 +1,98 @@
|
||||
# test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
class Conv2dFpropImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
    """SM80 fprop conv2d tests on the tf32 tensor-op path (f32 element types)."""

    def _make_operation(self, iterator_algorithm, alignment):
        """Construct a tf32 fprop operation with the given iterator and A/B alignment."""
        # [16, 8, 8] is the tf32 tensor-op MMA shape on SM80.
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 8],
            element_a=cutlass.float32, element_b=cutlass.float32,
            element_accumulator=cutlass.float32,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=alignment)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=alignment)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 16], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
            min_compute=80, max_compute=80
        )

        return Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop,
            iterator_algorithm=iterator_algorithm,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
        operation = self._make_operation(cutlass.conv.IteratorAlgorithm.analytic, 4)
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_align2(self):
        operation = self._make_operation(cutlass.conv.IteratorAlgorithm.optimized, 2)
        # 12 channels: a multiple of 2 but not of 4, exercising the align2 path.
        problem_sizes = [
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, 12),
                cutlass.Tensor4DCoord(8, 3, 3, 12),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            )
        ]
        self.assertTrue(test_all_conv2d(operation, problem_sizes))
|
||||
|
||||
if __name__ == '__main__':
    # Reserve 64 MiB pools (host and device) for the test tensors before
    # running any test case.
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
|
||||
@ -0,0 +1,235 @@
|
||||
# test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
class Conv2dStridedDgradImplicitGemmF16NHWCF16NHWCF32NHWCTensorOpF32SM80(unittest.TestCase):
|
||||
def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 128, 32], stages=3,
|
||||
warp_count=[2, 2, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.StridedDgradIdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x256_64x3_64x64x64(self):
|
||||
math_inst = MathInstruction(
|
||||
instruction_shape=[16, 8, 16],
|
||||
element_a=cutlass.float16, element_b=cutlass.float16,
|
||||
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
|
||||
math_operation=MathOperation.multiply_add
|
||||
)
|
||||
|
||||
A = TensorDescription(
|
||||
element=math_inst.element_a,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
B = TensorDescription(
|
||||
element=math_inst.element_b,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=8)
|
||||
C = TensorDescription(
|
||||
element=cutlass.float32,
|
||||
layout=cutlass.TensorNHWC,
|
||||
alignment=4)
|
||||
|
||||
tile_description = TileDescription(
|
||||
threadblock_shape=[128, 256, 64], stages=3,
|
||||
warp_count=[2, 4, 1],
|
||||
math_instruction=math_inst,
|
||||
min_compute=80, max_compute=80
|
||||
)
|
||||
|
||||
operation = Conv2dOperation(
|
||||
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
|
||||
arch=80, tile_description=tile_description, A=A, B=B, C=C,
|
||||
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
|
||||
epilogue_functor=EpilogueFunctor.LinearCombination,
|
||||
swizzling_functor=cutlass.StridedDgradIdentitySwizzle1
|
||||
)
|
||||
|
||||
self.assertTrue(test_all_conv2d(operation))
|
||||
|
||||
def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4_128x128_32x3_64x64x32(self):
    """Strided dgrad, analytic iterator, alignment-4 operands, run on one small problem."""
    instr = MathInstruction(
        instruction_shape=[16, 8, 16],
        element_a=cutlass.float16,
        element_b=cutlass.float16,
        element_accumulator=cutlass.float32,
        opcode_class=cutlass.OpClass.TensorOp,
        math_operation=MathOperation.multiply_add,
    )

    def nhwc(element, alignment):
        # All operands share the NHWC layout; only element type and alignment vary.
        return TensorDescription(
            element=element, layout=cutlass.TensorNHWC, alignment=alignment)

    tensor_a = nhwc(instr.element_a, 4)
    tensor_b = nhwc(instr.element_b, 4)
    tensor_c = nhwc(cutlass.float32, 4)

    tile = TileDescription(
        threadblock_shape=[128, 128, 32],
        stages=3,
        warp_count=[2, 2, 1],
        math_instruction=instr,
        min_compute=80,
        max_compute=80,
    )

    conv_op = Conv2dOperation(
        conv_kind=cutlass.conv.Operator.dgrad,
        iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
        arch=80,
        tile_description=tile,
        A=tensor_a, B=tensor_b, C=tensor_c,
        element_epilogue=cutlass.float32,
        stride_support=StrideSupport.Strided,
        epilogue_functor=EpilogueFunctor.LinearCombination,
        swizzling_functor=cutlass.StridedDgradIdentitySwizzle1,
    )

    # A single tiny problem exercises the channel-count (12) that forces alignment 4.
    problems = [
        cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(1, 4, 4, 12),
            cutlass.Tensor4DCoord(8, 3, 3, 12),
            cutlass.Tensor4DCoord(0, 0, 0, 0),
            cutlass.MatrixCoord(3, 3),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1,
        ),
    ]

    self.assertTrue(test_all_conv2d(conv_op, problems))
|
||||
|
||||
def test_SM80_Device_Conv2d_Strided_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32(self):
    """Strided dgrad, optimized iterator: f16 NHWC inputs, f32 accumulation, 128x128x32 threadblock."""
    instr = MathInstruction(
        instruction_shape=[16, 8, 16],
        element_a=cutlass.float16,
        element_b=cutlass.float16,
        element_accumulator=cutlass.float32,
        opcode_class=cutlass.OpClass.TensorOp,
        math_operation=MathOperation.multiply_add,
    )

    def nhwc(element, alignment):
        # All operands share the NHWC layout; only element type and alignment vary.
        return TensorDescription(
            element=element, layout=cutlass.TensorNHWC, alignment=alignment)

    tensor_a = nhwc(instr.element_a, 8)
    tensor_b = nhwc(instr.element_b, 8)
    tensor_c = nhwc(cutlass.float32, 4)

    tile = TileDescription(
        threadblock_shape=[128, 128, 32],
        stages=3,
        warp_count=[2, 2, 1],
        math_instruction=instr,
        min_compute=80,
        max_compute=80,
    )

    conv_op = Conv2dOperation(
        conv_kind=cutlass.conv.Operator.dgrad,
        iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
        arch=80,
        tile_description=tile,
        A=tensor_a, B=tensor_b, C=tensor_c,
        element_epilogue=cutlass.float32,
        stride_support=StrideSupport.Strided,
        epilogue_functor=EpilogueFunctor.LinearCombination,
        swizzling_functor=cutlass.StridedDgradIdentitySwizzle1,
    )

    self.assertTrue(test_all_conv2d(conv_op))
|
||||
|
||||
def test_SM80_Device_Conv2d_Strided_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32_align4(self):
    """Strided dgrad, optimized iterator, alignment-4 operands, run on two 1x1 stride-2 problems."""
    instr = MathInstruction(
        instruction_shape=[16, 8, 16],
        element_a=cutlass.float16,
        element_b=cutlass.float16,
        element_accumulator=cutlass.float32,
        opcode_class=cutlass.OpClass.TensorOp,
        math_operation=MathOperation.multiply_add,
    )

    def nhwc(element, alignment):
        # All operands share the NHWC layout; only element type and alignment vary.
        return TensorDescription(
            element=element, layout=cutlass.TensorNHWC, alignment=alignment)

    tensor_a = nhwc(instr.element_a, 4)
    tensor_b = nhwc(instr.element_b, 4)
    tensor_c = nhwc(cutlass.float32, 4)

    tile = TileDescription(
        threadblock_shape=[128, 128, 32],
        stages=3,
        warp_count=[2, 2, 1],
        math_instruction=instr,
        min_compute=80,
        max_compute=80,
    )

    conv_op = Conv2dOperation(
        conv_kind=cutlass.conv.Operator.dgrad,
        iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
        arch=80,
        tile_description=tile,
        A=tensor_a, B=tensor_b, C=tensor_c,
        element_epilogue=cutlass.float32,
        stride_support=StrideSupport.Strided,
        epilogue_functor=EpilogueFunctor.LinearCombination,
        swizzling_functor=cutlass.StridedDgradIdentitySwizzle1,
    )

    # Even (56) and odd (55) spatial extents with stride 2 — the two problems
    # differ only in H/W, so build them in one comprehension.
    problems = [
        cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(1, hw, hw, 12),
            cutlass.Tensor4DCoord(8, 1, 1, 12),
            cutlass.Tensor4DCoord(0, 0, 0, 0),
            cutlass.MatrixCoord(2, 2),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1,
        )
        for hw in (56, 55)
    ]

    self.assertTrue(test_all_conv2d(conv_op, problems))
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # 64 MiB device/host memory pools are sufficient for these unit-sized problems.
    pool_bytes = 2**26
    pycutlass.get_memory_pool(pool_bytes, pool_bytes)
    unittest.main()
|
||||
@ -0,0 +1,86 @@
|
||||
# test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
class Conv2dWgradImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
    """Wgrad conv2d tests: f16 NHWC operands with f16 accumulation on SM80 tensor cores."""

    def _run_wgrad(self, iterator_algorithm):
        """Build and exercise the wgrad operation; only the iterator algorithm varies."""
        instr = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16,
            element_b=cutlass.float16,
            element_accumulator=cutlass.float16,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )

        tensor_a = TensorDescription(
            element=instr.element_a, layout=cutlass.TensorNHWC, alignment=8)
        tensor_b = TensorDescription(
            element=instr.element_b, layout=cutlass.TensorNHWC, alignment=8)
        tensor_c = TensorDescription(
            element=cutlass.float16, layout=cutlass.TensorNHWC, alignment=8)

        tile = TileDescription(
            threadblock_shape=[128, 128, 64],
            stages=3,
            warp_count=[2, 2, 1],
            math_instruction=instr,
            min_compute=80,
            max_compute=80,
        )

        conv_op = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.wgrad,
            iterator_algorithm=iterator_algorithm,
            arch=80,
            tile_description=tile,
            A=tensor_a, B=tensor_b, C=tensor_c,
            element_epilogue=cutlass.float16,
            stride_support=StrideSupport.Strided,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1,
        )

        self.assertTrue(test_all_conv2d(conv_op))

    def test_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        """Analytic iterator variant."""
        self._run_wgrad(cutlass.conv.IteratorAlgorithm.analytic)

    def test_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        """Optimized iterator variant."""
        self._run_wgrad(cutlass.conv.IteratorAlgorithm.optimized)
|
||||
|
||||
if __name__ == '__main__':
    # 64 MiB device/host memory pools are sufficient for these unit-sized problems.
    pool_bytes = 2**26
    pycutlass.get_memory_pool(pool_bytes, pool_bytes)
    unittest.main()
|
||||
@ -0,0 +1,224 @@
|
||||
# test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
class Conv2dWgradImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
    """Wgrad conv2d tests: f16 NHWC inputs, f32 NHWC output/accumulator, SM80 tensor cores."""

    def _run_wgrad(self, iterator_algorithm, instruction_shape, alignment_ab,
                   alignment_c, threadblock_shape, warp_count, problem_sizes=None):
        """Assemble a wgrad operation from the varying knobs and run the conv2d suite.

        When *problem_sizes* is None the default problem sweep is used.
        """
        instr = MathInstruction(
            instruction_shape=instruction_shape,
            element_a=cutlass.float16,
            element_b=cutlass.float16,
            element_accumulator=cutlass.float32,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )

        tensor_a = TensorDescription(
            element=instr.element_a, layout=cutlass.TensorNHWC,
            alignment=alignment_ab)
        tensor_b = TensorDescription(
            element=instr.element_b, layout=cutlass.TensorNHWC,
            alignment=alignment_ab)
        tensor_c = TensorDescription(
            element=cutlass.float32, layout=cutlass.TensorNHWC,
            alignment=alignment_c)

        tile = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=3,
            warp_count=warp_count,
            math_instruction=instr,
            min_compute=80,
            max_compute=80,
        )

        conv_op = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.wgrad,
            iterator_algorithm=iterator_algorithm,
            arch=80,
            tile_description=tile,
            A=tensor_a, B=tensor_b, C=tensor_c,
            element_epilogue=cutlass.float32,
            stride_support=StrideSupport.Strided,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1,
        )

        if problem_sizes is None:
            self.assertTrue(test_all_conv2d(conv_op))
        else:
            self.assertTrue(test_all_conv2d(conv_op, problem_sizes))

    @staticmethod
    def _align4_problem():
        """Tiny 12-channel problem that forces the alignment-4 iterator path."""
        return [
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, 12),
                cutlass.Tensor4DCoord(8, 3, 3, 12),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1,
            ),
        ]

    def test_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
        """Analytic iterator, alignment 8, 128x128x16 threadblock."""
        self._run_wgrad(cutlass.conv.IteratorAlgorithm.analytic,
                        [16, 8, 8], 8, 4, [128, 128, 16], [2, 2, 1])

    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
        """Optimized iterator, alignment 8, 128x128x16 threadblock."""
        self._run_wgrad(cutlass.conv.IteratorAlgorithm.optimized,
                        [16, 8, 8], 8, 4, [128, 128, 16], [2, 2, 1])

    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_64x256_32x4_64x64x32(self):
        """Optimized iterator with a tall 64x256x32 threadblock and [1, 4, 1] warps."""
        self._run_wgrad(cutlass.conv.IteratorAlgorithm.optimized,
                        [16, 8, 16], 8, 4, [64, 256, 32], [1, 4, 1])

    def test_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4(self):
        """Analytic iterator, alignment 4, tiny explicit problem."""
        self._run_wgrad(cutlass.conv.IteratorAlgorithm.analytic,
                        [16, 8, 8], 4, 4, [128, 128, 16], [2, 2, 1],
                        self._align4_problem())

    def test_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4(self):
        """Optimized iterator, alignment 4, tiny explicit problem."""
        self._run_wgrad(cutlass.conv.IteratorAlgorithm.optimized,
                        [16, 8, 8], 4, 4, [128, 128, 16], [2, 2, 1],
                        self._align4_problem())
|
||||
|
||||
if __name__ == '__main__':
    # 64 MiB device/host memory pools are sufficient for these unit-sized problems.
    pool_bytes = 2**26
    pycutlass.get_memory_pool(pool_bytes, pool_bytes)
    unittest.main()
|
||||
@ -0,0 +1,87 @@
|
||||
# test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass.conv2d_operation import *
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
class Conv2dWgradImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
    """Wgrad conv2d tests: f32 NHWC operands on the SIMT (CUDA-core) path, SM80."""

    def _run_wgrad(self, iterator_algorithm):
        """Build and exercise the wgrad operation; only the iterator algorithm varies."""
        instr = MathInstruction(
            instruction_shape=[1, 1, 1],
            element_a=cutlass.float32,
            element_b=cutlass.float32,
            element_accumulator=cutlass.float32,
            opcode_class=cutlass.OpClass.Simt,
            math_operation=MathOperation.multiply_add,
        )

        tensor_a = TensorDescription(
            element=instr.element_a, layout=cutlass.TensorNHWC, alignment=4)
        tensor_b = TensorDescription(
            element=instr.element_b, layout=cutlass.TensorNHWC, alignment=4)
        tensor_c = TensorDescription(
            element=cutlass.float32, layout=cutlass.TensorNHWC, alignment=1)

        tile = TileDescription(
            threadblock_shape=[128, 128, 8],
            stages=4,
            warp_count=[2, 4, 1],
            math_instruction=instr,
            min_compute=80,
            max_compute=80,
        )

        conv_op = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.wgrad,
            iterator_algorithm=iterator_algorithm,
            arch=80,
            tile_description=tile,
            A=tensor_a, B=tensor_b, C=tensor_c,
            element_epilogue=cutlass.float32,
            stride_support=StrideSupport.Strided,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1,
        )

        self.assertTrue(test_all_conv2d(conv_op))

    def test_SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        """Analytic iterator variant."""
        self._run_wgrad(cutlass.conv.IteratorAlgorithm.analytic)

    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        """Optimized iterator variant."""
        self._run_wgrad(cutlass.conv.IteratorAlgorithm.optimized)
|
||||
|
||||
if __name__ == '__main__':
    # 64 MiB device/host memory pools are sufficient for these unit-sized problems.
    pool_bytes = 2**26
    pycutlass.get_memory_pool(pool_bytes, pool_bytes)
    unittest.main()
|
||||
@ -0,0 +1,98 @@
|
||||
# test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
|
||||
import pycutlass
|
||||
from pycutlass import *
|
||||
from pycutlass.test import *
|
||||
import unittest
|
||||
|
||||
class Conv2dWgradImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
    """Wgrad conv2d tests: tf32 NHWC inputs, f32 NHWC output, SM80 tensor cores."""

    def _run_wgrad(self, alignment_ab, alignment_c, threadblock_shape,
                   problem_sizes=None):
        """Build an optimized-iterator wgrad operation and run the conv2d suite.

        Both tests use the optimized iterator; alignments, the threadblock K
        extent, and the (optional) explicit problem list are what vary.
        """
        instr = MathInstruction(
            instruction_shape=[16, 8, 8],
            element_a=cutlass.float32,
            element_b=cutlass.float32,
            element_accumulator=cutlass.float32,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )

        tensor_a = TensorDescription(
            element=instr.element_a, layout=cutlass.TensorNHWC,
            alignment=alignment_ab)
        tensor_b = TensorDescription(
            element=instr.element_b, layout=cutlass.TensorNHWC,
            alignment=alignment_ab)
        tensor_c = TensorDescription(
            element=cutlass.float32, layout=cutlass.TensorNHWC,
            alignment=alignment_c)

        tile = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=3,
            warp_count=[2, 2, 1],
            math_instruction=instr,
            min_compute=80,
            max_compute=80,
        )

        conv_op = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.wgrad,
            iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80,
            tile_description=tile,
            A=tensor_a, B=tensor_b, C=tensor_c,
            element_epilogue=cutlass.float32,
            stride_support=StrideSupport.Strided,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1,
        )

        if problem_sizes is None:
            self.assertTrue(test_all_conv2d(conv_op))
        else:
            self.assertTrue(test_all_conv2d(conv_op, problem_sizes))

    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
        """Alignment-4 inputs, alignment-8 output, default problem sweep."""
        self._run_wgrad(4, 8, [128, 128, 16])

    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_align1(self):
        """Alignment-1 inputs on a single-channel problem with unit padding."""
        problems = [
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 8, 8, 1),
                cutlass.Tensor4DCoord(1, 3, 3, 1),
                cutlass.Tensor4DCoord(1, 1, 1, 1),
                cutlass.MatrixCoord(1, 1),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1,
            ),
        ]
        self._run_wgrad(1, 4, [128, 128, 32], problems)
|
||||
|
||||
if __name__ == '__main__':
    # 64 MiB device/host memory pools are sufficient for these unit-sized problems.
    pool_bytes = 2**26
    pycutlass.get_memory_pool(pool_bytes, pool_bytes)
    unittest.main()
|
||||
10
tools/library/scripts/pycutlass/test/conv/run_all_tests.py
Normal file
10
tools/library/scripts/pycutlass/test/conv/run_all_tests.py
Normal file
@ -0,0 +1,10 @@
|
||||
import pycutlass
|
||||
import unittest
|
||||
from pycutlass.memory_manager import *
|
||||
|
||||
if __name__ == '__main__':
    # 4 GiB pools: the discovered suites allocate many activation/filter tensors.
    pycutlass.get_memory_pool(2**32, 2**32)
    # Run every conv2d_*.py test module found under the current directory.
    suite = unittest.TestLoader().discover('./', 'conv2d_*.py')
    unittest.runner.TextTestRunner().run(suite)
|
||||
Reference in New Issue
Block a user