cutlass

youngkingdom/cutlass

Fork 0

Commit Graph

Select branches

Hide Pull Requests

2.11

Deepseek

cutlass-3.5.0

feature/2.10/updates_before_tagging

feature/3.0.0

feature/enable-mxfp-group-gemm-sm120

main

oss_ci

redirect

release/3.2.x

release/4.2

strided_output_conv

thakkarV-patch-1

thakkarv/4.0-changelog

v4

#10

#100

#1006

#1007

#1012

#1019

#102

#1021

#1022

#1024

#1035

#1037

#1041

#1043

#1047

#1049

#1053

#1059

#1065

#1068

#107

#1071

#1072

#1073

#1078

#1080

#1082

#1084

#1089

#1090

#1091

#1097

#1100

#1101

#1102

#1104

#1109

#1112

#1113

#1116

#1119

#1120

#1121

#1124

#1127

#1128

#1128

#1132

#1134

#1135

#1140

#1143

#1146

#1147

#1153

#1167

#1168

#1169

#1172

#1173

#1175

#1177

#1179

#1180

#1185

#1187

#1189

#1190

#1191

#1192

#1193

#1194

#1195

#1196

#1197

#1200

#1209

#1218

#1218

#1224

#1225

#1232

#1249

#1251

#1257

#1258

#1264

#1273

#1274

#1274

#1275

#1278

#1279

#1286

#1287

#1294

#13

#1302

#1303

#1305

#1306

#1308

#1318

#1325

#1328

#133

#1339

#134

#1346

#135

#1350

#1357

#1377

#1380

#1380

#1384

#1386

#1400

#1404

#141

#1411

#1413

#1415

#1416

#1417

#1420

#1428

#1433

#1437

#1439

#1451

#1453

#1453

#1454

#1458

#1465

#1468

#1469

#147

#1470

#1470

#1471

#1473

#1477

#1479

#148

#1486

#1491

#1494

#1495

#1498

#15

#150

#151

#1512

#1517

#1526

#1527

#1528

#1528

#1529

#1534

#1534

#1539

#1539

#1543

#1553

#1554

#1554

#1569

#1578

#1584

#1593

#1593

#1604

#1604

#1618

#1618

#162

#1623

#1630

#1632

#1632

#1638

#1639

#1641

#1647

#1650

#1652

#1653

#1653

#1656

#1656

#1658

#1661

#1664

#1665

#1666

#1667

#1673

#1674

#1674

#1679

#1680

#1695

#1700

#1702

#1702

#1708

#1709

#1713

#1714

#1727

#1733

#1753

#1765

#1771

#1774

#1776

#1782

#1784

#1787

#179

#1790

#1795

#1796

#1799

#1803

#1820

#1826

#1832

#1832

#1833

#1835

#1843

#1850

#1853

#1855

#1855

#1856

#1864

#187

#1870

#1871

#1878

#1880

#1883

#1887

#1887

#189

#1890

#1891

#1891

#1894

#1896

#1899

#1907

#1912

#192

#1925

#1926

#193

#1931

#1932

#1935

#1942

#1951

#1960

#1961

#1962

#1966

#1968

#1972

#1977

#1982

#1983

#1989

#1993

#2

#2005

#2020

#2021

#2024

#2026

#2030

#2031

#2033

#2035

#2035

#2037

#2045

#2051

#2059

#2066

#2069

#2078

#2078

#2082

#2086

#2089

#2090

#2095

#2104

#2110

#2111

#2112

#2120

#2122

#2123

#2124

#2129

#2130

#2134

#2135

#2136

#2137

#2139

#214

#2141

#2141

#2142

#2143

#2155

#2156

#2159

#216

#2160

#2160

#2161

#2167

#217

#2171

#2172

#2174

#2177

#2179

#2179

#218

#2180

#2185

#2188

#219

#2194

#2195

#2196

#2199

#220

#2203

#2204

#2211

#2213

#2216

#2219

#2220

#2221

#2224

#2234

#2248

#2249

#2250

#2251

#2255

#2256

#2257

#2257

#2267

#2269

#2269

#2270

#2273

#2275

#2276

#2279

#228

#2283

#2285

#2290

#2291

#2292

#2294

#2295

#2298

#2299

#230

#2305

#2305

#2307

#2311

#2315

#2317

#2318

#2324

#2328

#2328

#2329

#2330

#2333

#2340

#235

#2351

#2358

#2359

#2361

#2366

#237

#2370

#2371

#2374

#2375

#2377

#2378

#2379

#2383

#2385

#2387

#239

#2390

#2391

#2398

#2399

#24

#2400

#2401

#2402

#2402

#2407

#2414

#2416

#2417

#2419

#2420

#2421

#2422

#2425

#2429

#2436

#2439

#2447

#2448

#2457

#2457

#246

#2462

#2465

#2466

#2469

#2469

#247

#2472

#2477

#2480

#2481

#2485

#2489

#2492

#25

#2502

#2502

#2506

#251

#2510

#2511

#2514

#2516

#2517

#2526

#2527

#2527

#2529

#2536

#2537

#2538

#2540

#2543

#2544

#2548

#2548

#2553

#2554

#2554

#2556

#2558

#2558

#256

#2561

#2562

#2564

#2564

#2565

#2567

#2567

#2568

#2568

#2571

#2575

#2579

#2580

#2582

#2587

#259

#2591

#2592

#2594

#2594

#2596

#2598

#2599

#26

#2605

#2605

#2607

#2609

#2610

#2610

#2611

#2612

#2615

#2621

#2621

#2623

#2627

#2635

#2638

#2639

#264

#2644

#2645

#2646

#2646

#2648

#2650

#2651

#2652

#266

#2660

#2661

#2661

#2666

#2667

#2669

#2670

#2670

#2671

#2671

#2678

#2678

#2680

#2680

#2682

#2682

#2684

#2685

#2685

#2686

#2687

#2687

#2688

#2688

#2689

#2689

#2690

#2690

#2691

#2694

#2694

#2702

#2702

#2704

#2704

#2705

#2709

#2713

#2713

#2714

#2718

#2718

#2719

#2719

#272

#2721

#2721

#2729

#2729

#2731

#2731

#2734

#2734

#2739

#2739

#274

#2740

#2740

#2741

#2741

#2742

#277

#28

#285

#290

#290

#292

#295

#297

#298

#30

#301

#303

#305

#306

#308

#313

#318

#325

#33

#331

#341

#345

#363

#364

#365

#366

#375

#378

#379

#38

#381

#382

#383

#386

#388

#391

#392

#393

#394

#402

#403

#406

#407

#412

#413

#415

#419

#42

#424

#429

#433

#437

#440

#441

#442

#444

#446

#447

#449

#450

#451

#452

#453

#456

#46

#467

#468

#469

#47

#471

#472

#473

#477

#478

#479

#48

#480

#482

#486

#487

#488

#489

#493

#497

#497

#503

#507

#514

#516

#518

#518

#52

#53

#531

#532

#542

#543

#546

#550

#559

#562

#563

#564

#574

#576

#586

#587

#590

#597

#6

#6

#603

#604

#607

#608

#61

#615

#616

#618

#62

#620

#622

#623

#624

#626

#628

#629

#63

#631

#632

#633

#634

#635

#636

#637

#638

#639

#64

#641

#645

#646

#65

#650

#658

#659

#662

#669

#670

#671

#672

#677

#682

#691

#698

#7

#70

#701

#703

#704

#714

#717

#719

#720

#726

#727

#728

#730

#741

#743

#749

#752

#753

#754

#759

#760

#761

#764

#765

#766

#768

#773

#775

#776

#779

#786

#789

#790

#791

#796

#8

#805

#806

#807

#812

#82

#822

#823

#826

#828

#829

#83

#830

#832

#836

#838

#839

#841

#842

#844

#845

#846

#849

#853

#855

#857

#858

#862

#869

#87

#871

#878

#879

#883

#885

#891

#892

#893

#895

#896

#897

#9

#903

#905

#91

#912

#914

#915

#916

#917

#918

#920

#921

#925

#927

#932

#936

#937

#939

#940

#942

#945

#950

#951

#952

#957

#958

#96

#961

#967

#970

#976

#977

#979

#984

#992

#993

#995

#996

v0.1.0

v0.1.1

v1.0.0

v1.0.1

v1.1.0

v1.2.0

v1.3.0

v1.3.2

v1.3.3

v2.0.0

v2.1.0

v2.10.0

v2.11.0

v2.2.0

v2.3.0

v2.4.0

v2.5.0

v2.6.0

v2.6.1

v2.7.0

v2.8.0

v2.9.0

v2.9.1

v3.0.0

v3.1.0

v3.2.0

v3.2.1

v3.2.2

v3.3.0

v3.4.0

v3.4.1

v3.5.0

v3.5.1

v3.6.0

v3.7.0

v3.8.0

v3.9.0

v3.9.1

v3.9.2

v4.0.0

v4.1.0

v4.2.0

v4.2.1

40f124ef27 [CUTLASS] Add GNA to PUBLICATIONS.md (#2276) Ali Hassani 2025-05-02 16:57:19 -04:00
89f6bf2739 Fix group scale gemm when K==128 (#2275) Jiazhen Han 2025-05-02 12:41:18 -07:00
f535c33634 3.9.1 doc/version change (#2273) v3.9.1 Haicheng Wu 2025-05-01 00:27:00 -04:00
e3cb8a773a Import cuda, cudart, nvrtc lazily (#2251) Michael Lazos 2025-04-30 20:10:33 -07:00
c4bdfe821c Lazy scipy import (#2250) Michael Lazos 2025-04-30 13:10:00 -07:00
b3ce7e12b7 Make cc a positional argument (#2249) Michael Lazos 2025-04-30 13:09:25 -07:00
fe75ead92e Import pydot lazily (#2248) Michael Lazos 2025-04-30 13:08:17 -07:00
35136f5564 Fix wrong detection of python version for use_rmm. (#2224) Ruoxi 2025-04-30 12:29:33 -07:00
e5b810bed1 Use cudaMemcpyAsync in gemm grouped with kRequiresPrecomputation schedule. (#2256) Qi Yuhang 2025-05-01 03:28:05 +08:00
2b78c2fe31 cherry-pick feature/hopper-blockwise-generalization-optimization (#2270) Lain 2025-04-29 13:47:22 -07:00
697126019e fix blackwell grouped groupwise hang (#2267) Haicheng Wu 2025-04-29 11:54:20 -04:00
e94e888df3 Update CHANGELOG.md v3.9.0 Haicheng Wu 2025-04-24 21:51:34 -04:00
be73ad20a5 Update CHANGELOG.md for 3.9 Haicheng Wu 2025-04-24 16:54:06 -04:00
f02a7c2976 Update README.md for 3.9 Haicheng Wu 2025-04-24 16:51:45 -04:00
331a1f5b3f cutlass 3.9 update (#2255) Yujia Zhai 2025-04-24 12:42:40 -07:00
8e345c5c5b fix_missing_stdint (#2199) 吴坎 2025-04-24 10:21:22 +08:00
81a43e6d92 Set EpiTile correctly when TileN is not divisible by 32 (#2220) Tri Dao 2025-04-21 00:02:51 -04:00
ade6376fa0 [SM90] Change register allocation for TileN=208 to avoid spills (#2219) Tri Dao 2025-04-21 00:02:30 -04:00
bb4dd682dd Fix broken links and alt text in cluster launch control docs (#2234) milesvant 2025-04-20 21:01:12 -07:00
5e497243f7 fix: fig link in cute docs (#2216) Zhang_kg 2025-04-11 02:51:41 +08:00
b3f3c7758c Update tile_iterator.cu (#2204) Haisheng Chen 2025-04-10 11:49:58 -07:00
9e1b649827 fix-left-inverse-for-nvcc114 (#2196) reed 2025-04-11 02:48:46 +08:00
5120b21cc3 suppress compilation warnings (#2195) reed 2025-04-11 02:48:01 +08:00
dd76dec4ef [Doc] Make C++ code more plausible (#2156) Ronan Keryell 2025-04-10 11:35:46 -07:00
19cc2a5feb add support for sm89 in cute and the unit tests (#2177) kf-zhang 2025-04-11 02:16:36 +08:00
09df6ac464 [Doc]fix typo (#2174) liwenju0 2025-04-11 00:46:53 +08:00
df8a550d39 Update mma_atom.hpp (#2159) liujshi 2025-04-03 23:42:10 +08:00
79fc51f4b8 v3.9 update (#2213) Yujia Zhai 2025-04-02 23:10:16 -07:00
6f4921858b v3.9 update (#2203) Yujia Zhai 2025-04-02 12:11:18 -07:00
62750a2b75 v3.9 (#2185) Yujia Zhai 2025-03-20 22:52:23 -07:00
8c4d1dc47d Treat negative zero as equivalent to positive zero in sm90_sparse_gemm_compressor.hpp (#2110) Tyler Michael Smith 2025-03-20 22:44:17 -07:00
3fe62887d8 adding blackwell (#2143) Mohamed Mekkouri 2025-03-18 03:20:40 +01:00
bd03b22f64 fix typo (#2136) dongxiao 2025-03-18 10:19:43 +08:00
6c6b78550e Fix SM90 beta=1 hang and stream-K launch errors (#2172) Jack Kosaian 2025-03-13 13:07:37 -05:00
06e560d98a Blockwise/Groupwise kernel improvement and programatic dependent launch enablement (#2161) dePaul Miller 2025-03-10 11:36:11 -07:00
e9a75581fe DeepGemm Support - Step 2 (#2142) Deepseek Yuxi Chi 2025-02-28 23:11:59 +08:00
df18f5e4f5 Improvements for: Groupwise scaling along M for FP8 gemm (#2095) Lucas Wilkinson 2025-02-27 22:39:29 -05:00
ca4fdbea70 Blockwise and Groupwise GEMM for Blackwell and Improvements for Hopper (#2139) dePaul Miller 2025-02-26 09:44:58 -08:00
ac210faef8 DeepGemm Support (#2137) Yuxi Chi 2025-02-26 20:01:12 +08:00
15f5468872 Migrate FlashMLA codes to example. (#2135) Junkai-Wu 2025-02-26 14:29:07 +08:00
af5519d938 Flash MLA Support - Step 2 (#2134) myu-guo 2025-02-26 12:18:03 +08:00
415d587ebf Flash MLA support (#2130) myu-guo 2025-02-24 21:31:56 +08:00
eefa171318 [EVT] Fix Row/Col broadcast with array arguments (#2120) Josh Fromm 2025-02-21 14:47:30 -08:00
afa1772203 truncate name for cutlass profiler (#2124) v3.8.0 Yujia Zhai 2025-02-20 21:16:56 -08:00
9b3772dfa6 Hopper Grouped GEMM support for FP8 Accum (#2123) ANIKET SHIVAM 2025-02-20 18:55:26 -08:00
b84e9802d8 update 3.8 v2 (#2112) Yujia Zhai 2025-02-19 19:03:14 -08:00
e9627ce55b Always use cudaGetDriverEntryPoint with CUDA 12 (#2086) dan_the_3rd 2025-02-11 19:04:25 +01:00
ad6e1ec19c Add ParetoQ to PUBLICATIONS.md (#2089) Sijia(Jackson) Chen 2025-02-10 13:47:02 -08:00
0642d46dd4 Update 0x_gemm_tutorial.md (#2090) botbw 2025-02-11 05:46:43 +08:00
833f6990e0 v3.8.0 update (#2082) Yujia Zhai 2025-02-06 18:33:40 -08:00
affd1b693d [EVT] Add support for Row/Col broadcast PtrArray (#2033) Josh Fromm 2025-02-02 09:10:07 -08:00
6f55278121 bugfix generic-k code in top-k with softmax (#1993) Tadej Ciglarič 2025-02-01 01:05:35 +01:00
3c28697b9f Groupwise scaling along M for FP8 gemm (#2037) Liang 2025-02-01 02:51:28 +08:00
bdd641790a Update README.md Haicheng Wu 2025-01-28 18:08:13 -05:00
cc19d4d22b fix a readme broken link (#2069) Haicheng Wu 2025-01-28 18:03:34 -05:00
47daa33c61 fix cuda 12.6 issues (#2066) Haicheng Wu 2025-01-28 17:28:29 -05:00
389e493055 CUTLASS 3.8 Release (#2059) mihir-awatramani 2025-01-24 23:44:06 -08:00
9eb01fa0b0 update 3.7 docs (#2051) Yujia Zhai 2025-01-23 12:13:50 -08:00
b78588d163 CUTLASS 3.7 (#2045) v3.7.0 Yujia Zhai 2025-01-18 06:53:07 -08:00
902dff3663 fix assertion in integer_subbytes.h (#1961) bobliao 2025-01-10 11:47:58 +08:00
ef5620dd1d Blockwise Scaling for FP8 (#1932) Manish Gupta 2025-01-09 08:22:09 -08:00
375e284e6a Add Line Break (#2020) Lei Mao 2025-01-08 20:46:59 -08:00
52b35e90ce Fix Typos (#2021) Lei Mao 2025-01-08 20:46:28 -08:00
24f991e879 Fix typo in library_defaults.py (#2024) ZincCat 2025-01-08 12:44:11 -08:00
51b25e7b58 Add vector-types back to platform.h (#2026) Driss Guessous 2025-01-08 12:31:59 -08:00
7de6a59784 Add half->int8 saturate conversion to promise valid range (#1983) ZZK 2025-01-08 22:01:07 +08:00
c506e16788 fix mem fence (#2030) Yujia Zhai 2025-01-07 16:02:26 -08:00
7494a180a4 fix bug: arch/mma_sm60.h Mma<2,2,1> calculate wrong (#1989) Dongxu.Wang 2025-01-07 11:05:12 +08:00
cffd5d32b7 Update 0x_gemm_tutorial.md (#1982) Andrew O'Neill 2025-01-06 19:04:35 -08:00
bf9da7b76c Update CHANGELOG.md v3.6.0 Haicheng Wu 2024-12-25 17:11:15 -05:00
3d261a5974 3.6.0 update (#2005) Yujia Zhai 2024-12-24 22:34:40 -08:00
e1cd8c7866 Fix Typo (#1962) Lei Mao 2024-12-10 19:07:37 -08:00
33c584364e Fix CuTe README Typo (#1951) Lei Mao 2024-12-10 19:05:40 -08:00
2b6cfd34d1 fix a typo that fails the compiling when ElementScale is not the same as MmaType (#1977) Lain 2024-12-10 12:54:44 -08:00
4c42f73fda Improve mixed dtype GEMM (#1972) Lain 2024-12-06 10:33:22 -08:00
80243e0b8c add {uint4, uint2, int2} => {fp16, bf16} conversion (#1966) Lain 2024-12-03 11:03:43 -08:00
b0e09d7cd3 Fix cutlass python library with cuda 12.6.2.post1 (#1942) dan_the_3rd 2024-11-18 15:06:32 +01:00
8aa95dbb88 Fix the racing condition of mixed-input gemm when writing the registers (#1931) Lain 2024-11-08 10:15:54 -08:00
d656afbd2a fix undefined in device code error (#1880) LiYu Lu 2024-11-07 03:56:54 +08:00
32e3c38aef remove restriction of stride == kernel in nhwc_pooling (#1896) LiuQiang 2024-11-07 03:54:53 +08:00
9004ed2d1b Update publications (#1912) Wenlei Bao 2024-11-06 11:54:15 -08:00
19f51596e8 feat: support kFactor 8 used in mma tensor op tile iterator (#1512) chenwei 2024-10-29 23:56:59 +08:00
e8a8b69365 Refactor some GroupedGEMM logic (#1899) azhurkevich 2024-10-25 17:14:01 -07:00
08a49953a0 Add a print for the uint{x}b_t type. (#1871) LiYu Lu 2024-10-25 02:39:22 +08:00
a424ca6cf9 fix wrong A/BLayout in MMA_Traits for binary mma and append other MMA_Traits support (#1856) Caleb_Du 2024-10-25 02:38:35 +08:00
be692b48b0 remove redundant hardcoded packing configs in mixed dtype gemm (#1894) Lain 2024-10-23 11:24:09 -07:00
12626bcfe4 Update gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu with include "cutlass/gemm/device/gemm_universal.h" (#1569) 侯奇 2024-10-24 00:56:36 +08:00
f02913c34e Include of regular_tile_iterator.h fixed for NVRTC (#1765) MaxAkaAltmer 2024-10-23 19:55:59 +03:00
03e3bffaec Adjusting code indentation (#1639) 103yiran 2024-10-24 00:55:02 +08:00
e5f3caf145 Fix README (#1658) Lei Mao 2024-10-23 09:52:43 -07:00
83ae20c740 added mapping for bf16 to torch::kBFloat16 (#1843) Bogumil Sapinski Mobica 2024-10-23 18:48:31 +02:00
b0c09ed077 fix by adding public (#1753) Xinyu Yang 2024-10-24 00:45:58 +08:00
ea69cc2849 fix typo (#1853) sijialou 2024-10-24 00:45:28 +08:00
f3a3bfcbf2 add maximum support (#1833) Xinyu Yang 2024-10-24 00:44:56 +08:00
d65266a868 Add all supported GMMA shapes (#1890) Sergey Klevtsov 2024-10-22 15:13:36 -07:00
5b50a8faaf Add GMMA shape m64n40k16 (#1864) Tri Dao 2024-10-21 17:41:47 -07:00
08101d9d0c Improve sm90 mixed dtype kernel (#1883) Sergey Klevtsov 2024-10-17 17:06:38 -07:00
755194a7bd add is_last_tile Haicheng Wu 2024-10-17 12:11:02 -07:00
53668799b2 Handle MNK Sm90{Row, Col}Reduction problem shapes (#1803) Saagar Jha 2024-10-14 16:46:20 -07:00
cc3c29a81a CUTLASS 3.6.0 (#1850) Yujia Zhai 2024-10-09 12:33:27 -07:00