Compare commits: codex/remo...copilot/di (669 commits)

| SHA1 |
| --- |
| 7557a67655 | |||
| 1af476b0e9 | |||
| 8c3b1c7c62 | |||
| d4aa144343 | |||
| fcb1d570bb | |||
| accb8fab07 | |||
| 5b0448104f | |||
| f7a6682872 | |||
| a9fe0793f2 | |||
| 7568a282b9 | |||
| 1da3309ace | |||
| 5522fb274b | |||
| 0f95a1c3f2 | |||
| ded24e3e54 | |||
| d6704dd099 | |||
| ecca3fee76 | |||
| 9a0d2f0d92 | |||
| ad3ec89532 | |||
| 3481e40743 | |||
| 5e72216d17 | |||
| 1a33aacf82 | |||
| 7ba6aa8f56 | |||
| ab2eb27b74 | |||
| 3c7fefdeba | |||
| 1891cf605a | |||
| 8df98c2161 | |||
| 4fb8771cc0 | |||
| 413ef7a3b4 | |||
| 8b62495076 | |||
| 83fd49b1fc | |||
| a4a4f0f617 | |||
| 0d8161b075 | |||
| d2c33c397a | |||
| f6d5f5888c | |||
| 9007bf57e6 | |||
| f257544709 | |||
| 0b51c9bd8b | |||
| d3ab240f39 | |||
| 94666612a9 | |||
| 4fe5895361 | |||
| 111faf1118 | |||
| 6afc28a9ba | |||
| 141e6a0505 | |||
| 130aa8cbcf | |||
| e3d8186666 | |||
| f5710ef02a | |||
| a8c02fb5bf | |||
| 02af36df36 | |||
| e88bdd60d9 | |||
| 05e034f085 | |||
| 936643a868 | |||
| b186149e8e | |||
| 2abbd351ef | |||
| 446912d1cb | |||
| a00d6254e9 | |||
| 05181cc57f | |||
| 259504e147 | |||
| 0484b64248 | |||
| f58d9b6404 | |||
| 44b5ce956d | |||
| 7a865f2325 | |||
| 2fa90bda27 | |||
| 0291fbf65c | |||
| b46e4a06f1 | |||
| d34f5fe939 | |||
| bdb01a38fe | |||
| 5b3c35a68e | |||
| 61fbfe5274 | |||
| 255e34ca50 | |||
| a8d2e326ec | |||
| 53a56e658b | |||
| 69f064062b | |||
| 921e78f4bb | |||
| 6ebffafbb6 | |||
| 3b96f85c36 | |||
| 23ad820553 | |||
| 5d3be3ba4c | |||
| 4f882be4a0 | |||
| 9273754222 | |||
| f4e8154076 | |||
| a663f6ae64 | |||
| a4fc21895e | |||
| a3e8611da5 | |||
| 7c2bdb83dc | |||
| 9932ed6a83 | |||
| 2d631d28c6 | |||
| b368382964 | |||
| a806c14cc7 | |||
| 181bf5bbde | |||
| cbd5e07a51 | |||
| 63b22e0dbb | |||
| 5980604c44 | |||
| 361a7463d3 | |||
| 720af6ab79 | |||
| 55cba4a05c | |||
| c7abff2990 | |||
| 71b1c8b667 | |||
| 8fb7b2fab9 | |||
| be7b55a83d | |||
| 315b860abe | |||
| 87c41c26ad | |||
| 65d2cf9511 | |||
| d63cd9ff10 | |||
| 66a168a197 | |||
| a99564ac5b | |||
| 4c5f632165 | |||
| b853540388 | |||
| 56ed7609a9 | |||
| 29c9cb8007 | |||
| 83f478bb19 | |||
| 269c4db0a4 | |||
| 52efc34ebf | |||
| d95d0f4b98 | |||
| 0402428200 | |||
| 17af6aa0da | |||
| fc168c33f3 | |||
| acc78aeb88 | |||
| 0f67d4d962 | |||
| 7e1d697b56 | |||
| 699d62e6cf | |||
| cd390b609d | |||
| 2080b05099 | |||
| 6454afec90 | |||
| 41a62564a7 | |||
| 284cc92275 | |||
| 435be10db9 | |||
| b7030d962b | |||
| 3567816932 | |||
| e0ef8a2920 | |||
| 42efe609ba | |||
| 88d3141ec6 | |||
| 09a6a49eaf | |||
| 074475541a | |||
| d4c574c39f | |||
| c528b9006a | |||
| 85fee74b33 | |||
| 8dbe0c527f | |||
| 5cc6bddb6e | |||
| 1f9460c4c1 | |||
| 70022ffc00 | |||
| f417746ad7 | |||
| 0552cfb195 | |||
| 51dd14ac2b | |||
| dbfbf9f324 | |||
| ca76486a16 | |||
| a9f55dc588 | |||
| 81d5bb765a | |||
| 0825197bee | |||
| 9ef3d5b875 | |||
| 295c7f0267 | |||
| 3fa2c12185 | |||
| fe2016de2d | |||
| 237cf6d32a | |||
| faee3ccdc2 | |||
| 570c3e1cd4 | |||
| 3a4255c7c4 | |||
| 61089465a6 | |||
| 88afa11010 | |||
| d00ce29d89 | |||
| 3b7bdf983b | |||
| 50b788a17a | |||
| fc059c7061 | |||
| bfb240cc49 | |||
| e255d92990 | |||
| 3729ed00ba | |||
| 6644796bf4 | |||
| ff93cc8c84 | |||
| 243ed7d32e | |||
| 7e0941055f | |||
| 6738e4a093 | |||
| 2566dca2a9 | |||
| b4fda58a2d | |||
| a0003b56b0 | |||
| 5beacce2ea | |||
| 8669c69afa | |||
| 1651003c35 | |||
| 1cb8c6c5fe | |||
| e05a6754a8 | |||
| 084a9dae80 | |||
| c9461e05a4 | |||
| 4dfdb821c8 | |||
| 58fab50d82 | |||
| db6f28d898 | |||
| 14e2f1231e | |||
| 7c4767f1eb | |||
| 9771e0b432 | |||
| 980de31ca0 | |||
| 1c160841ea | |||
| 4ca13a8667 | |||
| 675aa2ec64 | |||
| 3ae082c373 | |||
| 49c00fe304 | |||
| 141d3b9fc5 | |||
| abf3db40ef | |||
| 8e4ca4d14e | |||
| 1a0f4defb7 | |||
| 843af7f7fc | |||
| 1f633b8632 | |||
| a4c29e6e82 | |||
| 8f18feb191 | |||
| ed540d6d4c | |||
| f6027b2855 | |||
| ab3e80042e | |||
| ceacedc1f9 | |||
| bfa59be8f1 | |||
| 265ecb05fb | |||
| 09a7e6f617 | |||
| 6c2eef5a5d | |||
| 19748806f0 | |||
| 4a8a567e16 | |||
| 344a0017c0 | |||
| becb7de40b | |||
| 250fb1b8ea | |||
| 647214f3d5 | |||
| ddeec11ba9 | |||
| 86ed77022d | |||
| aa1356ec53 | |||
| ecc3c0940a | |||
| ba09652de2 | |||
| bd66b8529b | |||
| 6c728f7771 | |||
| 80e9452984 | |||
| c3a2c6ac5f | |||
| 72f431e709 | |||
| be4445072c | |||
| f381cf2302 | |||
| 5ff5d94e77 | |||
| f95da13c3d | |||
| aef368aa08 | |||
| 5f6cbf60d6 | |||
| 3ada34f9cb | |||
| 0eb8f2b880 | |||
| 163965d183 | |||
| a03cf9bc70 | |||
| 352c0c8a28 | |||
| bfe0b4bd2a | |||
| 58fbbcb2f5 | |||
| 87778d5f00 | |||
| f9e7ad5400 | |||
| 4d0f266113 | |||
| e93ff6c8b9 | |||
| 1c691f4a71 | |||
| 9fce7bee74 | |||
| b63f2143f8 | |||
| f32bf7582e | |||
| 8a81d776ce | |||
| f6fdacd82c | |||
| d31f7844f8 | |||
| 7a6c8c3fa1 | |||
| 221bf72577 | |||
| b3aba04e5a | |||
| 8a297115e2 | |||
| 191eed0bb9 | |||
| fb860670da | |||
| 83e760c57d | |||
| c2bba69065 | |||
| e133d6d218 | |||
| a1946c9f61 | |||
| 9f020f4f31 | |||
| 3b45075206 | |||
| 168e578efc | |||
| 6ac5e06f7c | |||
| 5c2acb270a | |||
| b26b70bec4 | |||
| ab4be40fc5 | |||
| 245e4f2c01 | |||
| 1d165d6d85 | |||
| 83004020fd | |||
| 12e21701e7 | |||
| 30a33b92ee | |||
| 7c572544e4 | |||
| c312320764 | |||
| c981f0ea78 | |||
| 6367bde739 | |||
| f50cc221ea | |||
| acedc74b1a | |||
| d29483b58a | |||
| 950cf9e58e | |||
| 3125d79950 | |||
| e33ee23ee3 | |||
| b10c64c834 | |||
| 0925b28a8e | |||
| 99722d5f0e | |||
| 4c91a28e30 | |||
| b038d9c40c | |||
| 2ba60ec7fe | |||
| bd7157a071 | |||
| be429d0cfd | |||
| c253745eb8 | |||
| daec4d2624 | |||
| 6c9fdbf725 | |||
| 483ea64611 | |||
| e20eba753b | |||
| bbc1b29665 | |||
| acb1bfa601 | |||
| 75c7ad9918 | |||
| 5550ff9c25 | |||
| 3aeb19a39e | |||
| 8c017b3490 | |||
| 9c2c2287a0 | |||
| fec2b341ad | |||
| 87bc0c492f | |||
| fe3b9372ad | |||
| bde9e2272a | |||
| 08405609cc | |||
| ab81379ea6 | |||
| 4ffd6e8942 | |||
| 965c5f4914 | |||
| 4d055ef465 | |||
| 17c540a993 | |||
| 4d4d6bad19 | |||
| 11ae016bd7 | |||
| 41d3071918 | |||
| fb5e10d3fb | |||
| b2f78cbad4 | |||
| 23583ee28c | |||
| 01c977e96d | |||
| b3dda72c23 | |||
| fb0571b077 | |||
| 2ed8b6b3d0 | |||
| 013abde6ef | |||
| a5464dcf92 | |||
| ac3ed5a815 | |||
| e6ba2000ae | |||
| aa255ff55a | |||
| 7bb736d00e | |||
| 9f4e30904b | |||
| 5afd3276df | |||
| 43721bc67f | |||
| 02d709a6f1 | |||
| 4a510ab487 | |||
| 314fa8abbf | |||
| 334535b6fb | |||
| dcbb3f1871 | |||
| 00417f4e44 | |||
| ed344f4116 | |||
| e51928793e | |||
| d2740fafbf | |||
| 17838e50ef | |||
| 44c8555621 | |||
| f7d318de2b | |||
| 76f0d05bc6 | |||
| 7d8975de84 | |||
| 785d8b6410 | |||
| f6cdc9a02f | |||
| c72d44ba4a | |||
| c292032b44 | |||
| b286fba2bb | |||
| 509cdc0370 | |||
| 9b6504c307 | |||
| e19b16dde6 | |||
| 582f2c6be7 | |||
| f8a0acbdbe | |||
| 1317034379 | |||
| 0ecc553ee6 | |||
| f96bc3649c | |||
| 938c43ea7f | |||
| 0a9ef0cfce | |||
| e5b438a247 | |||
| 0b99f5d302 | |||
| 1f491aa0c8 | |||
| de92d916fe | |||
| a1063628a4 | |||
| d796375258 | |||
| 14f8456344 | |||
| 4794c2bd92 | |||
| d3cbaa08dc | |||
| 828523ad8e | |||
| 136a17fe6e | |||
| f57438338d | |||
| 5d598680e3 | |||
| 8f4b313c37 | |||
| f93e348010 | |||
| f54f85129e | |||
| d4d1a6024f | |||
| db1764e4e0 | |||
| 7f83b4ee8e | |||
| 5c3bae1a6a | |||
| 5210dc3940 | |||
| 650b51f9f9 | |||
| 6256697997 | |||
| 71557a5f7c | |||
| f3c378ffa7 | |||
| f5ed68ef63 | |||
| efdef57b1f | |||
| b8a4572157 | |||
| 302ef403a2 | |||
| 8865da157b | |||
| f0862eae43 | |||
| 8c851f6d04 | |||
| 7cfa420f49 | |||
| a27b288e4a | |||
| e471d7ca7e | |||
| c43ca8259e | |||
| 85a65e7f51 | |||
| a2986b3e33 | |||
| 96b9aa5aa0 | |||
| e66d787bce | |||
| bfad142e25 | |||
| 9354660036 | |||
| 07ca70af8d | |||
| 2dcd12d357 | |||
| 579d2e5458 | |||
| 0512c04aee | |||
| 7e0ef4084a | |||
| 4aed506b65 | |||
| a86b4c58e8 | |||
| ff4810ba73 | |||
| 9d6964926e | |||
| 0e65818910 | |||
| 380f17527c | |||
| b92ab3deda | |||
| acaa2c0a4a | |||
| 82af928c41 | |||
| 87efc681db | |||
| c3a722fcb2 | |||
| aba48f7db1 | |||
| 04b5f9802d | |||
| efc8f7d814 | |||
| 6d87a2838c | |||
| e6cdbd6792 | |||
| df850c4912 | |||
| 720394de43 | |||
| 88a49745af | |||
| ca683a2a72 | |||
| e9f1b8c9e9 | |||
| ea97940d6c | |||
| fdd32750f0 | |||
| c715ba3735 | |||
| 9c4cb68339 | |||
| 780eb03d9b | |||
| ef9676a1f1 | |||
| 70b1b330e1 | |||
| d1d063a588 | |||
| 7e6edb1469 | |||
| 74704d4553 | |||
| d2f816d6ff | |||
| 577d498212 | |||
| fd85c9f426 | |||
| d32c611f45 | |||
| 01ad27faff | |||
| 481545b397 | |||
| d3cc8427c0 | |||
| 4821ac1b4d | |||
| 4497c8f821 | |||
| 2e36cdbe2b | |||
| fe3edb4cf0 | |||
| 29350922c6 | |||
| 8ae169286f | |||
| 8a0af6a561 | |||
| cfded80793 | |||
| b59dd19b55 | |||
| 3e051bda82 | |||
| 8317f72354 | |||
| d8bebb008a | |||
| 35bc22f23c | |||
| fa96fb9c70 | |||
| e3fdb627d9 | |||
| 7200a21cd1 | |||
| 577c72a227 | |||
| 314285d4f2 | |||
| d2a7938582 | |||
| 89342ce4c0 | |||
| f89f599395 | |||
| e251e457c5 | |||
| afc47e4de7 | |||
| e3b90c1ba2 | |||
| 134f70b3ed | |||
| a1b2d658ee | |||
| 5c7fe25491 | |||
| 53c9a7cee2 | |||
| 0d21b9b51e | |||
| 10214b6935 | |||
| 4a61950f4d | |||
| 3263799056 | |||
| 8e67b2557a | |||
| 4073c82c4e | |||
| 767c3ab869 | |||
| 4f207c7174 | |||
| 782505ed8e | |||
| 98f30b8cba | |||
| 3cd36660f7 | |||
| 46ad73955a | |||
| 41f3884438 | |||
| 60e419c1ee | |||
| 7ef6052804 | |||
| 4fca1a1bd2 | |||
| a6049be73c | |||
| 18ed7746ea | |||
| 8fcaaf6a16 | |||
| 9bb38130cb | |||
| b91d8db873 | |||
| 045b396d09 | |||
| 76852017ea | |||
| 82e64c7a20 | |||
| 4ca204055e | |||
| c5c8f5ea59 | |||
| 01653a917b | |||
| 0cd103e7cb | |||
| 5be7ca1b99 | |||
| f0a30a067b | |||
| 9d6cff3ede | |||
| a25f2adee9 | |||
| d0bed837ac | |||
| f7ee69868a | |||
| d2a71530c1 | |||
| 086609de64 | |||
| 727144bed1 | |||
| 55392bc879 | |||
| ddaff2938e | |||
| 27ed39a347 | |||
| 8f8474fbe3 | |||
| be067861c6 | |||
| 5bc26c438d | |||
| eef921f45e | |||
| e317414ce1 | |||
| 949cb0170d | |||
| e94cfd51da | |||
| 7c12763b24 | |||
| 3b780a4bbb | |||
| 30f78af147 | |||
| 19a9b169bf | |||
| 96ad65b7fe | |||
| 8d2b8c0ff2 | |||
| b2155ed317 | |||
| 910abdbd08 | |||
| cddce79fda | |||
| e519281920 | |||
| 7b03584de8 | |||
| ae9d0e7da5 | |||
| 0e67102d93 | |||
| f4ba2061cf | |||
| 1e6848a65d | |||
| 67661375fa | |||
| 213b64452a | |||
| 784c231151 | |||
| 606b00e80f | |||
| 720d3cd0f0 | |||
| ab196edefb | |||
| 3ee202ea1e | |||
| ad430a67ca | |||
| 6f0f570c43 | |||
| b545a0b207 | |||
| 29255cfc3b | |||
| da4455609d | |||
| aafb99a4d4 | |||
| 757fa4a4da | |||
| c6187f55f7 | |||
| 8983e0216f | |||
| 1ee35382cb | |||
| 6e783bc54b | |||
| c9d33c60dc | |||
| 2e54db4d2b | |||
| 44f633dba1 | |||
| a462331e36 | |||
| 4069db3f2e | |||
| 0d37450eb7 | |||
| 47e66c24e2 | |||
| 3b736e1c38 | |||
| 2c1c7dfb35 | |||
| e246ad6f0c | |||
| 5728da11ea | |||
| 92be3f3517 | |||
| d1ddf340c8 | |||
| ec10fd0abc | |||
| 0426e3c5e1 | |||
| 4bdf7ac593 | |||
| dc7976dd9f | |||
| e4791438ed | |||
| e6e898f95d | |||
| ddcbc2f334 | |||
| a83ff278d6 | |||
| cf4cd6c24f | |||
| b960441812 | |||
| 1317028aa8 | |||
| 5e49c3e777 | |||
| 0d7c3cb51d | |||
| 1b2c440cd6 | |||
| 0f29dca988 | |||
| d24cf322e1 | |||
| d17f0fbf30 | |||
| 43ab8cfaa5 | |||
| de253d63b7 | |||
| 8bd696fa53 | |||
| bb6d8c21f9 | |||
| ebf6ef1a9b | |||
| 0c52d6ef81 | |||
| 467a4f98f1 | |||
| e614ab7806 | |||
| 2a03f93de9 | |||
| da364615fc | |||
| f08919b7d1 | |||
| 93f2c0aa08 | |||
| 4ebc9108a7 | |||
| e1ba235668 | |||
| b82f4307c9 | |||
| 76879cc160 | |||
| b25d7b5657 | |||
| e09d1753ec | |||
| 4ba8875749 | |||
| 6273fe8d3d | |||
| 9fb3ae4e6f | |||
| 76afe4edf8 | |||
| c1b06fc182 | |||
| 241b4cfe66 | |||
| 9fc983c707 | |||
| 2f99f2f506 | |||
| 338b1bf04f | |||
| e39dc46f8f | |||
| 10c75b5439 | |||
| f9582fd8f4 | |||
| f377333bd7 | |||
| f8607863d8 | |||
| 335b28f7d1 | |||
| 5e65d6b2ad | |||
| 0d4f48fa10 | |||
| 127c8b782a | |||
| cd9890544b | |||
| 067da2d1df | |||
| 046118b938 | |||
| b32260ab85 | |||
| f80e7866c0 | |||
| 31a4b3e6c4 | |||
| caf8b1c084 | |||
| 1b86bd8e18 | |||
| 59012df99b | |||
| 3d1f67616d | |||
| 6ebaf43ee4 | |||
| 0c824fc46f | |||
| eb577e4655 | |||
| 8f36850f73 | |||
| 29fd2662ba | |||
| 30a3e5af69 | |||
| a38c1bfe09 | |||
| 320feae6f5 | |||
| 1e4ecca1d0 | |||
| c0a7b89d8e | |||
| 6f59beaf0b | |||
| 41f1cf38f2 | |||
| 08d26a1b7e | |||
| 63773a6200 | |||
| 883b42896a | |||
| e1098ced95 | |||
| d100d78eb3 | |||
| 7e4cd070b0 | |||
| 46b0779996 | |||
| de342585ff | |||
| 185d8ed44f | |||
| d9836d4517 | |||
| 5f7e8a916a | |||
| 4dbdf4a294 | |||
| c6873c4e6d | |||
| 2111b4643c | |||
| c50901f3b9 | |||
| 8229280a9c | |||
| f77df94647 | |||
| f231e5bc21 | |||
| 2161efe978 | |||
| f23b4c04fd | |||
| 93540958b8 | |||
| 44b9af5bb2 | |||
| 7cd95dc8a3 | |||
| c02058c222 | |||
| b2ea5ba677 | |||
| 824a3f403f | |||
| 05f6846ede | |||
| 20db99cc69 | |||
| 6431be808f | |||
| 4727a8afa7 |
```diff
@@ -5,11 +5,11 @@ import os
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
 # Note that we have 800 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
 
 
 def print_top_10_largest_files(zip_file):
```
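The only functional change above is the wheel-size budget rising from 450 MiB to 500 MiB. A minimal sketch of how such a gate behaves (the helper below is an illustration, not the script's actual interface):

```python
import os

# Default mirrors the new value from the diff; CI can override it via the env.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))


def exceeds_budget(wheel_path: str) -> bool:
    """Return True when the built wheel is over the size budget."""
    size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
    print(f"{wheel_path}: {size_mb:.1f} MiB (limit {VLLM_MAX_SIZE_MB} MiB)")
    return size_mb > VLLM_MAX_SIZE_MB
```

Per the comments, the PyPI quota is 800 MiB, so this value and its copy in the Dockerfile have to move together.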
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml (new file)
```diff
@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
+model_name: "HandH1998/QQQ-Llama-3-8b-g128"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.419
+  - name: "exact_match,flexible-extract"
+    value: 0.416
+limit: 1000
+num_fewshot: 5
```
```diff
@@ -0,0 +1,12 @@
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
+    value: 0.80
+limit: 100
+num_fewshot: 0
```
```diff
@@ -0,0 +1,10 @@
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+tasks:
+- name: "mmlu_pro"
+  metrics:
+  - name: "exact_match,custom-extract"
+    value: 0.80
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+num_fewshot: 5
```
```diff
@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
+# For vllm script, with -t option (tensor parallel size)
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"
```
```diff
@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
+
+model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    value: 0.855
+limit: 2500
+num_fewshot: 0
```
.buildkite/lm-eval-harness/configs/models-large-h100.txt (new file)
```diff
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
```

```diff
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
```

.buildkite/lm-eval-harness/configs/models-mm-small.txt (new file)
```diff
@@ -0,0 +1 @@
+Qwen2.5-VL-7B-Instruct.yaml
```
.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh (new executable file)
```diff
@@ -0,0 +1,44 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on chartqa for vllm.
+#
+# Make sure you have lm-eval-harness installed:
+# pip install lm-eval==0.4.9
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on ChartQA using multimodal vllm."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our correctness tests in vllm's CI."
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -l    - limit number of samples to run"
+    echo "  -t    - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:l:t:" OPT; do
+  case ${OPT} in
+    m )
+        MODEL="$OPTARG"
+        ;;
+    l )
+        LIMIT="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model vllm-vlm \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
+  --tasks chartqa \
+  --batch_size auto \
+  --apply_chat_template \
+  --limit $LIMIT
```
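The wrapper above reduces to a single lm-eval call. A rough Python-API equivalent, assuming lm-eval 0.4.9 and borrowing the model and limit from the Qwen2.5-VL-7B ChartQA config:

```python
import lm_eval

# Mirrors: lm_eval --model vllm-vlm --tasks chartqa --batch_size auto \
#          --apply_chat_template --limit 2500
results = lm_eval.simple_evaluate(
    model="vllm-vlm",
    model_args="pretrained=Qwen/Qwen2.5-VL-7B-Instruct,tensor_parallel_size=1",
    tasks=["chartqa"],
    batch_size="auto",
    apply_chat_template=True,
    limit=2500,
)
# Baseline per the config above: relaxed_accuracy,none = 0.855
print(results["results"]["chartqa"])
```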
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh (mode change: Normal file → Executable file, 0 lines changed)

```diff
@@ -0,0 +1,50 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
+# We use this for fp8, which HF does not support.
+#
+# Make sure you have lm-eval-harness installed:
+# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our automated nm-test-accuracy workflow"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -l    - limit number of samples to run"
+    echo "  -f    - number of fewshot samples to use"
+    echo "  -t    - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:b:l:f:t:" OPT; do
+  case ${OPT} in
+    m )
+        MODEL="$OPTARG"
+        ;;
+    b )
+        BATCH_SIZE="$OPTARG"
+        ;;
+    l )
+        LIMIT="$OPTARG"
+        ;;
+    f )
+        FEWSHOT="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model vllm \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
+  --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+  --batch_size auto
```
```diff
@@ -19,21 +19,27 @@ RTOL = 0.08
 def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get("trust_remote_code", False)
     max_model_len = eval_config.get("max_model_len", 4096)
+    batch_size = eval_config.get("batch_size", "auto")
+    backend = eval_config.get("backend", "vllm")
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
         f"enforce_eager=true,"
         f"add_bos_token=true,"
         f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len}"
+        f"max_model_len={max_model_len},"
     )
     results = lm_eval.simple_evaluate(
-        model="vllm",
+        model=backend,
         model_args=model_args,
         tasks=[task["name"] for task in eval_config["tasks"]],
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
-        batch_size="auto",
+        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+        # text models. however, this is regressing measured strict-match for
+        # existing text models in CI, so only apply it for mm.
+        apply_chat_template=backend == "vllm-vlm",
+        batch_size=batch_size,
     )
     return results
 
```
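RTOL = 0.08 in the hunk header is the relative tolerance the surrounding test applies when it compares measured metrics against the YAML baselines. A sketch of that comparison, assuming an isclose-style check over the config structure shown earlier:

```python
import numpy as np

RTOL = 0.08  # from the hunk header above


def check_metrics(results: dict, eval_config: dict) -> None:
    # Walk the tasks/metrics structure used by the YAML configs and assert
    # each measured value is within 8% (relative) of the recorded baseline.
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured = results["results"][task["name"]][metric["name"]]
            assert np.isclose(ground_truth, measured, rtol=RTOL), (
                f"{task['name']}/{metric['name']}: "
                f"measured {measured}, expected {ground_truth}"
            )
```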
```diff
@@ -7,6 +7,7 @@ from importlib import util
 
 import pandas as pd
 
+pd.options.display.float_format = "{:.2f}".format
 plotly_found = util.find_spec("plotly.express") is not None
 
 
```
```diff
@@ -109,7 +110,10 @@ def compare_data_columns(
         if len(compare_frames) >= 2:
             base = compare_frames[0]
             current = compare_frames[-1]
-            ratio = current / base
+            if "P99" in data_column or "Median" in data_column:
+                ratio = base / current  # for latency
+            else:
+                ratio = current / base
             ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
             ratio.name = f"Ratio 1 vs {len(compare_frames)}"
             frames.append(ratio)
```
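The new branch inverts the ratio for latency-style columns so that values above 1.0 always read as an improvement, whether the metric is higher-better or lower-better. A small worked example:

```python
# Throughput: higher is better, so current/base > 1 means faster.
base_tput, cur_tput = 100.0, 120.0  # tok/s
print(cur_tput / base_tput)         # 1.20 -> 20% more throughput

# P99/Median latency: lower is better, so the ratio is inverted.
base_p99, cur_p99 = 250.0, 200.0    # ms
print(base_p99 / cur_p99)           # 1.25 -> 25% lower tail latency
```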
```diff
@@ -199,6 +203,71 @@ def split_json_by_tp_pp(
     return saved_paths
 
 
+def _add_limit_line(fig, y_value, label):
+    # Visible dashed line + annotation
+    fig.add_hline(
+        y=y_value,
+        line_dash="dash",
+        line_color="red" if "ttft" in label.lower() else "blue",
+        annotation_text=f"{label}: {y_value} ms",
+        annotation_position="top left",
+    )
+    # Optional: add a legend item (as a transparent helper trace)
+    if plot and plotly_found:
+        import plotly.graph_objects as go
+
+        fig.add_trace(
+            go.Scatter(
+                x=[None],
+                y=[None],
+                mode="lines",
+                line=dict(
+                    dash="dash", color="red" if "ttft" in label.lower() else "blue"
+                ),
+                name=f"{label}",
+            )
+        )
+
+
+def _find_concurrency_col(df: pd.DataFrame) -> str:
+    for c in [
+        "# of max concurrency.",
+        "# of max concurrency",
+        "Max Concurrency",
+        "max_concurrency",
+        "Concurrency",
+    ]:
+        if c in df.columns:
+            return c
+    # Fallback: guess an integer-like column (harmless if unused)
+    for c in df.columns:
+        if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
+            return c
+    return "# of max concurrency."
+
+
+def _highlight_threshold(
+    df: pd.DataFrame, threshold: float
+) -> "pd.io.formats.style.Styler":
+    """Highlight numeric per-configuration columns with value <= threshold."""
+    conc_col = _find_concurrency_col(df)
+    key_cols = [
+        c
+        for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col]
+        if c in df.columns
+    ]
+    conf_cols = [
+        c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
+    ]
+    conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
+    return df.style.map(
+        lambda v: "background-color:#e6ffe6;font-weight:bold;"
+        if pd.notna(v) and v <= threshold
+        else "",
+        subset=conf_cols,
+    )
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
```
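A quick sketch of how the new helpers compose; the DataFrame shape is assumed from the column names the functions probe for:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "Model": ["llama8B"] * 3,
        "# of max concurrency.": [12, 32, 64],
        "cfg_a P99 TTFT (ms)": [1800.0, 2900.0, 4100.0],  # hypothetical column
    }
)

# Cells at or under the 3000 ms TTFT budget come back highlighted in green.
styler = _highlight_threshold(df, 3000.0)
html = styler.to_html(table_attributes='border="1" class="dataframe"')
```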
```diff
@@ -220,6 +289,26 @@ if __name__ == "__main__":
         default="# of max concurrency.",
         help="column name to use as X Axis in comparison graph",
     )
+    parser.add_argument(
+        "-l",
+        "--latency",
+        type=str,
+        default="p99",
+        help="take median|p99 for latency like TTFT/TPOT",
+    )
+    parser.add_argument(
+        "--ttft-max-ms",
+        type=float,
+        default=3000.0,
+        help="Reference limit for TTFT plots (ms)",
+    )
+    parser.add_argument(
+        "--tpot-max-ms",
+        type=float,
+        default=100.0,
+        help="Reference limit for TPOT plots (ms)",
+    )
+
     args = parser.parse_args()
 
     drop_column = "P99"
```
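The new knobs slot into the existing CLI. A hypothetical invocation (only --latency, --ttft-max-ms, and --tpot-max-ms are taken from the diff; the input-file flag name is assumed):

```python
import subprocess

subprocess.run(
    [
        "python",
        "compare-json-results.py",
        "-f", "benchmark_results.json",  # assumed flag and file name
        "--latency", "p99",              # or "median"
        "--ttft-max-ms", "3000",
        "--tpot-max-ms", "100",
    ],
    check=True,
)
```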
```diff
@@ -234,12 +323,22 @@ if __name__ == "__main__":
         "# of max concurrency.",
         "qps",
     ]
-    data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
-    html_msgs_for_data_cols = [
-        "Compare Output Tokens /n",
-        "Median TTFT /n",
-        "Median TPOT /n",
-    ]
+
+    if "median" in args.latency:
+        data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
+        html_msgs_for_data_cols = [
+            "Compare Output Tokens /n",
+            "Median TTFT /n",
+            "Median TPOT /n",
+        ]
+        drop_column = "P99"
+    elif "p99" in args.latency:
+        data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
+        html_msgs_for_data_cols = [
+            "Compare Output Tokens /n",
+            "P99 TTFT /n",
+            "P99 TPOT /n",
+        ]
 
     if len(args.file) == 1:
         files = split_json_by_tp_pp(args.file[0], output_root="splits")
```
```diff
@@ -275,33 +374,83 @@ if __name__ == "__main__":
                 f"Expected subset: {filtered_info_cols}, "
                 f"but DataFrame has: {list(output_df.columns)}"
             )
-        output_df_sorted = output_df.sort_values(by=existing_group_cols)
+        # output_df_sorted = output_df.sort_values(by=existing_group_cols)
+        output_df_sorted = output_df.sort_values(by=args.xaxis)
         output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
         for name, group in output_groups:
-            html = group.to_html()
+            group_name = (
+                ",".join(map(str, name)).replace(",", "_").replace("/", "-")
+            )
+            group_html_name = "perf_comparison_" + group_name + ".html"
+
+            metric_name = str(data_cols_to_compare[i]).lower()
+            if "tok/s" in metric_name:
+                html = group.to_html()
+            elif "ttft" in metric_name:
+                styler = _highlight_threshold(group, args.ttft_max_ms).format(
+                    {c: "{:.2f}" for c in group.select_dtypes("number").columns},
+                    na_rep="—",
+                )
+                html = styler.to_html(
+                    table_attributes='border="1" class="dataframe"'
+                )
+            elif (
+                "tpot" in metric_name
+                or "median" in metric_name
+                or "p99" in metric_name
+            ):
+                styler = _highlight_threshold(group, args.tpot_max_ms).format(
+                    {c: "{:.2f}" for c in group.select_dtypes("number").columns},
+                    na_rep="—",
+                )
+                html = styler.to_html(
+                    table_attributes='border="1" class="dataframe"'
+                )
+
             text_file.write(html_msgs_for_data_cols[i])
             text_file.write(html)
+            with open(group_html_name, "a+") as sub_text_file:
+                sub_text_file.write(html_msgs_for_data_cols[i])
+                sub_text_file.write(html)
 
                 if plot and plotly_found:
                     import plotly.express as px
+
                     df = group[raw_data_cols]
                     df_sorted = df.sort_values(by=info_cols[y_axis_index])
                     # Melt DataFrame for plotting
                     df_melted = df_sorted.melt(
                         id_vars=info_cols[y_axis_index],
                         var_name="Configuration",
                         value_name=data_cols_to_compare[i],
                     )
-                    title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
+                    title = (
+                        data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
+                    )
                     # Create Plotly line chart
                     fig = px.line(
                         df_melted,
                         x=info_cols[y_axis_index],
                         y=data_cols_to_compare[i],
                         color="Configuration",
                         title=title,
                         markers=True,
                     )
-                    # Export to HTML
-                    text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
+
+                    # ---- Add threshold lines based on metric name ----
+                    if "ttft" in metric_name:
+                        _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
+                    elif (
+                        "tpot" in metric_name
+                        or "median" in metric_name
+                        or "p99" in metric_name
+                    ):
+                        _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
+
+                    # Export to HTML
+                    text_file.write(
+                        fig.to_html(full_html=True, include_plotlyjs="cdn")
+                    )
+                    sub_text_file.write(
+                        fig.to_html(full_html=True, include_plotlyjs="cdn")
+                    )
```
```diff
@@ -63,9 +63,11 @@ serving_column_mapping = {
     "mean_ttft_ms": "Mean TTFT (ms)",
     "median_ttft_ms": "Median TTFT (ms)",
     "p99_ttft_ms": "P99 TTFT (ms)",
+    "std_ttft_ms": "STD TTFT (ms)",
     "mean_tpot_ms": "Mean TPOT (ms)",
     "median_tpot_ms": "Median",
     "p99_tpot_ms": "P99",
+    "std_tpot_ms": "STD TPOT (ms)",
     "mean_itl_ms": "Mean ITL (ms)",
     "median_itl_ms": "Median ITL (ms)",
     "p99_itl_ms": "P99 ITL (ms)",
```

```diff
@@ -368,7 +370,7 @@ if __name__ == "__main__":
     # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
     # we want to turn it into "8xGPUTYPE"
     df["GPU"] = df["GPU"].apply(
-        lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
+        lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0])
     )
 
     # get markdown tables
```
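The rewritten lambda is equivalent to the old one on typical input and only diverges when the GPU string carries a trailing newline; a quick check:

```python
x = "H100\nH100\nH100\nH100"
print(f"{len(x.splitlines())}x{x.splitlines()[0]}")          # old form: 4xH100
print("{}x{}".format(len(x.split("\n")), x.split("\n")[0]))  # new form: 4xH100

# With a trailing newline the two disagree: split("\n") keeps the empty
# trailing field, while splitlines() drops it.
y = "H100\nH100\n"
print(len(y.splitlines()), len(y.split("\n")))               # 2 3
```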
```diff
@@ -454,11 +454,6 @@ main() {
     fi
     check_hf_token
 
-    # Set to v1 to run v1 benchmark
-    if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
-        export VLLM_USE_V1=1
-    fi
-
     # dependencies
     (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
     (which jq) || (apt-get update && apt-get -y install jq)
```

```diff
@@ -476,6 +471,11 @@ main() {
     mkdir -p $RESULTS_FOLDER
     QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
 
+    # dump vllm info via vllm collect-env
+    env_output=$(vllm collect-env)
+
+    echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
+
     # benchmarking
     run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
     run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
```
```diff
@@ -1,28 +1,24 @@
 [
     {
-        "test_name": "latency_llama8B_tp1",
+        "test_name": "latency_llama8B_tp2",
         "environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
             "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
             "VLLM_CPU_KVCACHE_SPACE": 40
         },
         "parameters": {
             "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "num_iters_warmup": 5,
-            "num_iters": 15
-        }
-    },
-    {
-        "test_name": "latency_llama8B_tp4",
-        "environment_variables": {
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-            "load_format": "dummy",
+            "tensor_parallel_size": 2,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
             "num_iters_warmup": 5,
             "num_iters": 15
         }
```
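Each entry pairs environment variables with the CLI parameters for one latency run. A rough sketch of how a runner could expand an entry into a command (the entrypoint and the underscore-to-dash flag conversion are assumptions; the real driver is the run-performance-benchmarks.sh script shown earlier):

```python
import json
import os
import subprocess

with open("latency-tests.json") as f:  # hypothetical local copy of the config
    entry = json.load(f)[0]

# Merge the test's environment on top of the current one.
env = {**os.environ, **{k: str(v) for k, v in entry["environment_variables"].items()}}

args = []
for key, value in entry["parameters"].items():
    args.append("--" + key.replace("_", "-"))
    if value != "":  # "" marks a boolean flag in these configs
        args.append(str(value))

# Assumed entrypoint; the latency benchmark lives under benchmarks/ in the repo.
subprocess.run(["python3", "benchmarks/benchmark_latency.py", *args], env=env, check=True)
```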
```diff
@@ -95,6 +95,38 @@
             "num_prompts": 200
         }
     },
+    {
+        "test_name": "serving_llama8B_bf16_tp4_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 4,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
     {
         "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
         "qps_list": ["inf"],
```

```diff
@@ -233,6 +265,41 @@
             "num_prompts": 1000
         }
     },
+    {
+        "test_name": "serving_llama8B_bf16_tp4_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 4,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128,
+            "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
     {
         "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
         "qps_list": ["inf"],
```

```diff
@@ -365,6 +432,38 @@
             "num_prompts": 200
         }
     },
+    {
+        "test_name": "serving_llama8B_int8_tp4_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 4,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
     {
         "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
         "qps_list": ["inf"],
```

```diff
@@ -503,6 +602,41 @@
             "num_prompts": 1000
         }
     },
+    {
+        "test_name": "serving_llama8B_int8_tp4_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 4,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128,
+            "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
     {
         "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
         "qps_list": ["inf"],
```

```diff
@@ -638,6 +772,39 @@
             "num_prompts": 200
         }
     },
+    {
+        "test_name": "serving_llama8B_int4_tp4_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "quantization": "awq",
+            "tensor_parallel_size": 4,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
     {
         "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
         "qps_list": ["inf"],
```

```diff
@@ -780,6 +947,42 @@
             "num_prompts": 1000
         }
     },
+    {
+        "test_name": "serving_llama8B_int4_tp4_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "quantization": "awq",
+            "tensor_parallel_size": 4,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128,
+            "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
     {
         "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
         "qps_list": ["inf"],
```
@ -2,7 +2,7 @@
|
|||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
"max_concurrency_list": [32],
|
||||||
"server_environment_variables": {
|
"server_environment_variables": {
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
@ -28,13 +28,13 @@
|
|||||||
"backend": "vllm",
|
"backend": "vllm",
|
||||||
"dataset_name": "sharegpt",
|
"dataset_name": "sharegpt",
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
"num_prompts": 200
|
"num_prompts": 32
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
"max_concurrency_list": [32],
|
||||||
"server_environment_variables": {
|
"server_environment_variables": {
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
@ -60,13 +60,13 @@
|
|||||||
"backend": "vllm",
|
"backend": "vllm",
|
||||||
"dataset_name": "sharegpt",
|
"dataset_name": "sharegpt",
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
"num_prompts": 200
|
"num_prompts": 32
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_tp4_sharegpt",
|
"test_name": "serving_llama8B_tp1_random_128_128",
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
"max_concurrency_list": [32],
|
||||||
"server_environment_variables": {
|
"server_environment_variables": {
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
@ -76,39 +76,7 @@
|
|||||||
},
|
},
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
"tensor_parallel_size": 4,
|
"tensor_parallel_size": 1,
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp4_random_1024_128",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
"dtype": "bfloat16",
|
||||||
"distributed_executor_backend": "mp",
|
"distributed_executor_backend": "mp",
|
||||||
"block_size": 128,
|
"block_size": 128,
|
||||||
@ -124,16 +92,16 @@
|
|||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
"backend": "vllm",
|
"backend": "vllm",
|
||||||
"dataset_name": "random",
|
"dataset_name": "random",
|
||||||
"random-input-len": 1024,
|
"random-input-len": 128,
|
||||||
"random-output-len": 128,
|
"random-output-len": 128,
|
||||||
"ignore-eos": "",
|
"ignore-eos": "",
|
||||||
"num_prompts": 100
|
"num_prompts": 32
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_pp6_random_1024_128",
|
"test_name": "serving_llama8B_tp2_random_128_128",
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
"max_concurrency_list": [32],
|
||||||
"server_environment_variables": {
|
"server_environment_variables": {
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
@ -143,7 +111,7 @@
|
|||||||
},
|
},
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
"pipeline_parallel_size": 6,
|
"tensor_parallel_size": 2,
|
||||||
"dtype": "bfloat16",
|
"dtype": "bfloat16",
|
||||||
"distributed_executor_backend": "mp",
|
"distributed_executor_backend": "mp",
|
||||||
"block_size": 128,
|
"block_size": 128,
|
||||||
@ -159,10 +127,150 @@
|
|||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
"backend": "vllm",
|
"backend": "vllm",
|
||||||
"dataset_name": "random",
|
"dataset_name": "random",
|
||||||
"random-input-len": 1024,
|
"random-input-len": 128,
|
||||||
"random-output-len": 128,
|
"random-output-len": 128,
|
||||||
"ignore-eos": "",
|
"ignore-eos": "",
|
||||||
"num_prompts": 100
|
"num_prompts": 32
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_random_128_2048",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"max_concurrency_list": [32],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
            "dataset_name": "random",
            "random-input-len": 128,
            "random-output-len": 2048,
            "ignore-eos": "",
            "num_prompts": 32
        }
    },
    {
        "test_name": "serving_llama8B_tp2_random_128_2048",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_SGL_KERNEL": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "enable_chunked_prefill": "",
            "disable_log_stats": "",
            "enforce_eager": "",
            "max_num_batched_tokens": 2048,
            "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
            "random-input-len": 128,
            "random-output-len": 2048,
            "ignore-eos": "",
            "num_prompts": 32
        }
    },
    {
        "test_name": "serving_llama8B_tp1_random_2048_128",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_SGL_KERNEL": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "enable_chunked_prefill": "",
            "disable_log_stats": "",
            "enforce_eager": "",
            "max_num_batched_tokens": 2048,
            "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
            "random-input-len": 2048,
            "random-output-len": 128,
            "ignore-eos": "",
            "num_prompts": 32
        }
    },
    {
        "test_name": "serving_llama8B_tp2_random_2048_128",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_SGL_KERNEL": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "enable_chunked_prefill": "",
            "disable_log_stats": "",
            "enforce_eager": "",
            "max_num_batched_tokens": 2048,
            "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
            "random-input-len": 2048,
            "random-output-len": 128,
            "ignore-eos": "",
            "num_prompts": 32
        }
    }
]
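As a reading aid: each "client_parameters" block above is what the benchmark client consumes. A minimal sketch of the equivalent manual invocation, assuming the runner maps the JSON keys one-to-one onto the serving benchmark's CLI flags (the `vllm bench serve` entry point and exact flag names are assumptions, not shown in this diff):

    # Sketch only: flag names assumed to mirror the JSON keys of the last test above.
    vllm bench serve \
        --model meta-llama/Llama-3.1-8B-Instruct \
        --backend vllm \
        --dataset-name random \
        --random-input-len 2048 \
        --random-output-len 128 \
        --ignore-eos \
        --num-prompts 32 \
        --max-concurrency 32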
@@ -1,29 +1,24 @@
 [
     {
-        "test_name": "throughput_llama8B_tp1",
+        "test_name": "throughput_llama8B_tp2",
         "environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
             "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
             "VLLM_CPU_KVCACHE_SPACE": 40
         },
         "parameters": {
             "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "backend": "vllm"
-        }
-    },
-    {
-        "test_name": "throughput_llama8B_tp4",
-        "environment_variables": {
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-            "load_format": "dummy",
+            "tensor_parallel_size": 2,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200,
             "backend": "vllm"
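The rewritten throughput entry reads the same way; roughly the run below, assuming the "parameters" pass straight through to the throughput benchmark and the engine (sketch only, the `vllm bench throughput` entry point is an assumption):

    # Sketch only: parameters assumed to map directly to benchmark/engine flags.
    vllm bench throughput \
        --model meta-llama/Llama-3.1-8B-Instruct \
        --tensor-parallel-size 2 \
        --dtype bfloat16 \
        --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
        --num-prompts 200 \
        --backend vllm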
@@ -1,5 +1,5 @@
 steps:
-  # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+  # aarch64 + CUDA builds
   - label: "Build arm64 wheel - CUDA 12.9"
     depends_on: ~
     id: build-wheel-arm64-cuda-12-9
@@ -8,13 +8,28 @@ steps:
     commands:
       # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
+  # aarch64 build
+  - label: "Build arm64 CPU wheel"
+    depends_on: ~
+    id: build-wheel-arm64-cpu
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # x86 + CUDA builds
   - label: "Build wheel - CUDA 12.8"
     depends_on: ~
     id: build-wheel-cuda-12-8
@@ -28,33 +43,33 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
-  - label: "Build wheel - CUDA 12.6"
-    depends_on: ~
-    id: build-wheel-cuda-12-6
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  # x86 + CUDA builds
   - label: "Build wheel - CUDA 12.9"
     depends_on: ~
     id: build-wheel-cuda-12-9
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
+  - label: "Build wheel - CUDA 13.0"
+    depends_on: ~
+    id: build-wheel-cuda-13-0
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # Build release images (12.9)
   - label: "Build release image (x86)"
     depends_on: ~
     id: build-release-image-x86
@@ -62,13 +77,12 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
       # re-tag to default image tag and push, just in case arm64 build fails
       - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
-  # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
   - label: "Build release image (arm64)"
     depends_on: ~
     id: build-release-image-arm64
@@ -76,7 +90,7 @@ steps:
       queue: arm64_cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
 
       # Add job to create multi-arch manifest
@@ -142,6 +156,22 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
+  - block: "Build arm64 CPU release image"
+    key: block-arm64-cpu-release-image-build
+    depends_on: ~
+
+  - label: "Build and publish arm64 CPU release image"
+    depends_on: block-arm64-cpu-release-image-build
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
+    env:
+      DOCKER_BUILDKIT: "1"
+
   - label: "Build and publish nightly multi-arch image to DockerHub"
     depends_on:
       - create-multi-arch-manifest
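The "create multi-arch manifest" job referenced above sits outside this hunk; conceptually it stitches the per-arch tags into one, along these lines (a sketch only, not the job's actual contents; the tag suffixes are assumed from the `$(uname -m)` tagging above):

    # Sketch of a multi-arch manifest step (actual job not shown in this diff).
    docker manifest create "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" \
        "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64" \
        "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
    docker manifest push "public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"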
@@ -25,25 +25,28 @@ function cpu_tests() {
 
   # offline inference
   podman exec -it "$container_id" bash -c "
-    set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+    set -xve
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
 
   # Run basic model test
   podman exec -it "$container_id" bash -c "
-    set -e
+    set -evx
     pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
     pip install sentence-transformers datamodel_code_generator
-    pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
+    # Note: disable Bart until supports V1
+    # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
     pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-    pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
+    # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
+    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
 }
 
 # All of CPU tests are expected to be finished less than 40 mins.
 
 export container_id
 export -f cpu_tests
-timeout 40m bash -c cpu_tests
+timeout 120m bash -c cpu_tests
@@ -70,7 +70,7 @@ function cpu_tests() {
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -x -s -v \
-      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
+      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
 
   # Note: disable it until supports V1
   # Run AWQ test
@@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
 && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
 && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"
 
 echo "--- Hardware Information ---"
 # tpu-info
@@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
 && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
 && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"
 
 echo "--- Hardware Information ---"
 # tpu-info
@@ -44,6 +44,5 @@ docker run \
     pytest -v -s v1/structured_output
     pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
     pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
-    pytest -v -s v1/test_metrics
     pytest -v -s v1/test_serial_utils.py
   '
@@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
 TENSOR_PARALLEL_SIZE=1
 MAX_MODEL_LEN=2048
 DOWNLOAD_DIR=/mnt/disks/persist
-EXPECTED_THROUGHPUT=10.0
+EXPECTED_THROUGHPUT=8.7
 INPUT_LEN=1800
 OUTPUT_LEN=128
@@ -42,7 +42,7 @@ echo "lanching vllm..."
 echo "logging to $VLLM_LOG"
 echo
 
-VLLM_USE_V1=1 vllm serve $MODEL \
+vllm serve $MODEL \
   --seed 42 \
   --max-num-seqs $MAX_NUM_SEQS \
   --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
@@ -58,33 +58,25 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 
-if [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
-else
+if [[ $normal_wheel == *"cu129"* ]]; then
     # only upload index.html for cu129 wheels (default wheels) as it
     # is available on both x86 and arm64
     aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
     aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+else
+    echo "Skipping index files for non-cu129 wheels"
 fi
 
 # generate index for nightly
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
 
-if [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
-else
+if [[ $normal_wheel == *"cu129"* ]]; then
     # only upload index.html for cu129 wheels (default wheels) as it
     # is available on both x86 and arm64
     aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
+else
+    echo "Skipping index files for non-cu129 wheels"
 fi
 
 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
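With both paths now carrying the same cu129 check, the logic could be factored once. A sketch (the helper name is hypothetical, and the extra nightly-index copy on the commit path is elided for brevity):

    # Hypothetical helper consolidating the repeated check above.
    upload_index_if_default_wheel() {
        local prefix=$1
        if [[ $normal_wheel == *"cu129"* ]]; then
            # only cu129 (default) wheels are built for both x86 and arm64
            aws s3 cp index.html "s3://vllm-wheels/${prefix}/vllm/index.html"
        else
            echo "Skipping index files for non-cu129 wheels"
        fi
    }
    upload_index_if_default_wheel "$BUILDKITE_COMMIT"
    upload_index_if_default_wheel "nightly"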
1328  .buildkite/test-amd.yaml  (new file; diff suppressed because it is too large)
@@ -38,7 +38,7 @@ steps:
 - label: Pytorch Nightly Dependency Override Check # 2min
   # if this test fails, it means the nightly torch version is not compatible with some
   # of the dependencies. Please check the error message and add the package to whitelist
-  # in /vllm/tools/generate_nightly_torch_test.py
+  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
   soft_fail: true
   source_file_dependencies:
   - requirements/nightly_torch_test.txt
@@ -172,6 +172,8 @@ steps:
   - tests/v1/engine/test_engine_core_client.py
   - tests/distributed/test_symm_mem_allreduce.py
   commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
   # test with torchrun tp=2 and external_dp=2
   - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
   # test with torchrun tp=2 and pp=2
@@ -203,6 +205,24 @@ steps:
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
   - popd
 
+- label: Distributed Tests (8 GPUs) # 4min
+  timeout_in_minutes: 10
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
 - label: EPLB Algorithm Test # 5min
   timeout_in_minutes: 15
   working_dir: "/vllm-workspace/tests"
@@ -296,6 +316,7 @@ steps:
   - tests/v1
   commands:
   # split the test to avoid interference
+  - pytest -v -s -m 'not cpu_test' v1/core
   - pytest -v -s v1/executor
   - pytest -v -s v1/kv_offload
   - pytest -v -s v1/sample
@@ -310,6 +331,15 @@ steps:
   - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 
+- label: V1 Test attention (H100) # 10min
+  timeout_in_minutes: 30
+  gpu: h100
+  source_file_dependencies:
+  - vllm/v1/attention
+  - tests/v1/attention
+  commands:
+  - pytest -v -s v1/attention
+
 - label: V1 Test others (CPU) # 5 mins
   source_file_dependencies:
   - vllm/
@@ -317,7 +347,7 @@ steps:
   no_gpu: true
   commands:
   # split the test to avoid interference
-  - pytest -v -s v1/core
+  - pytest -v -s -m 'cpu_test' v1/core
   - pytest -v -s v1/structured_output
   - pytest -v -s v1/test_serial_utils.py
   - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
@@ -348,7 +378,8 @@ steps:
   - python3 offline_inference/basic/embed.py
   - python3 offline_inference/basic/score.py
   - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+  # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 
 - label: Platform Tests (CUDA) # 4min
   timeout_in_minutes: 15
@@ -383,7 +414,12 @@ steps:
       --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
       --ignore=lora/test_chatglm3_tp.py \
       --ignore=lora/test_llama_tp.py \
-      --ignore=lora/test_llm_with_multi_loras.py
+      --ignore=lora/test_llm_with_multi_loras.py \
+      --ignore=lora/test_olmoe_tp.py \
+      --ignore=lora/test_deepseekv2_tp.py \
+      --ignore=lora/test_gptoss.py \
+      --ignore=lora/test_qwen3moe_tp.py
   parallelism: 4
 
 - label: PyTorch Compilation Unit Tests # 15min
@@ -399,11 +435,10 @@ steps:
   - pytest -v -s compile/test_fusion_attn.py
   - pytest -v -s compile/test_functionalization.py
   - pytest -v -s compile/test_silu_mul_quant_fusion.py
-  - pytest -v -s compile/test_sequence_parallelism.py
-  - pytest -v -s compile/test_async_tp.py
   - pytest -v -s compile/test_fusion_all_reduce.py
   - pytest -v -s compile/test_decorator.py
   - pytest -v -s compile/test_noop_elimination.py
+  - pytest -v -s compile/test_aot_compile.py
 
 - label: PyTorch Fullgraph Smoke Test # 15min
   timeout_in_minutes: 30
@@ -416,8 +451,8 @@ steps:
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s compile/piecewise/
 
-- label: PyTorch Fullgraph Test # 20min
-  timeout_in_minutes: 30
+- label: PyTorch Fullgraph Test # 22min
+  timeout_in_minutes: 35
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
@@ -425,6 +460,19 @@ steps:
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph.py
+  - pytest -v -s compile/test_fusions_e2e.py
+
+- label: Cudagraph test
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+  - tests/v1/cudagraph
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/config/compilation.py
+  - vllm/compilation
+  commands:
+  - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+  - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
 
 - label: Kernels Core Operation Test # 48min
   timeout_in_minutes: 75
@@ -432,8 +480,9 @@ steps:
   source_file_dependencies:
   - csrc/
   - tests/kernels/core
+  - tests/kernels/test_top_k_per_row.py
   commands:
-  - pytest -v -s kernels/core
+  - pytest -v -s kernels/core kernels/test_top_k_per_row.py
 
 - label: Kernels Attention Test %N # 23min
   timeout_in_minutes: 35
@@ -467,6 +516,8 @@ steps:
   - tests/kernels/moe
   - vllm/model_executor/layers/fused_moe/
   - vllm/distributed/device_communicators/
+  - vllm/envs.py
+  - vllm/config
   commands:
   - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 2
@@ -526,8 +577,9 @@ steps:
   # since torchao nightly is only compatible with torch nightly currently
   # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
   # we can only upgrade after this is resolved
-  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
-  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
+  # TODO(jerryzh168): resolve the above comment
+  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
 - label: LM Eval Small Models # 53min
   timeout_in_minutes: 75
@@ -676,8 +728,10 @@ steps:
   - vllm/
   - tests/models/language/generation
   commands:
-  # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-  - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+  # Install fast path packages for testing against transformers
+  # Note: also needed to run plamo2 model in vLLM
+  - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+  - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
   - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
 
 - label: Language Models Test (PPL)
@@ -732,6 +786,16 @@ steps:
   - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
   - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
 
+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - vllm/multimodal/
+  - vllm/inputs/
+  - vllm/v1/core/
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
 - label: Multi-Modal Models Test (Extended) 1
   mirror_hardwares: [amdexperimental]
   optional: true
@@ -795,8 +859,8 @@ steps:
   # Whisper needs spawn method to avoid deadlock
   - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
 
-- label: Blackwell Test # 38 min
-  timeout_in_minutes: 60
+- label: Blackwell Test # 21 min
+  timeout_in_minutes: 30
   working_dir: "/vllm-workspace/"
   gpu: b200
   # optional: true
@@ -809,8 +873,6 @@ steps:
   - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
   - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/fusion.py
-  - vllm/compilation/fusion_attn.py
   commands:
   - nvidia-smi
   - python3 examples/offline_inference/basic/chat.py
@@ -827,13 +889,32 @@ steps:
   - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
   - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
   - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+  - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+  - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
   - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
-  - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
-  # Fusion
-  - pytest -v -s tests/compile/test_fusion_all_reduce.py
-  - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
+  - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
   - pytest -v -s tests/kernels/moe/test_flashinfer.py
+
+- label: Blackwell Fusion Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  commands:
+  - nvidia-smi
+  - pytest -v -s tests/compile/test_fusion_attn.py
   - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+  # this runner has 2 GPUs available even though num_gpus=2 is not set
+  - pytest -v -s tests/compile/test_fusion_all_reduce.py
+  - pytest -v -s tests/compile/test_fusions_e2e.py
 
 - label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60
@@ -867,7 +948,7 @@ steps:
   - pytest -s -v tests/quantization/test_blackwell_moe.py
 
 - label: Blackwell LM Eval Small Models
-  timeout_in_minutes: 75
+  timeout_in_minutes: 120
   gpu: b200
   optional: true # run on nightlies
   source_file_dependencies:
@@ -940,6 +1021,8 @@ steps:
   - tests/v1/shutdown
   - tests/v1/worker/test_worker_memory_snapshot.py
   commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
   - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
@@ -947,6 +1030,7 @@ steps:
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - pytest -v -s distributed/test_sequence_parallel.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
   - pytest -v -s v1/worker/test_worker_memory_snapshot.py
@@ -990,6 +1074,11 @@ steps:
   - pytest -v -s plugins_tests/test_io_processor_plugins.py
   - pip uninstall prithvi_io_processor_plugin -y
   # end io_processor plugins test
+  # begin stat_logger plugins test
+  - pip install -e ./plugins/vllm_add_dummy_stat_logger
+  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+  - pip uninstall dummy_stat_logger -y
+  # end stat_logger plugins test
   # other tests continue here:
   - pytest -v -s plugins_tests/test_scheduler_plugins.py
   - pip install -e ./plugins/vllm_add_dummy_model
@@ -1029,6 +1118,7 @@ steps:
   - pytest -v -s -x lora/test_chatglm3_tp.py
   - pytest -v -s -x lora/test_llama_tp.py
   - pytest -v -s -x lora/test_llm_with_multi_loras.py
+  - pytest -v -s -x lora/test_olmoe_tp.py
 
 - label: Weight Loading Multiple GPU Test # 33min
@@ -1055,6 +1145,17 @@ steps:
   commands:
   - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
 
+- label: NixlConnector PD accuracy tests (Distributed) # 30min
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - tests/v1/kv_connector/nixl_integration/
+  commands:
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+  - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+
 ##### multi gpus test #####
 ##### A100 test #####
@@ -1086,12 +1187,16 @@ steps:
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
 ##### H200 test #####
-- label: Distrubted Tests (H200) # optional
+- label: Distributed Tests (H200) # optional
   gpu: h200
   optional: true
   working_dir: "/vllm-workspace/"
   num_gpus: 2
   commands:
+  - pytest -v -s tests/compile/test_async_tp.py
+  - pytest -v -s tests/compile/test_sequence_parallelism.py
+  - pytest -v -s tests/compile/test_fusion_all_reduce.py
+  - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
17  .coveragerc
@@ -1,5 +1,10 @@
 [run]
-source = vllm
+# Track the installed vllm package (this is what actually gets imported during tests)
+# Use wildcard pattern to match the installed location
+source =
+    vllm
+    */dist-packages/vllm
+    */site-packages/vllm
 omit =
     */tests/*
     */test_*
@@ -12,6 +17,16 @@ omit =
     */benchmarks/*
     */docs/*
 
+[paths]
+# Map all possible vllm locations to a canonical "vllm" path
+# This ensures coverage.combine properly merges data from different test runs
+source =
+    vllm
+    /vllm-workspace/src/vllm
+    /vllm-workspace/vllm
+    */site-packages/vllm
+    */dist-packages/vllm
+
 [report]
 exclude_lines =
     pragma: no cover
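The new [paths] section takes effect at the combine step: data measured under */site-packages/vllm/... or a workspace checkout folds into one canonical vllm tree. Roughly (the shard data-file names below are illustrative, not from the diff):

    # Illustrative: shard data files are remapped via [paths] during combine.
    coverage combine .coverage.shard-a .coverage.shard-b
    coverage report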
4  .git-blame-ignore-revs  (new file)
@@ -0,0 +1,4 @@
+# Migrate from `yapf` & `isort` to `ruff`
+d6953beb91da4e9c99be4c0a1304a2d24189535c
+# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
+8fcaaf6a165e661f63fc51be906bc05b0767332f
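Note that local `git blame` only consults this file when told to (GitHub's blame view picks it up automatically by filename):

    # Make local `git blame` skip the formatting-only revisions listed above.
    git config blame.ignoreRevsFile .git-blame-ignore-revs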
22  .github/CODEOWNERS  (vendored)
@@ -5,10 +5,8 @@
 /vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/model_executor/layers/fused_moe @mgoin
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
-/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
+/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
@@ -26,9 +24,9 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
 
 # vLLM V1
-/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 /vllm/v1/attention @LucasWilkinson
-/vllm/v1/attention/backends/flashinfer.py @mgoin
+/vllm/v1/attention/backends/mla @pavanimajety
+/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
 /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
 /vllm/v1/sample @22quinn @houseroad @njhill
@@ -47,7 +45,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
-/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
+/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
@@ -60,7 +58,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/v1/offloading @ApostaC
 
 # Transformers backend
-/vllm/model_executor/models/transformers.py @hmellor
+/vllm/model_executor/models/transformers @hmellor
 /tests/models/test_transformers.py @hmellor
 
 # Docs
@@ -121,3 +119,11 @@ mkdocs.yaml @hmellor
 
 # KVConnector installation files
 /requirements/kv_connectors.txt @NickLucche
+
+# Pooling models
+/examples/*/pooling/ @noooop
+/tests/models/*/pooling* @noooop
+/tests/entrypoints/pooling @noooop
+/vllm/config/pooler.py @noooop
+/vllm/pooling_params.py @noooop
+/vllm/model_executor/layers/pooler.py @noooop
2  .github/mergify.yml  (vendored)
@@ -11,6 +11,8 @@ pull_request_rules:
     label:
       add:
         - documentation
+    comment:
+      message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
 
 - name: label-ci-build
   description: Automatically apply ci/build label
138
.github/workflows/issue_autolabel.yml
vendored
138
.github/workflows/issue_autolabel.yml
vendored
@@ -13,6 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Label issues based on keywords
+        id: label-step
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
        with:
          script: |
@@ -42,7 +43,6 @@ jobs:
                  searchIn: "body"
                },
              ],
-
              // Substring search - matches anywhere in text (partial matches)
              substrings: [
                {
@@ -89,14 +89,12 @@ jobs:
                  term: "hip_",
                  searchIn: "both"
                },
-
                // ROCm tools and libraries
                {
                  term: "hipify",
                  searchIn: "both"
                },
              ],
-
              // Regex patterns - for complex pattern matching
              regexPatterns: [
                {
@@ -107,13 +105,17 @@ jobs:
                }
              ],
            },
+           // Add more label configurations here as needed
+           // example: {
+           //   keywords: [...],
+           //   substrings: [...],
+           //   regexPatterns: [...]
+           // },
          };

          // Helper function to create regex based on search type
          function createSearchRegex(term, type) {
            // Escape special regex characters in the term
            const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

            switch (type) {
              case 'keyword':
                // Word boundary search - matches whole words only
@@ -125,16 +127,13 @@ jobs:
                throw new Error(`Unknown search type: ${type}`);
            }
          }

          // Helper function to find matching terms in text with line information
          function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
            const matches = [];
            const lines = text.split('\n');

            for (const termConfig of searchTerms) {
              let regex;
              let term, searchIn, pattern, description, flags;

              // Handle different input formats (string or object)
              if (typeof termConfig === 'string') {
                term = termConfig;
@@ -146,21 +145,17 @@ jobs:
                description = termConfig.description;
                flags = termConfig.flags;
              }

              // Skip if this term shouldn't be searched in the current location
              if (searchIn !== 'both' && searchIn !== searchLocation) {
                continue;
              }

              // Create appropriate regex
              if (searchType === 'regex') {
                regex = new RegExp(pattern, flags || "gi");
              } else {
                regex = createSearchRegex(term, searchType);
              }

              const termMatches = [];

              // Check each line for matches
              lines.forEach((line, lineIndex) => {
                const lineMatches = line.match(regex);
@@ -175,15 +170,14 @@ jobs:
                    originalTerm: term || pattern,
                    description: description,
                    // Show context around the match in the line
                    context: line.length > 100 ?
                      line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
                        line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
                      : line.trim()
                  });
                });
              }
            });

            if (termMatches.length > 0) {
              matches.push({
                term: term || (description || pattern),
@@ -196,64 +190,48 @@ jobs:
              });
            }
          }

          return matches;
          }

          // Helper function to check if label should be added
          async function processLabel(labelName, config) {
            const body = context.payload.issue.body || "";
            const title = context.payload.issue.title || "";

            core.notice(`Processing label: ${labelName}`);
            core.notice(`Issue Title: "${title}"`);
            core.notice(`Issue Body length: ${body.length} characters`);

            let shouldAddLabel = false;
            let allMatches = [];
            let reason = '';

            const keywords = config.keywords || [];
            const substrings = config.substrings || [];
            const regexPatterns = config.regexPatterns || [];

            core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);

            // Search in title
            if (title.trim()) {
              core.notice(`Searching in title: "${title}"`);

              const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
              const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
              const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');

              allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
            }

            // Search in body
            if (body.trim()) {
              core.notice(`Searching in body (${body.length} characters)`);

              const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
              const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
              const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');

              allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
            }

            if (allMatches.length > 0) {
              core.notice(`Found ${allMatches.length} matching term(s):`);

              for (const termMatch of allMatches) {
                const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
                const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;

                if (termMatch.searchType === 'regex') {
                  core.notice(`  📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                } else {
                  core.notice(`  📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                }

                // Show details for each match
                termMatch.matches.forEach((match, index) => {
                  core.notice(`    ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
@@ -266,7 +244,6 @@ jobs:
                }
              });
            }

            shouldAddLabel = true;
            const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
            const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
@@ -274,13 +251,10 @@ jobs:
            const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
            const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
            const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);

            reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
            }

            core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
            core.notice(`Reason: ${reason || 'No matching terms found'}`);

            if (shouldAddLabel) {
              const existingLabels = context.payload.issue.labels.map(l => l.name);
              if (!existingLabels.includes(labelName)) {
@@ -296,14 +270,92 @@ jobs:
              core.notice(`Label "${labelName}" already present.`);
              return false;
            }

            core.notice(`No matching terms found for label "${labelName}".`);
            return false;
          }

          // Process all configured labels
-         const processLabels = Object.entries(labelConfig)
-           .map(([labelName, config]) => processLabel(labelName, config));
-         const labelsAdded = await Promise.all(processLabels);
-         const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
-         core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
+         const labelsAddedResults = await Promise.all(
+           Object.entries(labelConfig).map(([labelName, config]) =>
+             processLabel(labelName, config).then(added => ({ labelName, added }))
+           )
+         );
+
+         const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
+         core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
+
+         // Return which labels were added for the next step
+         const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
+         core.setOutput('labels_added', JSON.stringify(addedLabels));
+         return addedLabels;
+
+     - name: CC users for labeled issues
+       if: steps.label-step.outputs.labels_added != '[]'
+       uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
+       with:
+         script: |
+           // Configuration: Map labels to GitHub users to CC
+           // You can add multiple users per label, and multiple label configurations
+           const ccConfig = {
+             rocm: {
+               users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
+               message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
+             },
+             // Add more label -> user mappings here
+             // Example:
+             // cuda: {
+             //   users: ['user1', 'user2'],
+             //   message: 'CC {users} for CUDA-related issue'
+             // },
+             // performance: {
+             //   users: ['perfexpert'],
+             //   message: 'CC {users} for performance issue'
+             // },
+           };
+
+           const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
+           core.notice(`Labels added: ${labelsAdded.join(', ')}`);
+
+           // Get existing comments to check for already mentioned users
+           const comments = await github.rest.issues.listComments({
+             owner: context.repo.owner,
+             repo: context.repo.repo,
+             issue_number: context.issue.number,
+           });
+
+           const issueBody = context.payload.issue.body || '';
+           const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
+
+           // Process each label that was added
+           for (const label of labelsAdded) {
+             if (ccConfig[label]) {
+               const config = ccConfig[label];
+               const usersToMention = [];
+
+               // Check which users haven't been mentioned yet
+               for (const user of config.users) {
+                 const mentionPattern = new RegExp(`@${user}\\b`, 'i');
+                 if (!mentionPattern.test(allExistingText)) {
+                   usersToMention.push(user);
+                 } else {
+                   core.notice(`@${user} already mentioned for label "${label}", skipping`);
+                 }
+               }
+
+               // Post comment if there are users to mention
+               if (usersToMention.length > 0) {
+                 const mentions = usersToMention.map(u => `@${u}`).join(' ');
+                 const message = config.message.replace('{users}', mentions);
+
+                 await github.rest.issues.createComment({
+                   owner: context.repo.owner,
+                   repo: context.repo.repo,
+                   issue_number: context.issue.number,
+                   body: message
+                 });
+
+                 core.notice(`CC comment added for label "${label}": ${mentions}`);
+               } else {
+                 core.notice(`All users for label "${label}" already mentioned, skipping comment`);
+               }
+             }
+           }
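The labeling script above distinguishes three search modes: `keyword` wraps the escaped term in word boundaries, `substring` matches the escaped term anywhere, and `regex` uses the configured pattern as-is (default flags `gi`). A minimal Python sketch of the same matching semantics (the workflow itself runs JavaScript under actions/github-script; the names below are illustrative):

```python
import re


def make_search_regex(term: str, mode: str) -> re.Pattern[str]:
    """Mirror the workflow's createSearchRegex: escape the term,
    then wrap it according to the search mode."""
    escaped = re.escape(term)
    if mode == "keyword":
        # Word-boundary search: matches whole words only.
        return re.compile(rf"\b{escaped}\b", re.IGNORECASE)
    if mode == "substring":
        # Substring search: matches anywhere in the text.
        return re.compile(escaped, re.IGNORECASE)
    raise ValueError(f"Unknown search type: {mode}")


text = "Build fails during the hipify step on ROCm"
assert make_search_regex("hipify", "substring").search(text)
assert make_search_regex("rocm", "keyword").search(text)
# A keyword search for "hip" does NOT hit "hipify": the trailing word
# boundary blocks it, which is why terms like "hip_" and "hipify" are
# configured as substrings in the workflow instead.
assert make_search_regex("hip", "keyword").search(text) is None
```

The new CC step applies the same idea in reverse for deduplication: before commenting, it tests a case-insensitive `@user\b` pattern against the issue body plus all existing comments so nobody is pinged twice.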
.gitignore (vendored): 3 changes
@@ -94,6 +94,9 @@ ipython_config.py
 # generated files
 **/generated/**

+# uv
+uv.lock
+
 # pyenv
 # For a library or package, you might want to ignore these files since the code is
 # intended to run in multiple environments; otherwise, check them in:
@@ -4,7 +4,6 @@ MD013: false
 MD024:
   siblings_only: true
 MD033: false
-MD042: false
 MD045: false
 MD046: false
 MD051: false
@@ -7,17 +7,18 @@ default_stages:
 exclude: 'vllm/third_party/.*'
 repos:
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.13.3
+  rev: v0.14.0
   hooks:
   - id: ruff-check
     args: [--output-format, github, --fix]
   - id: ruff-format
 - repo: https://github.com/crate-ci/typos
-  rev: v1.35.5
+  rev: v1.38.1
   hooks:
   - id: typos
+    args: [--force-exclude]
 - repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v20.1.3
+  rev: v21.1.2
   hooks:
   - id: clang-format
     exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
@@ -34,32 +35,27 @@ repos:
   hooks:
   - id: actionlint
 - repo: https://github.com/astral-sh/uv-pre-commit
-  rev: 0.6.17
+  rev: 0.9.1
   hooks:
   - id: pip-compile
-    args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
+    args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28]
     files: ^requirements/test\.(in|txt)$
 - repo: local
   hooks:
   - id: format-torch-nightly-test
     name: reformat nightly_torch_test.txt to be in sync with test.in
     language: python
-    entry: python tools/generate_nightly_torch_test.py
+    entry: python tools/pre_commit/generate_nightly_torch_test.py
     files: ^requirements/test\.(in|txt)$
   - id: mypy-local
-    name: Run mypy for local Python installation
-    entry: python tools/pre_commit/mypy.py 0 "local"
+    name: Run mypy locally for lowest supported Python version
+    entry: python tools/pre_commit/mypy.py 0 "3.10"
     stages: [pre-commit] # Don't run in CI
     <<: &mypy_common
       language: python
      types_or: [python, pyi]
      require_serial: true
      additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
-  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
-    name: Run mypy for Python 3.9
-    entry: python tools/pre_commit/mypy.py 1 "3.9"
-    <<: *mypy_common
-    stages: [manual] # Only run in CI
   - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.10
     entry: python tools/pre_commit/mypy.py 1 "3.10"
@@ -75,14 +71,19 @@ repos:
     entry: python tools/pre_commit/mypy.py 1 "3.12"
     <<: *mypy_common
     stages: [manual] # Only run in CI
+  - id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.13
+    entry: python tools/pre_commit/mypy.py 1 "3.13"
+    <<: *mypy_common
+    stages: [manual] # Only run in CI
   - id: shellcheck
     name: Lint shell scripts
-    entry: tools/shellcheck.sh
+    entry: tools/pre_commit/shellcheck.sh
     language: script
     types: [shell]
   - id: png-lint
     name: Lint PNG exports from excalidraw
-    entry: tools/png-lint.sh
+    entry: tools/pre_commit/png-lint.sh
     language: script
     types: [png]
   - id: signoff-commit
@@ -99,12 +100,12 @@ repos:
     stages: [commit-msg]
   - id: check-spdx-header
     name: Check SPDX headers
-    entry: python tools/check_spdx_header.py
+    entry: python tools/pre_commit/check_spdx_header.py
     language: python
     types: [python]
   - id: check-root-lazy-imports
     name: Check root lazy imports
-    entry: python tools/check_init_lazy_imports.py
+    entry: python tools/pre_commit/check_init_lazy_imports.py
     language: python
     types: [python]
   - id: check-filenames
@@ -118,11 +119,11 @@ repos:
     pass_filenames: false
   - id: update-dockerfile-graph
     name: Update Dockerfile dependency graph
-    entry: tools/update-dockerfile-graph.sh
+    entry: tools/pre_commit/update-dockerfile-graph.sh
     language: script
   - id: enforce-import-regex-instead-of-re
     name: Enforce import regex as re
-    entry: python tools/enforce_regex_import.py
+    entry: python tools/pre_commit/enforce_regex_import.py
     language: python
     types: [python]
     pass_filenames: false
@@ -130,7 +131,7 @@ repos:
   # forbid directly import triton
   - id: forbid-direct-triton-import
     name: "Forbid direct 'import triton'"
-    entry: python tools/check_triton_import.py
+    entry: python tools/pre_commit/check_triton_import.py
     language: python
     types: [python]
     pass_filenames: false
@@ -143,7 +144,7 @@ repos:
     additional_dependencies: [regex]
   - id: validate-config
     name: Validate configuration has default values and that each field has a docstring
-    entry: python tools/validate_config.py
+    entry: python tools/pre_commit/validate_config.py
     language: python
     additional_dependencies: [regex]
   # Keep `suggestion` last
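Taken together, the pre-commit changes move every local hook entry point under tools/pre_commit/, drop the Python 3.9 mypy pass while adding a 3.13 one, and repoint mypy-local at the lowest supported interpreter (3.10) rather than whatever is installed locally. The hook ids themselves are unchanged, so local usage such as `pre-commit run --all-files` should be unaffected.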
@@ -34,7 +34,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 # Supported python versions. These versions will be searched in order, the
 # first match will be selected. These should be kept in sync with setup.py.
 #
-set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
+set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")

 # Supported AMD GPU architectures.
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
@@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent. The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")

 #
 # Try to find python package with an executable that exactly matches
@@ -269,8 +269,8 @@ set(VLLM_EXT_SRC
   "csrc/sampler.cu"
   "csrc/cuda_view.cu"
   "csrc/quantization/gptq/q_gemm.cu"
-  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
-  "csrc/quantization/fp8/common.cu"
+  "csrc/quantization/w8a8/int8/scaled_quant.cu"
+  "csrc/quantization/w8a8/fp8/common.cu"
   "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
   "csrc/quantization/gguf/gguf_kernel.cu"
   "csrc/quantization/activation_kernels.cu"
@@ -314,12 +314,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_EXT_SRC
     "csrc/quantization/awq/gemm_kernels.cu"
     "csrc/permute_cols.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+    "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
     "csrc/quantization/fp4/nvfp4_quant_entry.cu"
     "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
     "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
     "csrc/cutlass_extensions/common.cpp"
-    "csrc/quantization/fp8/per_token_group_quant.cu")
+    "csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
+    "csrc/quantization/w8a8/int8/per_token_group_quant.cu")

   set_gencode_flags_for_srcs(
     SRCS "${VLLM_EXT_SRC}"
@@ -423,11 +424,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
     set(SRCS
-      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
+      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -458,9 +459,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
-      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
     )
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
@@ -492,9 +493,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
-      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
     )
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
@@ -525,7 +526,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # subtract out the archs that are already built for 3x
   list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
   if (SCALED_MM_2X_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
@@ -648,7 +649,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # if it's possible to compile MoE kernels that use its output.
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -672,7 +673,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -697,7 +698,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
@@ -720,7 +721,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -882,6 +883,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
   "csrc/moe/torch_bindings.cpp"
   "csrc/moe/moe_align_sum_kernels.cu"
+  "csrc/moe/moe_lora_align_sum_kernels.cu"
   "csrc/moe/topk_softmax_kernels.cu")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -1006,6 +1008,7 @@ endif()
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
   include(cmake/external_projects/flashmla.cmake)
+  include(cmake/external_projects/qutlass.cmake)

   # vllm-flash-attn should be last as it overwrites some CMake functions
   include(cmake/external_projects/vllm_flash_attn.cmake)
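Most of this CMake diff is a mechanical rename: the CUTLASS w8a8 sources move from csrc/quantization/cutlass_w8a8/ (plus the fp8 and int8 quant kernels) into a consolidated csrc/quantization/w8a8/ tree. The substantive changes are the Torch 2.8.0 to 2.9.0 bump for both CUDA and ROCm, the removal of Python 3.9 from the supported list, the new moe_lora_align_sum_kernels.cu MoE source, and the added qutlass external project.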
@@ -149,6 +149,7 @@ Compute Resources:
 - Trainy
 - UC Berkeley
 - UC San Diego
+- Volcengine

 Slack Sponsor: Anyscale
@@ -74,7 +74,7 @@ start_server() {
     local vllm_log=$4
     local profile_dir=$5

-    pkill -if vllm
+    pkill -if "vllm serve" || true

     # Define the common arguments as a bash array.
     # Each argument and its value are separate elements.
@@ -96,11 +96,11 @@ start_server() {
     # This correctly passes each element as a separate argument.
     if [[ -n "$profile_dir" ]]; then
         # Start server with profiling enabled
-        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
+        VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
     else
         # Start server without profiling
-        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
+        VLLM_SERVER_DEV_MODE=1 \
            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
     fi
     local server_pid=$!
@@ -139,7 +139,7 @@ run_benchmark() {
     echo "vllm_log: $vllm_log"
     echo
     rm -f $vllm_log
-    pkill -if vllm
+    pkill -if "vllm serve" || true

     echo "starting server..."
     # Call start_server without a profile_dir to avoid profiling overhead
@@ -232,7 +232,7 @@ run_benchmark() {

     echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"

-    pkill -if vllm
+    pkill -if "vllm serve" || true
     sleep 10
     echo "===================="
     return 0
@@ -308,6 +308,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
 else
     echo "No configuration met the latency requirements. Skipping final profiling run."
 fi
-pkill -if vllm
+pkill -if "vllm serve" || true
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
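Two details in the pkill change are easy to miss: matching on "vllm serve" rather than vllm narrows the case-insensitive full-command-line match (-f) so the cleanup does not kill the tuning script itself or other processes that merely mention vllm, and `|| true` swallows pkill's non-zero exit status when no process matches, which would otherwise abort the script if it runs under `set -e`.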
@@ -8,7 +7,6 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import Optional, Union

 import aiohttp
 import huggingface_hub.constants
@@ -28,13 +27,13 @@ class RequestFuncInput:
     prompt_len: int
     output_len: int
     model: str
-    model_name: Optional[str] = None
-    logprobs: Optional[int] = None
-    extra_body: Optional[dict] = None
-    multi_modal_content: Optional[dict | list[dict]] = None
+    model_name: str | None = None
+    logprobs: int | None = None
+    extra_body: dict | None = None
+    multi_modal_content: dict | list[dict] | None = None
     ignore_eos: bool = False
-    language: Optional[str] = None
-    request_id: Optional[str] = None
+    language: str | None = None
+    request_id: str | None = None


 @dataclass
@@ -52,7 +51,7 @@ class RequestFuncOutput:

 async def async_request_tgi(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith("generate_stream")
@@ -133,7 +132,7 @@ async def async_request_tgi(

 async def async_request_trt_llm(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith("generate_stream")
@@ -204,7 +203,7 @@ async def async_request_trt_llm(

 async def async_request_deepspeed_mii(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("completions", "profile")), (
@@ -267,7 +266,7 @@ async def async_request_deepspeed_mii(

 async def async_request_openai_completions(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("completions", "profile")), (
@@ -367,7 +366,7 @@ async def async_request_openai_completions(

 async def async_request_openai_chat_completions(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("chat/completions", "profile")), (
@@ -476,7 +475,7 @@ async def async_request_openai_chat_completions(

 async def async_request_openai_audio(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     # Lazy import without PlaceholderModule to avoid vllm dep.
     import soundfile
@@ -610,7 +609,7 @@ def get_tokenizer(
     tokenizer_mode: str = "auto",
     trust_remote_code: bool = False,
     **kwargs,
-) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
     if pretrained_model_name_or_path is not None and not os.path.exists(
         pretrained_model_name_or_path
     ):
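The typing changes in this and the following benchmark files are uniform: `Optional[X]` becomes `X | None` and `Union[A, B]` becomes `A | B` (PEP 604). Because annotations on dataclass fields are evaluated when the class body executes (absent `from __future__ import annotations`), the `|` syntax needs Python 3.10+ at runtime, which lines up with the 3.9 support dropped elsewhere in this diff. A small sketch of the equivalence, with a hypothetical class name:

```python
from dataclasses import dataclass


@dataclass
class RequestInput:  # hypothetical stand-in for the dataclasses above
    model: str
    model_name: str | None = None  # previously: Optional[str]
    logprobs: int | None = None    # previously: Optional[int]


# str | None builds a types.UnionType at class-creation time (Python 3.10+).
print(RequestInput(model="m"))
```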
@@ -5,7 +5,7 @@ import gc
 from benchmark_utils import TimeCollector
 from tabulate import tabulate

-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.v1.core.block_pool import BlockPool


@@ -46,7 +46,7 @@ import time

 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser


 def test_long_document_qa(llm=None, sampling_params=None, prompts=None):

@@ -19,7 +19,7 @@ from vllm.config import (
     VllmConfig,
 )
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
 from vllm.v1.worker.gpu_input_batch import InputBatch
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
@@ -32,13 +32,12 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional

 from transformers import PreTrainedTokenizerBase

 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 try:
     from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -80,7 +79,7 @@ def sample_requests_from_dataset(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     input_length_range: tuple[int, int],
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
 ) -> list[Request]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
@@ -128,7 +127,7 @@ def sample_requests_from_random(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     input_length_range: tuple[int, int],
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
     prefix_len: int,
 ) -> list[Request]:
     requests = []

@@ -7,12 +7,11 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional

 from transformers import AutoTokenizer, PreTrainedTokenizerBase

 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser


 # Select a equi-probable random priority
@@ -24,7 +23,7 @@ def sample_requests(
     dataset_path: str,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
 ) -> list[tuple[str, int, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
@@ -31,8 +31,8 @@ import time
 import uuid
 import warnings
 from collections.abc import AsyncGenerator
+from contextlib import nullcontext
 from dataclasses import dataclass
-from typing import Optional

 import datasets
 import numpy as np
@@ -51,7 +51,7 @@ except ImportError:
 from backend_request_func import get_tokenizer

 try:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
 except ImportError:
     from argparse import ArgumentParser as FlexibleArgumentParser

@@ -316,7 +316,7 @@ def calculate_metrics(
     tokenizer: PreTrainedTokenizerBase,
     selected_percentile_metrics: list[str],
     selected_percentiles: list[float],
-    goodput_config_dict: Optional[dict[str, float]] = None,
+    goodput_config_dict: dict[str, float] | None = None,
 ) -> tuple[BenchmarkMetrics, list[int]]:
     actual_output_lens: list[int] = []
     total_input = 0
@@ -436,9 +436,9 @@ async def benchmark(
     selected_percentile_metrics: list[str],
     selected_percentiles: list[str],
     ignore_eos: bool,
-    max_concurrency: Optional[int],
+    max_concurrency: int | None,
     structured_output_ratio: float,
-    goodput_config_dict: Optional[dict[str, float]] = None,
+    goodput_config_dict: dict[str, float] | None = None,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -502,15 +502,9 @@ async def benchmark(

     pbar = None if disable_tqdm else tqdm(total=len(input_requests))

-    # This can be used once the minimum Python version is 3.10 or higher,
-    # and it will simplify the code in limited_request_func.
-    # semaphore = (asyncio.Semaphore(max_concurrency)
-    #              if max_concurrency else contextlib.nullcontext())
-    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()

     async def limited_request_func(request_func_input, pbar):
-        if semaphore is None:
-            return await request_func(request_func_input=request_func_input, pbar=pbar)
         async with semaphore:
             return await request_func(request_func_input=request_func_input, pbar=pbar)
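The old comment in the removed lines explains the new code: once the floor is Python 3.10, `contextlib.nullcontext` also works as an async context manager, so the unlimited-concurrency path no longer needs a separate `if semaphore is None` branch. A runnable sketch of the pattern (names are illustrative):

```python
import asyncio
from contextlib import nullcontext


async def fetch(i: int) -> int:
    await asyncio.sleep(0.01)  # stand-in for one benchmark request
    return i


async def run_all(n_tasks: int, max_concurrency: int | None) -> list[int]:
    # One object covers both cases: a real semaphore when a limit is set,
    # and an async-capable no-op (nullcontext, Python 3.10+) when it is not.
    sem = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()

    async def limited(i: int) -> int:
        async with sem:
            return await fetch(i)

    return await asyncio.gather(*(limited(i) for i in range(n_tasks)))


print(asyncio.run(run_all(8, max_concurrency=2)))
```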
@@ -6,7 +6,7 @@ import math
 import os
 import time
 from types import TracebackType
-from typing import Any, Optional, Union
+from typing import Any


 def convert_to_pytorch_benchmark_format(
@@ -92,7 +92,7 @@ class TimeCollector:
     def __init__(self, scale: int) -> None:
         self.cnt: int = 0
         self._sum: int = 0
-        self._max: Optional[int] = None
+        self._max: int | None = None
         self.scale = scale
         self.start_time: int = time.monotonic_ns()
@@ -104,13 +104,13 @@ class TimeCollector:
         else:
             self._max = max(self._max, v)

-    def avg(self) -> Union[float, str]:
+    def avg(self) -> float | str:
         return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"

-    def max(self) -> Union[float, str]:
+    def max(self) -> float | str:
         return self._max / self.scale if self._max else "N/A"

-    def dump_avg_max(self) -> list[Union[float, str]]:
+    def dump_avg_max(self) -> list[float | str]:
         return [self.avg(), self.max()]

     def __enter__(self) -> None:
@@ -118,8 +118,8 @@ class TimeCollector:

     def __exit__(
         self,
-        exc_type: Optional[type[BaseException]],
-        exc_value: Optional[BaseException],
-        exc_traceback: Optional[TracebackType],
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
     ) -> None:
         self.collect(time.monotonic_ns() - self.start_time)
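Only annotations change in `TimeCollector`, but the hunks show its whole pattern: a context manager that samples `time.monotonic_ns()` on entry and records the delta on exit, divided by `scale` for unit conversion. A condensed, hedged re-implementation for reference (not the vLLM class itself):

```python
import time


class MiniTimeCollector:
    """Minimal re-creation of the pattern above: collect elapsed
    monotonic nanoseconds for each `with` block."""

    def __init__(self, scale: int = 1_000_000) -> None:  # ns -> ms
        self.samples: list[int] = []
        self.scale = scale

    def __enter__(self) -> None:
        self._start = time.monotonic_ns()

    def __exit__(self, exc_type, exc_value, exc_traceback) -> None:
        self.samples.append(time.monotonic_ns() - self._start)

    def avg(self) -> float | str:
        # Same "N/A" convention as the original when nothing was collected.
        if not self.samples:
            return "N/A"
        return sum(self.samples) / len(self.samples) / self.scale


collector = MiniTimeCollector()
for _ in range(3):
    with collector:
        time.sleep(0.001)
print(collector.avg())  # average milliseconds per block
```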
@@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
-from typing import Callable
+from collections.abc import Callable, Iterable

 import torch
 import torch.utils.benchmark as TBenchmark
@@ -16,7 +15,7 @@ from utils import make_rand_sparse_tensors
 from weight_shapes import WEIGHT_SHAPES

 from vllm import _custom_ops as ops
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
@@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
-from typing import Callable, Optional
+from collections.abc import Callable, Iterable

 import torch
 import torch.utils.benchmark as TBenchmark
@@ -19,7 +18,8 @@ from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     w8a8_triton_block_scaled_mm,
 )
-from vllm.utils import FlexibleArgumentParser, cdiv
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.math_utils import cdiv

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
@@ -53,7 +53,7 @@ def bench_int8(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     """Benchmark INT8-based kernels."""
     assert dtype == torch.int8
@@ -108,7 +108,7 @@ def bench_fp8(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     """Benchmark FP8-based kernels."""
     assert dtype == torch.float8_e4m3fn
@@ -183,7 +183,7 @@ def bench(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     if dtype == torch.int8:
         return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
@@ -201,7 +201,7 @@ def print_timers(timers: Iterable[TMeasurement]):
 def run(
     dtype: torch.dtype,
     MKNs: Iterable[tuple[int, int, int]],
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     results = []
     for m, k, n in MKNs:
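`cdiv`, now imported from `vllm.utils.math_utils` above, is plain ceiling division; a drop-in equivalent for readers following along without vLLM installed:

def cdiv(a: int, b: int) -> int:
    # Ceiling division via negated floor division; avoids float rounding.
    return -(-a // b)

assert cdiv(300, 128) == 3 and cdiv(256, 128) == 2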
@@ -3,10 +3,9 @@

 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional

 import torch
 import torch.utils.benchmark as TBenchmark
@@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]:
 def unfused_int8_impl(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
 ):
     # Norm
@@ -68,7 +67,7 @@ def unfused_int8_impl(
 def unfused_fp8_impl(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
 ):
     # Norm
@@ -85,7 +84,7 @@ def unfused_fp8_impl(
 def fused_impl(
     rms_norm_layer: RMSNorm,  # this stores the weights
     x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
 ):
     out, _ = ops.rms_norm_dynamic_per_token_quant(
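A pure-PyTorch sketch (assumptions: FP8 e4m3 support in torch; helper names are illustrative, not the vLLM kernels) of what the unfused paths above do in two passes and the fused `rms_norm_dynamic_per_token_quant` kernel collapses into one:

import torch

def rms_norm(x: torch.Tensor, w: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) * w

def dynamic_per_token_quant_fp8(x: torch.Tensor):
    # One scale per token (row), chosen so the row fits the FP8 range.
    scale = x.abs().amax(dim=-1, keepdim=True) / torch.finfo(torch.float8_e4m3fn).max
    return (x / scale).to(torch.float8_e4m3fn), scale

x, w = torch.randn(4, 64), torch.ones(64)
q, s = dynamic_per_token_quant_fp8(rms_norm(x, w))  # unfused: two kernels, one extra memory round trip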
benchmarks/kernels/bench_mxfp4_qutlass.py (new file, 191 lines)
@@ -0,0 +1,191 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import copy
+import itertools
+
+import torch
+from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
+from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
+from vllm.triton_utils import triton
+
+PROVIDER_CFGS = {
+    "torch-bf16": dict(enabled=True),
+    "mxfp4": dict(no_a_quant=False, enabled=True),
+    "mxfp4-noquant": dict(no_a_quant=True, enabled=True),
+}
+
+_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
+
+
+def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
+    return (
+        deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
+        * group_size**-0.5
+    )
+
+
+def _quant_weight_mxfp4(
+    b: torch.Tensor, forward_hadamard_matrix: torch.Tensor, device: str
+):
+    weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeMx(
+        b, forward_hadamard_matrix, method="abs_max"
+    )
+    weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton")
+    return weight_hf_e2m1, weight_hf_scale_block
+
+
+def build_mxfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device):
+    weight_hf_e2m1, weight_hf_scale_block = _quant_weight_mxfp4(
+        b, forward_hadamard_matrix, device
+    )
+    alpha = torch.tensor([1.0], device="cuda")
+
+    if cfg["no_a_quant"]:
+        # Pre-quantize activation
+        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
+            a, forward_hadamard_matrix, method="abs_max"
+        )
+        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
+
+        def run():
+            return matmul_mxf4_bf16_tn(
+                input_hf_e2m1,
+                weight_hf_e2m1,
+                input_hf_scale_block,
+                weight_hf_scale_block,
+                alpha,
+            )
+
+        return run
+
+    # Quantize activation on-the-fly
+    def run():
+        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
+            a, forward_hadamard_matrix, method="abs_max"
+        )
+        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
+        return matmul_mxf4_bf16_tn(
+            input_hf_e2m1,
+            weight_hf_e2m1,
+            input_hf_scale_block,
+            weight_hf_scale_block,
+            alpha,
+        )
+
+    return run
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[1, 4, 8, 16, 32, 64, 128, 256, 512, 1024,
+                2048, 4096, 8192, 16384, 24576, 32768],
+        x_log=False,
+        line_arg="provider",
+        line_vals=_enabled,
+        line_names=_enabled,
+        ylabel="TFLOP/s (larger is better)",
+        plot_name="BF16 vs MXFP4 GEMMs",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K, had_size):
+    M = batch_size
+    device = "cuda"
+    dtype = torch.bfloat16
+
+    a = torch.randn((M, K), device=device, dtype=dtype)
+    b = torch.randn((N, K), device=device, dtype=dtype)
+    forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "torch-bf16":
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
+        )
+    else:
+        cfg = PROVIDER_CFGS[provider]
+        run_quant = build_mxfp4_runner(
+            cfg, a, b, forward_hadamard_matrix, dtype, device
+        )
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: run_quant(), rep=200, quantiles=quantiles
+        )
+
+    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
+    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
+
+
+def prepare_shapes(args):
+    out = []
+    for model, tp_size in itertools.product(args.models, args.tp_sizes):
+        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+            KN[tp_dim] //= tp_size
+            KN.append(model)
+            out.append(KN)
+    return out
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=["meta-llama/Llama-3.3-70B-Instruct"],
+        choices=list(WEIGHT_SHAPES.keys()),
+    )
+    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
+    args = parser.parse_args()
+
+    for K, N, model in prepare_shapes(args):
+        for had_size in [32, 64, 128]:
+            print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs MXFP4 GEMMs TFLOP/s:")
+            benchmark.run(
+                print_data=True,
+                show_plots=True,
+                save_path=f"bench_mxfp4_res_n{N}_k{K}",
+                N=N,
+                K=K,
+                had_size=had_size,
+            )
+
+    print("Benchmark finished!")
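The TFLOP/s accounting both new qutlass benchmarks share, pulled out for clarity (a GEMM over (M, K) x (K, N) costs 2·M·N·K FLOPs):

def to_tflops(ms: float, M: int, N: int, K: int) -> float:
    # 2*M*N*K FLOPs, converted to TFLOPs, divided by runtime in seconds.
    return (2 * M * N * K) * 1e-12 / (ms * 1e-3)

print(round(to_tflops(1.0, 4096, 8192, 8192)))  # ~550 TFLOP/s at 1 ms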
benchmarks/kernels/bench_nvfp4_qutlass.py (new file, 207 lines)
@@ -0,0 +1,207 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import copy
+import itertools
+
+import torch
+from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops  # use existing nvfp4 gemm in vllm
+from vllm._custom_ops import fusedQuantizeNv
+from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
+from vllm.triton_utils import triton
+
+PROVIDER_CFGS = {
+    "torch-bf16": dict(enabled=True),
+    "nvfp4": dict(no_a_quant=False, enabled=True),
+    "nvfp4-noquant": dict(no_a_quant=True, enabled=True),
+}
+
+_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
+
+
+def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
+    return (
+        deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
+        * group_size**-0.5
+    )
+
+
+def _quant_weight_nvfp4(
+    b: torch.Tensor,
+    forward_hadamard_matrix: torch.Tensor,
+    global_scale: torch.Tensor,
+    device: str,
+    M: int,
+    N: int,
+    K: int,
+):
+    weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeNv(
+        b, forward_hadamard_matrix, global_scale
+    )
+    weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton").view(
+        -1, K // 16
+    )
+    return weight_hf_e2m1, weight_hf_scale_block
+
+
+def build_nvfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K):
+    alpha = torch.tensor([1.0], device="cuda")
+    global_scale = torch.tensor([1.0], device="cuda")
+    weight_hf_e2m1, weight_hf_scale_block = _quant_weight_nvfp4(
+        b, forward_hadamard_matrix, global_scale, device, M, N, K
+    )
+
+    if cfg["no_a_quant"]:
+        # Pre-quantize activation
+        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
+            a, forward_hadamard_matrix, global_scale
+        )
+        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
+            -1, K // 16
+        )

+        def run():
+            return ops.cutlass_scaled_fp4_mm(
+                input_hf_e2m1,
+                weight_hf_e2m1,
+                input_hf_scale_block,
+                weight_hf_scale_block,
+                alpha,
+                torch.bfloat16,
+            )
+
+        return run
+
+    # Quantize activation on-the-fly
+    def run():
+        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
+            a, forward_hadamard_matrix, global_scale
+        )
+        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
+            -1, K // 16
+        )
+        return ops.cutlass_scaled_fp4_mm(
+            input_hf_e2m1,
+            weight_hf_e2m1,
+            input_hf_scale_block,
+            weight_hf_scale_block,
+            alpha,
+            torch.bfloat16,
+        )
+
+    return run
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[1, 4, 8, 16, 32, 64, 128, 256, 512, 1024,
+                2048, 4096, 8192, 16384, 24576, 32768],
+        x_log=False,
+        line_arg="provider",
+        line_vals=_enabled,
+        line_names=_enabled,
+        ylabel="TFLOP/s (larger is better)",
+        plot_name="BF16 vs NVFP4 GEMMs",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K, had_size):
+    M = batch_size
+    device = "cuda"
+    dtype = torch.bfloat16
+
+    a = torch.randn((M, K), device=device, dtype=dtype)
+    b = torch.randn((N, K), device=device, dtype=dtype)
+    forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "torch-bf16":
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
+        )
+    else:
+        cfg = PROVIDER_CFGS[provider]
+        run_quant = build_nvfp4_runner(
+            cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K
+        )
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: run_quant(), rep=200, quantiles=quantiles
+        )
+
+    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
+    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
+
+
+def prepare_shapes(args):
+    out = []
+    for model, tp_size in itertools.product(args.models, args.tp_sizes):
+        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+            KN[tp_dim] //= tp_size
+            KN.append(model)
+            out.append(KN)
+    return out
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=["meta-llama/Llama-3.3-70B-Instruct"],
+        choices=list(WEIGHT_SHAPES.keys()),
+    )
+    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
+    args = parser.parse_args()
+
+    for K, N, model in prepare_shapes(args):
+        for had_size in [16, 32, 64, 128]:
+            print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs NVFP4 GEMMs TFLOP/s:")
+            benchmark.run(
+                print_data=True,
+                show_plots=True,
+                save_path=f"bench_nvfp4_res_n{N}_k{K}",
+                N=N,
+                K=K,
+                had_size=had_size,
+            )
+
+    print("Benchmark finished!")
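If I read the two new files right, the `.view(-1, K // 16)` calls encode NVFP4's scale layout: one block scale per 16 elements along K, whereas MXFP4 above uses 32-element groups and needs no such reshape. A quick bookkeeping check (illustrative numbers):

N, K = 8192, 8192
nvfp4_scales = N * (K // 16)  # one scale per 16-wide group along K
print(nvfp4_scales)           # 4194304 block scales for an 8192x8192 weight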
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
-from typing import Callable
+from collections.abc import Callable
 from unittest.mock import patch

 import pandas as pd
@@ -10,7 +10,8 @@ import torch
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.triton_utils import triton
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


 def with_triton_mode(fn):
@@ -10,7 +10,8 @@ import vllm.model_executor.layers.activation  # noqa F401
 from vllm.model_executor.custom_op import CustomOp
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

 batch_size_range = [1, 16, 32, 64, 128]
 seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
@@ -28,7 +28,7 @@ except ImportError as e:

 from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target

-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 parser = FlexibleArgumentParser(
     description="Benchmark BitBLAS int4 on a specific target."
@@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.config import (
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 from vllm.scalar_type import scalar_types
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 WEIGHT_SHAPES_MOE = {
     "nvidia/DeepSeek-R1-FP4": [
@@ -14,7 +14,7 @@ from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_confi
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 # Weight shapes for different models: [num_experts, topk, hidden_size,
 # intermediate_size]
@@ -22,8 +22,8 @@ Example:
 import json
 import os
 import time
+from collections.abc import Callable
 from contextlib import nullcontext
-from typing import Callable, Optional

 import torch
 import torch.distributed as dist
@@ -39,7 +39,7 @@ from vllm.distributed.device_communicators.pynccl_allocator import (
 )
 from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
 from vllm.logger import init_logger
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 logger = init_logger(__name__)

@@ -264,12 +264,12 @@ class CommunicatorBenchmark:
     def benchmark_allreduce_single(
         self,
         sequence_length: int,
-        allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
+        allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
         should_use_fn: Callable[[torch.Tensor], bool],
         context,
         num_warmup: int,
         num_trials: int,
-    ) -> Optional[float]:
+    ) -> float | None:
         """Benchmark method with CUDA graph optimization."""
         try:
             # Create test tensor (2D: sequence_length x hidden_size)
@@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
     fused_experts,
     fused_topk,
 )
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 DEFAULT_MODELS = [
     "nm-testing/Mixtral-8x7B-Instruct-v0.1",
@@ -7,7 +7,8 @@ import torch

 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


 @torch.inference_mode()
@@ -6,11 +6,12 @@ import copy
 import json
 import pickle
 import time
+from collections.abc import Callable
 from dataclasses import dataclass
 from enum import Enum, auto
 from itertools import product
 from pathlib import Path
-from typing import Any, Callable, Optional
+from typing import Any

 import torch
 import torch.utils.benchmark as TBenchmark
@@ -24,7 +25,7 @@ if HAS_TRITON:
     from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
     from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT

-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_TP_SIZES = [1]
@@ -158,7 +159,7 @@ def ref_group_gemm(
     seq_lens_cpu: torch.Tensor,
     prompt_lora_mapping_cpu: torch.Tensor,
     scaling: float,
-    add_inputs: Optional[bool],
+    add_inputs: bool | None,
 ):
     """
     Torch group gemm reference implementation to test correctness of
@@ -316,8 +317,8 @@ class BenchmarkContext:
     lora_rank: int
     sort_by_lora_id: bool
     dtype: torch.dtype
-    seq_length: Optional[int] = None
-    num_slices: Optional[int] = None  # num_slices for slice based ops
+    seq_length: int | None = None
+    num_slices: int | None = None  # num_slices for slice based ops

     def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
         ctx = copy.copy(self)
@@ -561,7 +562,7 @@ class BenchmarkTensors:
         }

     def bench_fn_kwargs(
-        self, op_type: OpType, add_inputs: Optional[bool] = None
+        self, op_type: OpType, add_inputs: bool | None = None
     ) -> dict[str, Any]:
         if op_type.is_shrink_fn():
             assert add_inputs is None
@@ -575,7 +576,7 @@ class BenchmarkTensors:
         raise ValueError(f"Unrecognized optype {self}")

     def test_correctness(
-        self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
+        self, op_type: OpType, expand_fn_add_inputs: bool | None
     ) -> bool:
         """
         Test correctness of op_type implementation against a grouped gemm
@@ -611,8 +612,8 @@ def bench_optype(
     ctx: BenchmarkContext,
     arg_pool_size: int,
     op_type: OpType,
-    cuda_graph_nops: Optional[int] = None,
-    expand_fn_add_inputs: Optional[bool] = None,
+    cuda_graph_nops: int | None = None,
+    expand_fn_add_inputs: bool | None = None,
     test_correctness: bool = False,
 ) -> TMeasurement:
     assert arg_pool_size >= 1
@@ -679,7 +680,7 @@ def bench_torch_mm(
     ctx: BenchmarkContext,
     arg_pool_size: int,
     op_type: OpType,
-    cuda_graph_nops: Optional[int] = None,
+    cuda_graph_nops: int | None = None,
 ) -> TMeasurement:
     """
     Benchmark basic torch.mm as a roofline.
@@ -744,7 +745,7 @@ def use_cuda_graph_recommendation() -> str:
     """


-def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
+def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
     compare = TBenchmark.Compare(timers)
     compare.print()

@@ -8,10 +8,9 @@ import math
 import os
 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional

 import pandas as pd
 import torch
@@ -34,7 +33,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
     quantize_weights,
 )
 from vllm.scalar_type import ScalarType, scalar_types
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]
@@ -63,23 +62,23 @@ class BenchmarkTensors:
     a: torch.Tensor

     w_q: torch.Tensor
-    group_size: Optional[int]
+    group_size: int | None
     wtype: ScalarType
     w_g_s: torch.Tensor
-    w_g_zp: Optional[torch.Tensor]
-    w_ch_s: Optional[torch.Tensor]
-    w_tok_s: Optional[torch.Tensor]
+    w_g_zp: torch.Tensor | None
+    w_ch_s: torch.Tensor | None
+    w_tok_s: torch.Tensor | None


 @dataclass
 class TypeConfig:
     act_type: torch.dtype
     weight_type: ScalarType
-    output_type: Optional[torch.dtype]
-    group_scale_type: Optional[torch.dtype]
-    group_zero_type: Optional[torch.dtype]
-    channel_scale_type: Optional[torch.dtype]
-    token_scale_type: Optional[torch.dtype]
+    output_type: torch.dtype | None
+    group_scale_type: torch.dtype | None
+    group_zero_type: torch.dtype | None
+    channel_scale_type: torch.dtype | None
+    token_scale_type: torch.dtype | None


 def rand_data(shape, dtype=torch.float16, scale=1):
@@ -93,8 +92,8 @@ def quantize_and_pack(
     atype: torch.dtype,
     w: torch.Tensor,
     wtype: ScalarType,
-    stype: Optional[torch.dtype],
-    group_size: Optional[int],
+    stype: torch.dtype | None,
+    group_size: int | None,
     zero_points: bool = False,
 ):
     assert wtype.is_integer(), "TODO: support floating point weights"
@@ -113,7 +112,7 @@ def quantize_and_pack(


 def create_bench_tensors(
-    shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
+    shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
 ) -> list[BenchmarkTensors]:
     m, n, k = shape

@@ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
     return res


-_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
-_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
+_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
+_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None


 def bench(
@@ -44,7 +44,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
     sort_weights,
 )
 from vllm.scalar_type import ScalarType, scalar_types
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
@@ -22,7 +22,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
 from vllm.triton_utils import triton
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 FP8_DTYPE = current_platform.fp8_dtype()

@@ -579,10 +579,12 @@ def main(args: argparse.Namespace):
         E = config.ffn_config.moe_num_experts
         topk = config.ffn_config.moe_top_k
         intermediate_size = config.ffn_config.ffn_hidden_size
+        hidden_size = config.hidden_size
     elif config.architectures[0] == "JambaForCausalLM":
         E = config.num_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
+        hidden_size = config.hidden_size
     elif config.architectures[0] in (
         "DeepseekV2ForCausalLM",
         "DeepseekV3ForCausalLM",
@@ -592,6 +594,7 @@ def main(args: argparse.Namespace):
         E = config.n_routed_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
+        hidden_size = config.hidden_size
     elif config.architectures[0] in (
         "Qwen2MoeForCausalLM",
         "Qwen3MoeForCausalLM",
@@ -600,10 +603,18 @@ def main(args: argparse.Namespace):
         E = config.num_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
+        hidden_size = config.hidden_size
+    elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
+        text_config = config.get_text_config()
+        E = text_config.num_experts
+        topk = text_config.num_experts_per_tok
+        intermediate_size = text_config.moe_intermediate_size
+        hidden_size = text_config.hidden_size
     elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
         E = config.num_experts
         topk = config.moe_topk[0]
         intermediate_size = config.moe_intermediate_size[0]
+        hidden_size = config.hidden_size
     else:
         # Support for llama4
         config = config.get_text_config()
@@ -611,6 +622,7 @@ def main(args: argparse.Namespace):
         E = config.num_local_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
+        hidden_size = config.hidden_size
     enable_ep = bool(args.enable_expert_parallel)
     if enable_ep:
         ensure_divisibility(E, args.tp_size, "Number of experts")
@@ -619,8 +631,7 @@ def main(args: argparse.Namespace):
     else:
         ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     block_quant_shape = get_weight_block_size_safety(config)
@@ -17,7 +17,7 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
 )
 from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 FP8_DTYPE = current_platform.fp8_dtype()

@@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
     topk = config.num_experts_per_tok

     hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     use_customized_permute = args.use_customized_permute
@@ -39,7 +39,7 @@ import torch
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

@@ -3,16 +3,15 @@

 import random
 import time
-from typing import Optional

 import torch

 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
     create_kv_caches_with_random,
 )

@@ -37,7 +36,7 @@ def main(
     seed: int,
     do_profile: bool,
     device: str = "cuda",
-    kv_cache_dtype: Optional[str] = None,
+    kv_cache_dtype: str | None = None,
 ) -> None:
     current_platform.seed_everything(seed)

@@ -3,8 +3,8 @@

 import argparse
 import math
+from collections.abc import Callable
 from contextlib import contextmanager
-from typing import Callable
 from unittest.mock import patch

 import torch
(deleted file, 155 lines)
@@ -1,155 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import itertools
-
-import torch
-
-from vllm import _custom_ops as vllm_ops
-from vllm.triton_utils import triton
-
-
-def polynorm_naive(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float = 1e-6,
-):
-    orig_shape = x.shape
-    x = x.view(-1, x.shape[-1])
-
-    def norm(x, eps: float):
-        return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
-
-    x = x.float()
-    return (
-        (
-            weight[0] * norm(x**3, eps)
-            + weight[1] * norm(x**2, eps)
-            + weight[2] * norm(x, eps)
-            + bias
-        )
-        .to(weight.dtype)
-        .view(orig_shape)
-    )
-
-
-def polynorm_vllm(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float = 1e-6,
-):
-    orig_shape = x.shape
-    x = x.view(-1, x.shape[-1])
-
-    out = torch.empty_like(x)
-    vllm_ops.poly_norm(out, x, weight, bias, eps)
-    output = out
-
-    output = output.view(orig_shape)
-    return output
-
-
-def calculate_diff(batch_size, seq_len, hidden_dim):
-    dtype = torch.bfloat16
-    x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
-    weight = torch.ones(3, dtype=dtype, device="cuda")
-    bias = torch.ones(1, dtype=dtype, device="cuda")
-
-    output_naive = polynorm_naive(x, weight, bias)
-    output_vllm = polynorm_vllm(x, weight, bias)
-
-    if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
-        print("✅ All implementations match")
-    else:
-        print("❌ Implementations differ")
-
-
-batch_size_range = [2**i for i in range(0, 7, 2)]
-seq_length_range = [2**i for i in range(6, 11, 1)]
-dim_range = [2048, 4096]
-configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
-
-
-def get_benchmark():
-    @triton.testing.perf_report(
-        triton.testing.Benchmark(
-            x_names=["dim", "batch_size", "seq_len"],
-            x_vals=[list(_) for _ in configs],
-            line_arg="provider",
-            line_vals=["naive", "vllm"],
-            line_names=["Naive", "vLLM"],
-            styles=[("blue", "-"), ("red", "-")],
-            ylabel="us",
-            plot_name="polynorm-perf",
-            args={},
-        )
-    )
-    def benchmark(dim, batch_size, seq_len, provider):
-        dtype = torch.bfloat16
-        hidden_dim = dim * 4
-
-        x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
-        weight = torch.ones(3, dtype=dtype, device="cuda")
-        bias = torch.ones(1, dtype=dtype, device="cuda")
-
-        quantiles = [0.5, 0.2, 0.8]
-
-        if provider == "naive":
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: polynorm_naive(x, weight, bias),
-                quantiles=quantiles,
-            )
-        else:
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: polynorm_vllm(x, weight, bias),
-                quantiles=quantiles,
-            )
-
-        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
-
-    return benchmark
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--batch-size",
-        type=int,
-        default=4,
-        help="Batch size",
-    )
-    parser.add_argument(
-        "--seq-len",
-        type=int,
-        default=128,
-        help="Sequence length",
-    )
-    parser.add_argument(
-        "--hidden-dim",
-        type=int,
-        default=8192,
-        help="Intermediate size of MLP",
-    )
-    parser.add_argument(
-        "--save-path",
-        type=str,
-        default="./configs/polnorm/",
-        help="Path to save polnorm benchmark results",
-    )
-
-    args = parser.parse_args()
-
-    # Run correctness test
-    calculate_diff(
-        batch_size=args.batch_size,
-        seq_len=args.seq_len,
-        hidden_dim=args.hidden_dim,
-    )
-
-    benchmark = get_benchmark()
-    # Run performance benchmark
-    benchmark.run(print_data=True, save_path=args.save_path)
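For reference, the polynomial norm the deleted benchmark exercised, reduced to a compact equivalent of its `polynorm_naive` (same math, `rsqrt` instead of division):

import torch

def poly_norm(x: torch.Tensor, w: torch.Tensor, b: torch.Tensor, eps: float = 1e-6):
    n = lambda t: t * torch.rsqrt(t.pow(2).mean(-1, keepdim=True) + eps)
    return w[0] * n(x**3) + w[1] * n(x**2) + w[2] * n(x) + b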
@@ -7,7 +7,8 @@ import torch

 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


 @torch.inference_mode()
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations

 import random
 import time

@@ -11,9 +9,9 @@ from tabulate import tabulate
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
     create_kv_caches_with_random,
 )

@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations

 import random
 import time

@@ -14,9 +12,9 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import (
 )
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
     create_kv_caches_with_random_flash,
 )

@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import itertools
-from typing import Optional, Union

 import torch
 from flashinfer.norm import fused_add_rmsnorm, rmsnorm
@@ -21,8 +20,8 @@ class HuggingFaceRMSNorm(nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         orig_dtype = x.dtype
         x = x.to(torch.float32)
         if residual is not None:
|
|||||||
def rmsnorm_naive(
|
def rmsnorm_naive(
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
weight: torch.Tensor,
|
weight: torch.Tensor,
|
||||||
residual: Optional[torch.Tensor] = None,
|
residual: torch.Tensor | None = None,
|
||||||
eps: float = 1e-6,
|
eps: float = 1e-6,
|
||||||
):
|
):
|
||||||
naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
|
naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
|
||||||
@@ -65,7 +64,7 @@ def rmsnorm_naive(
 def rmsnorm_flashinfer(
     x: torch.Tensor,
     weight: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
+    residual: torch.Tensor | None = None,
     eps: float = 1e-6,
 ):
     orig_shape = x.shape
@@ -89,7 +88,7 @@ def rmsnorm_flashinfer(
 def rmsnorm_vllm(
     x: torch.Tensor,
     weight: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
+    residual: torch.Tensor | None = None,
     eps: float = 1e-6,
 ):
     orig_shape = x.shape
@@ -2,14 +2,13 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from itertools import accumulate
-from typing import Optional

 import nvtx
 import torch

 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser


 def benchmark_rope_kernels_multi_lora(
@@ -18,7 +17,7 @@ def benchmark_rope_kernels_multi_lora(
     seq_len: int,
     num_heads: int,
     head_size: int,
-    rotary_dim: Optional[int],
+    rotary_dim: int | None,
     dtype: torch.dtype,
     seed: int,
     device: str,
@@ -1,5 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+"""
+Comprehensive 3-way SiLU Benchmark Suite
+
+This benchmark compares three SiLU implementations:
+1. SiLU V2 (CUDA) - Optimized CUDA kernel implementation
+2. Triton Kernel - Triton-based implementation
+
+The suite generates detailed performance comparisons including:
+- Memory bandwidth utilization
+- Speedup ratios (baseline vs optimized implementations)
+- Performance across different expert configurations and token distributions
+"""
+
 from collections.abc import Callable

 import matplotlib.pyplot as plt
@ -7,7 +21,7 @@ import numpy as np
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
|
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
|
||||||
silu_mul_fp8_quant_deep_gemm_cuda,
|
persistent_masked_m_silu_mul_quant,
|
||||||
)
|
)
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.triton_utils import tl, triton
|
from vllm.triton_utils import tl, triton
|
||||||
@ -94,6 +108,7 @@ def silu_mul_fp8_quant_deep_gemm_triton(
|
|||||||
num_parallel_tokens,
|
num_parallel_tokens,
|
||||||
group_size: int = 128,
|
group_size: int = 128,
|
||||||
eps: float = 1e-10,
|
eps: float = 1e-10,
|
||||||
|
expert_offsets: torch.Tensor = None,
|
||||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||||
"""Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
|
"""Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
|
||||||
|
|
||||||
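The docstring above pins down the operator contract: SiLU is applied to the first half of the last dimension, multiplied by the second half, and the product is quantized to FP8 with one scale per `group_size` channels. A naive eager-mode reference for cross-checking either kernel; the exact scale layout and the e4m3 clamp value are assumptions based on the docstring, not the kernel's internals:

import torch

def silu_mul_fp8_quant_ref(
    y: torch.Tensor,  # (..., 2 * H), H divisible by group_size
    group_size: int = 128,
    eps: float = 1e-10,
    fp8_max: float = 448.0,  # largest finite float8_e4m3fn value
) -> tuple[torch.Tensor, torch.Tensor]:
    H = y.shape[-1] // 2
    gate, up = y[..., :H].float(), y[..., H:].float()
    prod = torch.nn.functional.silu(gate) * up

    # One scale per contiguous group of group_size channels.
    g = prod.reshape(*prod.shape[:-1], H // group_size, group_size)
    scales = g.abs().amax(dim=-1).clamp(min=eps) / fp8_max
    q = (g / scales.unsqueeze(-1)).clamp(-fp8_max, fp8_max)
    return q.reshape(prod.shape).to(torch.float8_e4m3fn), scales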
@@ -174,7 +189,7 @@ def silu_mul_fp8_quant_deep_gemm_triton(


 # Parse generation strategies
-strategies = ["uniform", "max_t", "first_t"]
+strategies = ["random_imbalanced", "uniform", "max_t"]


 def benchmark(
@@ -195,15 +210,27 @@ def benchmark(
     current_platform.seed_everything(42 + seed_offset)
     y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()

-    if gen_strategy == "uniform":
-        r = torch.rand(size=(E,), device="cuda")
+    if gen_strategy == "random_imbalanced":
+
+        def generate_expert_loads(n_e, total_tokens, ratio, device="cuda"):
+            mean = total_tokens // n_e
+            min_max = mean // ratio
+            e = torch.ones(size=(E,), dtype=torch.int64, device=device) * mean
+            e[0] = min_max
+            r = torch.rand(size=(E - 1,))
+            r /= r.sum()
+            r *= total_tokens - min_max
+            r = r.round().long()
+            e[1:] = r.to(device=device)
+            return e
+
+        tokens_per_expert = generate_expert_loads(E, total_tokens, 0.7, "cuda")
+    elif gen_strategy == "uniform":
+        r = torch.rand(size=(E,))
         r /= r.sum()
         r *= total_tokens
-        tokens_per_expert = r.int()
-        tokens_per_expert = torch.minimum(
-            tokens_per_expert,
-            torch.ones((E,), device=r.device, dtype=torch.int) * T,
-        )
+        r = r.round().long()
+        tokens_per_expert = r
     elif gen_strategy == "max_t":
         tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda")
         tokens_per_expert.fill_(total_tokens / E)
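The new `random_imbalanced` strategy above deliberately skews routing: expert 0 is pinned to `mean // ratio` tokens (roughly 1/ratio times the per-expert mean when ratio < 1) and the remaining token budget is split randomly across the other experts. A standalone sketch that mirrors the generator so the skew can be inspected off-GPU:

import torch

def generate_expert_loads(n_e: int, total_tokens: int, ratio: float, device="cpu"):
    # Mirrors the benchmark's generator on CPU for inspection.
    mean = total_tokens // n_e
    pinned = int(mean // ratio)  # expert 0 gets ~1/ratio of the mean
    e = torch.full((n_e,), mean, dtype=torch.int64, device=device)
    e[0] = pinned
    r = torch.rand(n_e - 1)
    r /= r.sum()
    r *= total_tokens - pinned
    e[1:] = r.round().long().to(device)
    return e

loads = generate_expert_loads(n_e=8, total_tokens=8 * 1024, ratio=0.7)
print(loads, loads.sum())  # expert 0 carries ~1.43x the mean; sum ~= total_tokens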
@@ -281,40 +308,34 @@ def benchmark(


 def create_comparison_plot(
-    ratio, cuda_times, baseline_times, config_labels, strategy_name, id
+    ratios, silu_v2_times, triton_times, config_labels, strategy_name, id
 ):
-    """Create a comparison plot for a specific generation strategy"""
-    fig, ax = plt.subplots(1, 1, figsize=(16, 6))
+    fig, ax = plt.subplots(1, 1, figsize=(18, 6))

     # Configure x-axis positions
     x = np.arange(len(config_labels))
-    width = 0.35
+    width = 0.25

     # Execution Time plot (lower is better)
+    ax.bar(x, silu_v2_times, width, label="SiLU V2 (CUDA)", alpha=0.8, color="blue")
     ax.bar(
-        x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue"
-    )
-    ax.bar(
-        x + width / 2,
-        baseline_times,
-        width,
-        label="Baseline",
-        alpha=0.8,
-        color="orange",
+        x + width, triton_times, width, label="Triton Kernel", alpha=0.8, color="green"
     )

-    # Add speedup labels over each bar pair
+    # Add speedup labels over each bar trio
     for i in range(len(x)):
-        speedup = ratio[i]
-        max_height = max(cuda_times[i], baseline_times[i])
+        triton_v2_speedup = ratios[i][1]  # triton/v2
+        max_height = max(silu_v2_times[i], triton_times[i])

+        # Triton/V2 speedup
         ax.text(
-            x[i],
+            x[i] + width / 2,
             max_height + max_height * 0.02,
-            f"{speedup:.2f}x",
+            f"{triton_v2_speedup:.2f}x",
             ha="center",
             va="bottom",
             fontweight="bold",
-            fontsize=9,
+            fontsize=8,
         )

     ax.set_xlabel("Configuration")
@@ -332,56 +353,75 @@ def create_comparison_plot(


 def create_combined_plot(all_results):
-    """Create a combined plot with all strategies in one PNG"""
     num_strategies = len(all_results)
-    fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies))
+    fig, axes = plt.subplots(num_strategies, 1, figsize=(22, 7 * num_strategies))

     if num_strategies == 1:
         axes = [axes]

     for idx, (
         strategy_name,
-        ratio,
-        cuda_times,
-        baseline_times,
+        all_ratios,
+        all_silu_v2_results,
+        all_triton_results,
         config_labels,
+        config_x_axis,
     ) in enumerate(all_results):
         ax = axes[idx]

+        # Flatten the nested results to get bandwidth percentages for plotting
+        silu_v2_bandwidths = []
+        triton_bandwidths = []
+        flat_ratios = []
+
+        for config_results in all_silu_v2_results:
+            for result in config_results:
+                silu_v2_bandwidths.append(result[3])  # bandwidth percentage
+
+        for config_results in all_triton_results:
+            for result in config_results:
+                triton_bandwidths.append(result[3])  # bandwidth percentage
+
+        for config_ratios in all_ratios:
+            for ratio in config_ratios:
+                flat_ratios.append(ratio)
+
         # Configure x-axis positions
         x = np.arange(len(config_labels))
-        width = 0.35
+        width = 0.25

-        # Execution Time plot (lower is better)
+        # Bandwidth utilization plot (higher is better)
         ax.bar(
-            x - width / 2,
-            cuda_times,
+            x,
+            silu_v2_bandwidths,
             width,
-            label="CUDA Kernel",
+            label="SiLU V2 (CUDA)",
             alpha=0.8,
             color="blue",
         )
         ax.bar(
-            x + width / 2,
-            baseline_times,
+            x + width,
+            triton_bandwidths,
             width,
-            label="Baseline",
+            label="Triton Kernel",
             alpha=0.8,
-            color="orange",
+            color="green",
         )

-        # Add speedup labels over each bar pair
+        # Add speedup labels over each bar trio
         for i in range(len(x)):
-            speedup = ratio[i]
-            max_height = max(cuda_times[i], baseline_times[i])
+            triton_v2_speedup = flat_ratios[i]  # triton/v2
+            max_height = max(silu_v2_bandwidths[i], triton_bandwidths[i])

+            # Triton/V2 speedup
             ax.text(
-                x[i],
+                x[i] + width / 2,
                 max_height + max_height * 0.02,
-                f"{speedup:.2f}x",
+                f"{triton_v2_speedup:.2f}x",
                 ha="center",
                 va="bottom",
                 fontweight="bold",
-                fontsize=9,
+                fontsize=8,
             )

         ax.set_xlabel("Configuration")
@@ -395,7 +435,7 @@ def create_combined_plot(all_results):
     ax.grid(True, alpha=0.3)

     plt.tight_layout()
-    filename = "../../silu_bench/silu_benchmark_combined.png"
+    filename = "silu_benchmark_combined_3way.png"
     plt.savefig(filename, dpi=300, bbox_inches="tight")
     plt.show()

@@ -405,7 +445,9 @@ def create_combined_plot(all_results):
 outer_dim = 7168
 configs = [
     # DeepSeekV3 Configs
+    # (1, 56, 7168),
     (8, 1024, 7168),
+    # (32, 56, 7168),
     # DeepSeekV3 Configs
     (32, 1024, 7168),
     # DeepSeekV3 Configs
@@ -417,6 +459,7 @@ num_warmups = 20

 strategy_descriptions = {
     "uniform": "Uniform Random",
+    "random_imbalanced": "Imbalanced Random",
     "max_t": "Even Assignment",
     "first_t": "experts[0] = T, experts[1:] = 0",
 }
@@ -433,28 +476,31 @@ for id, strategy in enumerate(strategies):
     print(f"Testing strategy: {strategy_descriptions[strategy]}")
     print(f"{'=' * 60}")

-    # Collect benchmark data for both algorithms
+    # Collect benchmark data for all three algorithms
     config_labels = []
     config_x_axis = []
-    all_cuda_results = []
-    all_baseline_results = []
+    all_silu_v2_results = []
+    all_triton_results = []
     all_ratios = []

     for E, T, H in configs:
-        total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E]
+        total_tokens_config = []
+        for i in [8, 16, 32, 64, 128, 256, 512]:
+            if i <= T:
+                total_tokens_config.append(i * E)
         config_x_axis.append(total_tokens_config)

-        cuda_results = []
-        baseline_results = []
+        silu_v2_results = []
+        triton_results = []
         ratios = []

         for total_tokens in total_tokens_config:
             config_label = f"E={E},T={T},H={H},TT={total_tokens}"
             config_labels.append(config_label)

-            # CUDA kernel results
-            time_ms_cuda, gflops, gbps, perc = benchmark(
-                silu_mul_fp8_quant_deep_gemm_cuda,
+            # SiLU V2 (CUDA kernel) results
+            time_ms_silu_v2, gflops, gbps, perc = benchmark(
+                persistent_masked_m_silu_mul_quant,
                 E,
                 T,
                 H,
@@ -463,9 +509,9 @@ for id, strategy in enumerate(strategies):
                 num_warmups=num_warmups,
                 gen_strategy=strategy,
             )
-            cuda_results.append((time_ms_cuda, gflops, gbps, perc))
+            silu_v2_results.append((time_ms_silu_v2, gflops, gbps, perc))

-            # Baseline results
+            # Triton kernel results
             time_ms_triton, gflops, gbps, perc = benchmark(
                 silu_mul_fp8_quant_deep_gemm_triton,
                 E,
@@ -476,12 +522,20 @@ for id, strategy in enumerate(strategies):
                 num_warmups=num_warmups,
                 gen_strategy=strategy,
             )
-            baseline_results.append((time_ms_triton, gflops, gbps, perc))
-            ratios.append(time_ms_triton / time_ms_cuda)
-
-            print(f"Completed: {config_label}")
-    all_cuda_results.append(cuda_results)
-    all_baseline_results.append(baseline_results)
+            triton_results.append((time_ms_triton, gflops, gbps, perc))
+
+            # Calculate speedup ratios (triton baseline / implementation)
+            triton_v2_ratio = time_ms_triton / time_ms_silu_v2
+            ratios.append(triton_v2_ratio)
+
+            print(
+                f"Completed: {config_label}:"
+                f" V2: {time_ms_silu_v2:.3f}ms,"
+                f" Triton: {time_ms_triton:.3f}ms"
+            )
+
+        all_silu_v2_results.append(silu_v2_results)
+        all_triton_results.append(triton_results)
     all_ratios.append(ratios)

     # Store results for combined plotting
@@ -489,8 +543,8 @@ for id, strategy in enumerate(strategies):
         (
             strategy_descriptions[strategy],
             all_ratios,
-            all_cuda_results,
-            all_baseline_results,
+            all_silu_v2_results,
+            all_triton_results,
             config_labels,
             config_x_axis,
         )
@@ -498,15 +552,18 @@ for id, strategy in enumerate(strategies):

     # Print summary table for this strategy
     print(f"\nSummary Table - {strategy_descriptions[strategy]}:")
-    print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}")
-    print("-" * 60)
+    print(f" {'V2 Time(ms)':<12} {'Triton Time(ms)':<14} {'Triton/V2':<10}")
+    print("-" * 90)

     for i, (E, T, H) in enumerate(configs):
-        speedup = baseline_results[i][0] / cuda_results[i][0]
+        # Get the first result for each config (simplifying for summary)
+        v2_time = silu_v2_results[i][0]
+        triton_time = triton_results[i][0]
+        triton_v2_speedup = triton_time / v2_time
         config_label = f"E={E:3d},T={T:4d},H={H:4d}"
         print(
-            f"{config_label:<20} {cuda_results[i][0]:8.5f} "
-            f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x"
+            f"{config_label:<20} {v2_time:8.5f} {triton_time:10.5f} "
+            f"{triton_v2_speedup:8.2f}x"
         )
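Each `benchmark(...)` call returns `(time_ms, gflops, gbps, perc)`, and it is `result[3]`, the achieved share of peak memory bandwidth, that the combined plots consume. The byte accounting below is a plausible reconstruction for this op, not the benchmark's exact formula, and the peak figure is an assumed H100-class HBM3 value:

def bandwidth_utilization(
    time_ms: float,
    total_tokens: int,
    H: int,
    group_size: int = 128,
    peak_gbps: float = 3350.0,  # assumed peak for an H100-class GPU
) -> float:
    # Read 2*H bf16 inputs (2 bytes each), write H fp8 outputs (1 byte)
    # plus one fp32 scale (4 bytes) per group of group_size channels.
    bytes_moved = total_tokens * (2 * H * 2 + H + (H // group_size) * 4)
    achieved_gbps = bytes_moved / (time_ms * 1e-3) / 1e9
    return 100.0 * achieved_gbps / peak_gbps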
@@ -514,15 +571,14 @@ def create_total_tokens_plot(all_results):
     num_strategies = len(all_results)
     num_configs = len(configs)

-    # Create side-by-side subplots: 2 columns for speedup and bandwidth percentage
     fig, axs = plt.subplots(
-        num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies)
+        num_strategies, num_configs * 2, figsize=(32, 8 * num_strategies)
     )

     # Add main title to the entire figure
     fig.suptitle(
-        "Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)",
-        fontsize=16,
+        "Performance Analysis: Speedup vs Bandwidth Utilization (SiLU V2, and Triton)",
+        fontsize=18,
         fontweight="bold",
         y=0.98,
     )
@@ -539,8 +595,8 @@ def create_total_tokens_plot(all_results):
         (
             strategy_name,
             all_ratios,
-            all_cuda_results,
-            all_baseline_results,
+            all_silu_v2_results,
+            all_triton_results,
             config_labels,
             config_x_axis,
         ) = result
@@ -555,42 +611,54 @@ def create_total_tokens_plot(all_results):
             ratios = all_ratios[config_idx]
             total_tokens_values = config_x_axis[config_idx]

-            # Extract CUDA and Triton bandwidth percentages
-            cuda_bandwidth_percentages = [
-                result[3] for result in all_cuda_results[config_idx]
+            # Extract speedup ratios
+            triton_v2_ratios = [ratio for ratio in ratios]
+
+            # Extract bandwidth percentages for all implementations
+            v2_bandwidth_percentages = [
+                result[3] for result in all_silu_v2_results[config_idx]
             ]
             triton_bandwidth_percentages = [
-                result[3] for result in all_baseline_results[config_idx]
+                result[3] for result in all_triton_results[config_idx]
             ]

             # Plot speedup ratios vs total tokens (left plot)
             ax_speedup.plot(
-                total_tokens_values, ratios, "bo-", linewidth=3, markersize=8
+                total_tokens_values,
+                triton_v2_ratios,
+                "go-",
+                linewidth=3,
+                markersize=8,
+                label="Triton/V2 Speedup",
             )
             ax_speedup.set_title(
-                f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}",
+                f"{strategy_name}\nSpeedup vs Baseline (Triton)\nE={E}, T={T}, H={H}",
                 fontsize=12,
                 fontweight="bold",
             )
             ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
             ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11)
+            ax_speedup.legend(prop={"weight": "bold"})
             ax_speedup.grid(True, alpha=0.3)

+            # Plot bandwidth utilization (right plot)
             ax_bandwidth.plot(
                 total_tokens_values,
-                cuda_bandwidth_percentages,
-                "ro-",
+                v2_bandwidth_percentages,
+                "o-",
                 linewidth=3,
                 markersize=8,
-                label="CUDA",
+                label="SiLU V2",
+                color="blue",
             )
             ax_bandwidth.plot(
                 total_tokens_values,
                 triton_bandwidth_percentages,
-                "go-",
+                "o-",
                 linewidth=3,
                 markersize=8,
                 label="Triton",
+                color="green",
             )
             ax_bandwidth.set_title(
                 f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}",
@@ -618,38 +686,12 @@ def create_total_tokens_plot(all_results):
             for label in ax.get_xticklabels() + ax.get_yticklabels():
                 label.set_fontweight("bold")

-            # Add value labels on speedup points
-            for x, y in zip(total_tokens_values, ratios):
+            # Add value labels on Triton/V2 speedup points
+            for x, y in zip(total_tokens_values, triton_v2_ratios):
                 ax_speedup.annotate(
                     f"{y:.2f}x",
                     (x, y),
                     textcoords="offset points",
-                    xytext=(0, 12),
-                    ha="center",
-                    fontsize=10,
-                    fontweight="bold",
-                    bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7),
-                )
-
-            # Add value labels on CUDA bandwidth points
-            for x, y in zip(total_tokens_values, cuda_bandwidth_percentages):
-                ax_bandwidth.annotate(
-                    f"{y:.1f}%",
-                    (x, y),
-                    textcoords="offset points",
-                    xytext=(0, 12),
-                    ha="center",
-                    fontsize=9,
-                    fontweight="bold",
-                    bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3),
-                )
-
-            # Add value labels on Triton bandwidth points
-            for x, y in zip(total_tokens_values, triton_bandwidth_percentages):
-                ax_bandwidth.annotate(
-                    f"{y:.1f}%",
-                    (x, y),
-                    textcoords="offset points",
                     xytext=(0, -15),
                     ha="center",
                     fontsize=9,
@@ -659,17 +701,20 @@ def create_total_tokens_plot(all_results):

     plt.tight_layout()
     plt.subplots_adjust(top=0.93)  # Make room for main title
-    filename = "silu_benchmark_total_tokens.png"
+    filename = "silu_benchmark_total_tokens_3way.png"
     plt.savefig(filename, dpi=300, bbox_inches="tight")
     plt.show()

     return filename


-# Create combined plot with all strategies
-combined_plot_filename = create_total_tokens_plot(all_results)
+# Create comprehensive 3-way comparison plots
+combined_plot_filename = create_combined_plot(all_results)
+total_tokens_plot_filename = create_total_tokens_plot(all_results)

-print(f"\n{'=' * 60}")
-print("Benchmark Complete!")
-print(f"Generated combined plot: {combined_plot_filename}")
-print(f"{'=' * 60}")
+print(f"\n{'=' * 80}")
+print("3-Way Benchmark Suite Complete!")
+print(f"Generated combined comparison plot: {combined_plot_filename}")
+print(f"Generated total tokens analysis plot: {total_tokens_plot_filename}")
+print("Compared: SiLU V2 (CUDA), and Triton implementations")
+print(f"{'=' * 80}")
@@ -4,12 +4,11 @@
 import csv
 import os
 from datetime import datetime
-from typing import Optional

 import flashinfer
 import torch

-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up

 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 FP8_DTYPE = torch.float8_e4m3fn
@@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
 @torch.no_grad()
 def benchmark_decode(
     dtype: torch.dtype,
-    quant_dtypes: tuple[
-        Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-    ],
+    quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
     batch_size: int,
     max_seq_len: int,
     num_heads: tuple[int, int] = (64, 8),
@@ -4,12 +4,11 @@
 import csv
 import os
 from datetime import datetime
-from typing import Optional

 import flashinfer
 import torch

-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up

 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 FP8_DTYPE = torch.float8_e4m3fn
@@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
 @torch.no_grad()
 def benchmark_prefill(
     dtype: torch.dtype,
-    quant_dtypes: tuple[
-        Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-    ],
+    quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
     batch_size: int,
     max_seq_len: int,
     num_heads: tuple[int, int] = (64, 8),
@@ -14,11 +14,11 @@ import torch
 from tqdm import tqdm

 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    _w8a8_block_fp8_matmul,
+    _w8a8_triton_block_scaled_mm,
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 mp.set_start_method("spawn", force=True)

@@ -83,7 +83,7 @@ def w8a8_block_matmul(
     )

     if A.dtype == torch.float8_e4m3fn:
-        kernel = _w8a8_block_fp8_matmul
+        kernel = _w8a8_triton_block_scaled_mm
     else:
         raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.")
@@ -11,7 +11,7 @@ import regex as re
 import seaborn as sns
 from torch.utils.benchmark import Measurement as TMeasurement

-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 if __name__ == "__main__":
     parser = FlexibleArgumentParser(
@@ -2,8 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import dataclasses
-from collections.abc import Iterable
-from typing import Any, Callable, Optional
+from collections.abc import Callable, Iterable
+from typing import Any

 import torch
 import torch.utils.benchmark as TBenchmark
@@ -55,7 +55,7 @@ class Bench:

     def __init__(
         self,
-        cuda_graph_params: Optional[CudaGraphBenchParams],
+        cuda_graph_params: CudaGraphBenchParams | None,
         label: str,
         sub_label: str,
         description: str,
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from statistics import mean
-from typing import Any, NamedTuple, Optional, Union
+from typing import Any, NamedTuple

 import numpy as np  # type: ignore
 import pandas as pd  # type: ignore
@@ -35,8 +35,8 @@ class Distribution(ABC):
 class UniformDistribution(Distribution):
     def __init__(
         self,
-        min_val: Union[int, float],
-        max_val: Union[int, float],
+        min_val: int | float,
+        max_val: int | float,
         is_integer: bool = True,
     ) -> None:
         self.min_val = min_val
@@ -56,7 +56,7 @@ class UniformDistribution(Distribution):


 class ConstantDistribution(Distribution):
-    def __init__(self, value: Union[int, float]) -> None:
+    def __init__(self, value: int | float) -> None:
         self.value = value
         self.max_val = value

@@ -68,7 +68,7 @@ class ConstantDistribution(Distribution):


 class ZipfDistribution(Distribution):
-    def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+    def __init__(self, alpha: float, max_val: int | None = None) -> None:
         self.alpha = alpha
         self.max_val = max_val

@@ -83,7 +83,7 @@ class ZipfDistribution(Distribution):


 class PoissonDistribution(Distribution):
-    def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+    def __init__(self, alpha: float, max_val: int | None = None) -> None:
         self.alpha = alpha
         self.max_val = max_val

@@ -100,11 +100,11 @@ class PoissonDistribution(Distribution):
 class LognormalDistribution(Distribution):
     def __init__(
         self,
-        mean: Optional[float] = None,
-        sigma: Optional[float] = None,
-        average: Optional[int] = None,
-        median_ratio: Optional[float] = None,
-        max_val: Optional[int] = None,
+        mean: float | None = None,
+        sigma: float | None = None,
+        average: int | None = None,
+        median_ratio: float | None = None,
+        max_val: int | None = None,
     ) -> None:
         self.average = average
         self.median_ratio = median_ratio
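These distribution classes shape the synthetic workload (input lengths, turns per conversation, and so on). Their sampling reduces to numpy generators with an optional `max_val` clip; the sketch below is illustrative of that pattern rather than a copy of the classes' methods, which this compare does not show:

import numpy as np

rng = np.random.default_rng(0)
max_val = 4096

# Zipf-distributed integers, clipped to max_val.
zipf_samples = np.minimum(rng.zipf(2.0, size=8), max_val)

# Lognormal samples driven by underlying mean/sigma, also clipped.
lognormal_samples = np.minimum(rng.lognormal(6.0, 0.8, size=8).astype(int), max_val)

print(zipf_samples, lognormal_samples)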
@@ -13,7 +13,7 @@ from datetime import datetime
 from enum import Enum
 from http import HTTPStatus
 from statistics import mean
-from typing import NamedTuple, Optional, Union
+from typing import NamedTuple

 import aiohttp  # type: ignore
 import numpy as np  # type: ignore
@@ -46,9 +46,9 @@ class ConversationSampling(str, Enum):

 class ClientArgs(NamedTuple):
     seed: int
-    max_num_requests: Optional[int]
+    max_num_requests: int | None
     skip_first_turn: bool
-    max_turns: Optional[int]
+    max_turns: int | None
     max_active_conversations: int
     verbose: bool
     print_content: bool
@@ -109,9 +109,9 @@ class RequestStats(NamedTuple):

 class MetricStats:
     def __init__(self) -> None:
-        self.min: Optional[float] = None
-        self.max: Optional[float] = None
-        self.avg: Optional[float] = None
+        self.min: float | None = None
+        self.max: float | None = None
+        self.avg: float | None = None
         self.sum = 0.0
         self.count = 0

@@ -143,7 +143,7 @@ class MovingAverage:
         self.index = 0
         self.sum = 0.0
         self.count = 0
-        self.avg: Optional[float] = None
+        self.avg: float | None = None

     def update(self, new_value: float) -> None:
         if self.count < self.window_size:
@@ -169,7 +169,7 @@ class MovingAverage:
 class DebugStats:
     def __init__(self, logger: logging.Logger, window_size: int) -> None:
         self.logger = logger
-        self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
+        self.metrics: dict[str, MovingAverage | MetricStats] = {
             "moving_avg_ttft_ms": MovingAverage(window_size),
             "moving_avg_tpot_ms": MovingAverage(window_size),
             "ttft_ms": MetricStats(),
@@ -198,14 +198,6 @@ class DebugStats:
         self.logger.info("-" * 50)


-# Must support Python 3.8, we can't use str.removeprefix(prefix)
-# introduced in Python 3.9
-def remove_prefix(text: str, prefix: str) -> str:
-    if text.startswith(prefix):
-        return text[len(prefix) :]
-    return text
-
-
 def nanosec_to_millisec(value: float) -> float:
     return value / 1000000.0

@@ -220,8 +212,8 @@ async def send_request(
     chat_url: str,
     model: str,
     stream: bool = True,
-    min_tokens: Optional[int] = None,
-    max_tokens: Optional[int] = None,
+    min_tokens: int | None = None,
+    max_tokens: int | None = None,
 ) -> ServerResponse:
     payload = {
         "model": model,
@@ -250,9 +242,9 @@ async def send_request(
     timeout = aiohttp.ClientTimeout(total=timeout_sec)

     valid_response = True
-    ttft: Optional[float] = None
+    ttft: float | None = None
     chunk_delay: list[int] = []
-    latency: Optional[float] = None
+    latency: float | None = None
     first_chunk = ""
     generated_text = ""

@@ -269,7 +261,7 @@ async def send_request(
             if not chunk_bytes:
                 continue

-            chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+            chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
             if chunk == "[DONE]":
                 # End of stream
                 latency = time.perf_counter_ns() - start_time
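With Python 3.8 out of the support window, the hand-rolled `remove_prefix` helper deleted above gives way to `str.removeprefix`, available since Python 3.9. The two behave identically, including the no-op case when the prefix is absent:

def remove_prefix(text: str, prefix: str) -> str:
    # The deleted helper, reproduced only for comparison.
    if text.startswith(prefix):
        return text[len(prefix):]
    return text

line = 'data: {"choices": []}'
assert line.removeprefix("data: ") == remove_prefix(line, "data: ")
assert "[DONE]".removeprefix("data: ") == "[DONE]"  # prefix absent: unchanged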
@@ -364,7 +356,7 @@ async def send_turn(
     req_args: RequestArgs,
     verbose: bool,
     verify_output: bool,
-) -> Optional[RequestStats]:
+) -> RequestStats | None:
     assert messages_to_use > 0
     assert messages_to_use <= len(conversation_messages)

@@ -644,7 +636,7 @@ async def client_main(

     if args.verbose:
         curr_time_sec: float = time.perf_counter()
-        time_since_last_turn: Union[str, float] = "N/A"
+        time_since_last_turn: str | float = "N/A"
         if conv_id in time_of_last_turn:
             time_since_last_turn = round(
                 curr_time_sec - time_of_last_turn[conv_id], 3
@@ -769,7 +761,7 @@ def get_client_config(
         "Number of conversations must be equal or larger than the number of clients"
     )

-    max_req_per_client: Optional[int] = None
+    max_req_per_client: int | None = None
     if args.max_num_requests is not None:
         # Max number of requests per client
         req_per_client = args.max_num_requests // args.num_clients
@@ -936,13 +928,13 @@ async def main_mp(
         f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}"  # noqa: E501
     )

-    rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
+    rps: str | float = round(len(client_metrics) / runtime_sec, 3)
     if len(client_metrics) < (5 * bench_args.num_clients):
         # Do not estimate the RPS if the number of samples is very low
         # (threshold can be tuned if needed)
         rps = "N/A"

-    runtime_left_sec: Union[str, float] = round(
+    runtime_left_sec: str | float = round(
         (runtime_sec / finished_convs) * (total_convs - finished_convs), 3
     )
     if percent < 0.05:
@@ -1032,7 +1024,7 @@ def process_statistics(
     warmup_percentages: list[float],
     test_params: dict,
     verbose: bool,
-    gen_conv_args: Optional[GenConvArgs] = None,
+    gen_conv_args: GenConvArgs | None = None,
     excel_output: bool = False,
 ) -> None:
     if len(client_metrics) == 0:
@@ -1259,7 +1251,7 @@ async def main() -> None:
         default=None,
         help="The model name used in the API. "
         "If not specified, the model name will be the "
-        "same as the ``--model`` argument. ",
+        "same as the `--model` argument. ",
     )

     parser.add_argument(
@@ -13,7 +13,7 @@ import argparse
 import json
 import random
 from statistics import mean
-from typing import Any, Optional
+from typing import Any

 import pandas as pd  # type: ignore
 import tqdm  # type: ignore
@@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:


 def content_is_valid(
-    content: str, min_content_len: Optional[int], max_content_len: Optional[int]
+    content: str, min_content_len: int | None, max_content_len: int | None
 ) -> bool:
     if min_content_len and len(content) < min_content_len:
         return False
@@ -37,7 +37,7 @@ def content_is_valid(


 def print_stats(
-    conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
+    conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
 ) -> None:
     # Collect statistics
     stats = []
@@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
     seed: int,
     input_file: str,
     output_file: str,
-    max_items: Optional[int],
-    min_content_len: Optional[int] = None,
-    max_content_len: Optional[int] = None,
-    min_turns: Optional[int] = None,
-    max_turns: Optional[int] = None,
-    model: Optional[str] = None,
+    max_items: int | None,
+    min_content_len: int | None = None,
+    max_content_len: int | None = None,
+    min_turns: int | None = None,
+    max_turns: int | None = None,
+    model: str | None = None,
 ) -> None:
     if min_turns and max_turns:
         assert min_turns <= max_turns
|
|||||||
import pstats
|
import pstats
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||||
|
|
||||||
# A very long prompt, total number of tokens is about 15k.
|
# A very long prompt, total number of tokens is about 15k.
|
||||||
LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000
|
LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000
|
||||||
|
|||||||
@@ -188,34 +188,79 @@ else()
     message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
 endif()

-#
-# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms)
-# Flag to enable ACL kernels for AARCH64 platforms
-if (VLLM_BUILD_ACL STREQUAL "ON")
-    set(USE_ACL ON)
-else()
-    set(USE_ACL OFF)
-endif()
-
+# Build oneDNN for GEMM kernels (only for x86-AVX512 /ARM platforms)
 if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
-    FetchContent_Declare(
-        oneDNN
-        GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-        GIT_TAG v3.9
-        GIT_PROGRESS TRUE
-        GIT_SHALLOW TRUE
-    )
-
-    if(USE_ACL)
-        find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
-        if(NOT ARM_COMPUTE_LIBRARY)
-            message(FATAL_ERROR "Could not find ARM Compute Library: please set ACL_ROOT_DIR")
+    # Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64
+    # TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN
+    if(ASIMD_FOUND)
+        if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")
+            message(STATUS "Using ACL from specified source directory: $ENV{ACL_ROOT_DIR}")
+        else()
+            message(STATUS "Downloading Arm Compute Library (ACL) from GitHub")
+            FetchContent_Populate(arm_compute
+                SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild"
+                SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-src"
+                GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git
+                GIT_TAG v52.2.0
+                GIT_SHALLOW TRUE
+                GIT_PROGRESS TRUE
+            )
+            set(ENV{ACL_ROOT_DIR} "${arm_compute_SOURCE_DIR}")
         endif()
+
+        # Build ACL with scons
+        include(ProcessorCount)
+        ProcessorCount(_NPROC)
+        set(_scons_cmd
+            scons -j${_NPROC}
+            Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
+            arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
+            multi_isa=1 openmp=1 cppthreads=0
+        )
+
+        # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
+        # and create a local shim dir with it
+        include("${CMAKE_CURRENT_LIST_DIR}/utils.cmake")
+        vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
+
+        if(NOT VLLM_TORCH_GOMP_SHIM_DIR STREQUAL "")
+            list(APPEND _scons_cmd extra_link_flags=-L${VLLM_TORCH_GOMP_SHIM_DIR})
+        endif()
+
+        execute_process(
+            COMMAND ${_scons_cmd}
+            WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
+            RESULT_VARIABLE _acl_rc
+        )
+        if(NOT _acl_rc EQUAL 0)
+            message(FATAL_ERROR "ACL SCons build failed (exit ${_acl_rc}).")
+        endif()
+
         set(ONEDNN_AARCH64_USE_ACL "ON")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
         add_compile_definitions(VLLM_USE_ACL)
     endif()

+    set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
+
+    if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
+        message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}")
+        FetchContent_Declare(
+            oneDNN
+            SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN}
+        )
+    else()
+        message(STATUS "Downloading oneDNN from GitHub")
+        FetchContent_Declare(
+            oneDNN
+            GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
+            GIT_TAG v3.9
+            GIT_PROGRESS TRUE
+            GIT_SHALLOW TRUE
+        )
+    endif()
+
     set(ONEDNN_LIBRARY_TYPE "STATIC")
     set(ONEDNN_BUILD_DOC "OFF")
     set(ONEDNN_BUILD_EXAMPLES "OFF")
@@ -227,7 +272,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
     set(ONEDNN_ENABLE_ITT_TASKS "OFF")
     set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
     set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
-    set(ONEDNN_VERBOSE "ON")
+    set(ONEDNN_VERBOSE "OFF")
     set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)

     FetchContent_MakeAvailable(oneDNN)
@@ -309,4 +354,4 @@ define_gpu_extension_target(
     WITH_SOABI
 )

 message(STATUS "Enabling C extension.")
@@ -19,7 +19,7 @@ else()
     FetchContent_Declare(
         flashmla
         GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
-        GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
+        GIT_TAG 46d64a8ebef03fa50b4ae74937276a5c940e3f95
         GIT_PROGRESS TRUE
         CONFIGURE_COMMAND ""
         BUILD_COMMAND ""
@@ -66,6 +66,7 @@ if(FLASH_MLA_ARCHS)
         ${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
         ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
         ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
+        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_metadata.cu
     )

     set(FlashMLA_INCLUDES
cmake/external_projects/qutlass.cmake (new file, +97 lines)
@@ -0,0 +1,97 @@
+include(FetchContent)
+
+set(CUTLASS_INCLUDE_DIR "${CUTLASS_INCLUDE_DIR}" CACHE PATH "Path to CUTLASS include/ directory")
+
+if(DEFINED ENV{QUTLASS_SRC_DIR})
+    set(QUTLASS_SRC_DIR $ENV{QUTLASS_SRC_DIR})
+endif()
+
+if(QUTLASS_SRC_DIR)
+    FetchContent_Declare(
+        qutlass
+        SOURCE_DIR ${QUTLASS_SRC_DIR}
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND ""
+    )
+else()
+    FetchContent_Declare(
+        qutlass
+        GIT_REPOSITORY https://github.com/IST-DASLab/qutlass.git
+        GIT_TAG 830d2c4537c7396e14a02a46fbddd18b5d107c65
+        GIT_PROGRESS TRUE
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND ""
+    )
+endif()
+
+FetchContent_Populate(qutlass)
+
+if(NOT qutlass_SOURCE_DIR)
+    message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
+endif()
+message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}")
+
+cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a" "${CUDA_ARCHS}")
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND QUTLASS_ARCHS)
+
+    if(QUTLASS_ARCHS MATCHES "10\\.0a")
+        set(QUTLASS_TARGET_CC 100)
+    elseif(QUTLASS_ARCHS MATCHES "12\\.0a")
+        set(QUTLASS_TARGET_CC 120)
+    else()
+        message(FATAL_ERROR "[QUTLASS] internal error parsing CUDA_ARCHS='${QUTLASS_ARCHS}'.")
+    endif()
+
+    set(QUTLASS_SOURCES
+        ${qutlass_SOURCE_DIR}/qutlass/csrc/bindings.cpp
+        ${qutlass_SOURCE_DIR}/qutlass/csrc/gemm.cu
+        ${qutlass_SOURCE_DIR}/qutlass/csrc/gemm_ada.cu
+        ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx.cu
+        ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv.cu
+        ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx_sm100.cu
+        ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv_sm100.cu
+    )
+
+    set(QUTLASS_INCLUDES
+        ${qutlass_SOURCE_DIR}
+        ${qutlass_SOURCE_DIR}/qutlass
+        ${qutlass_SOURCE_DIR}/qutlass/csrc/include
+        ${qutlass_SOURCE_DIR}/qutlass/csrc/include/cutlass_extensions
+    )
+
+    if(CUTLASS_INCLUDE_DIR AND EXISTS "${CUTLASS_INCLUDE_DIR}/cutlass/cutlass.h")
+        list(APPEND QUTLASS_INCLUDES "${CUTLASS_INCLUDE_DIR}")
+    elseif(EXISTS "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include/cutlass/cutlass.h")
+        list(APPEND QUTLASS_INCLUDES "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include")
+        message(STATUS "[QUTLASS] Using QuTLASS vendored CUTLASS headers (no vLLM CUTLASS detected).")
+    else()
+        message(FATAL_ERROR "[QUTLASS] CUTLASS headers not found. "
+                            "Set -DCUTLASS_INCLUDE_DIR=/path/to/cutlass/include")
+    endif()
+
+    set_gencode_flags_for_srcs(
+        SRCS "${QUTLASS_SOURCES}"
+        CUDA_ARCHS "${QUTLASS_ARCHS}"
+    )
+
+    target_sources(_C PRIVATE ${QUTLASS_SOURCES})
+    target_include_directories(_C PRIVATE ${QUTLASS_INCLUDES})
+    target_compile_definitions(_C PRIVATE
+        QUTLASS_DISABLE_PYBIND=1
+        TARGET_CUDA_ARCH=${QUTLASS_TARGET_CC}
+    )
+
+    set_property(SOURCE ${QUTLASS_SOURCES} APPEND PROPERTY COMPILE_OPTIONS
+        $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr --use_fast_math -O3>
+    )
+
+else()
+    if("${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS "12.8")
+        message(STATUS
+            "[QUTLASS] Skipping build: CUDA 12.8 or newer is required (found ${CMAKE_CUDA_COMPILER_VERSION}).")
+    else()
+        message(STATUS
+            "[QUTLASS] Skipping build: no supported arch (12.0a / 10.0a) found in "
+            "CUDA_ARCHS='${CUDA_ARCHS}'.")
+    endif()
+endif()
@@ -38,7 +38,7 @@ else()
     FetchContent_Declare(
         vllm-flash-attn
         GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-        GIT_TAG 4695e6bed5366c41e28c06cd86170166e4f43d00
+        GIT_TAG a893712401d70362fbb299cd9c4b3476e8e9ed54
         GIT_PROGRESS TRUE
         # Don't share the vllm-flash-attn build between build types
         BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
@@ -129,6 +129,44 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
   set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE)
 endfunction()
 
+# Find libgomp that gets shipped with PyTorch wheel and create a shim dir with:
+#   libgomp.so   -> libgomp-<hash>.so...
+#   libgomp.so.1 -> libgomp-<hash>.so...
+# OUTPUT: TORCH_GOMP_SHIM_DIR ("" if not found)
+function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
+  set(${TORCH_GOMP_SHIM_DIR} "" PARENT_SCOPE)
+
+  # Use run_python to locate vendored libgomp; never throw on failure.
+  run_python(_VLLM_TORCH_GOMP_PATH
+    "
+import os, glob
+try:
+    import torch
+    torch_pkg = os.path.dirname(torch.__file__)
+    site_root = os.path.dirname(torch_pkg)
+    torch_libs = os.path.join(site_root, 'torch.libs')
+    print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
+except:
+    print('')
+"
+    "failed to probe torch.libs for libgomp")
+
+  if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
+    return()
+  endif()
+
+  # Create shim under the build tree
+  set(_shim "${CMAKE_BINARY_DIR}/gomp_shim")
+  file(MAKE_DIRECTORY "${_shim}")
+
+  execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so.1")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so.1")
+
+  set(${TORCH_GOMP_SHIM_DIR} "${_shim}" PARENT_SCOPE)
+endfunction()
+
 # Macro for converting a `gencode` version number to a cmake version number.
 macro(string_to_ver OUT_VER IN_STR)
   string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
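For readers outside CMake, the shim logic amounts to: find the vendored libgomp-<hash>.so inside torch.libs, then publish it under the stable names libgomp.so and libgomp.so.1 so the linker can resolve them. A rough C++17 equivalent of the same probe-and-symlink steps (a sketch only; the site-packages path is an assumed example, and vLLM does this in CMake, not C++):

// Minimal C++17 sketch of what vllm_prepare_torch_gomp_shim does.
// The torch.libs location below is an assumption for illustration.
#include <filesystem>
#include <iostream>
#include <string>

namespace fs = std::filesystem;

int main() {
  const fs::path torch_libs =
      "/usr/lib/python3.12/site-packages/torch.libs";  // assumed location
  fs::path vendored;
  if (fs::exists(torch_libs)) {
    for (const auto& entry : fs::directory_iterator(torch_libs)) {
      const std::string name = entry.path().filename().string();
      if (name.rfind("libgomp-", 0) == 0) {  // starts with "libgomp-"
        vendored = entry.path();
        break;
      }
    }
  }
  if (vendored.empty()) return 0;  // mirror the "never throw on failure" policy

  const fs::path shim = fs::temp_directory_path() / "gomp_shim";
  fs::create_directories(shim);
  // Recreate the stable symlink names the linker expects.
  for (const char* link_name : {"libgomp.so", "libgomp.so.1"}) {
    fs::remove(shim / link_name);
    fs::create_symlink(vendored, shim / link_name);
  }
  std::cout << "shim dir: " << shim << "\n";
}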
codecov.yml (new file, 12 lines)
@@ -0,0 +1,12 @@
+codecov:
+  require_ci_to_pass: false
+
+fixes:
+  # Map source code paths to repository root paths
+  # Wildcards match any Python version (python3.*)
+  - "/vllm-workspace/src/vllm/::vllm/"
+  - "/vllm-workspace/vllm/::vllm/"
+  - "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/"
+  - "/usr/local/lib/python3.*/site-packages/vllm/::vllm/"
+  - "/usr/lib/python3.*/dist-packages/vllm/::vllm/"
+  - "/usr/lib/python3.*/site-packages/vllm/::vllm/"
@@ -28,10 +28,10 @@
 
 #ifdef USE_ROCM
   #include <hip/hip_bf16.h>
-  #include "../quantization/fp8/amd/quant_utils.cuh"
+  #include "../quantization/w8a8/fp8/amd/quant_utils.cuh"
 typedef __hip_bfloat16 __nv_bfloat16;
 #else
-  #include "../quantization/fp8/nvidia/quant_utils.cuh"
+  #include "../quantization/w8a8/fp8/nvidia/quant_utils.cuh"
 #endif
 
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -125,32 +125,37 @@ public:
   }
 
   static void set_split_kv (KernelArguments& args) {
-    // printf("set_split_kv start");
     if (args.split_kv >= 1) return;
     auto [H, K, D, B] = args.problem_shape;
-    // std::cout << H << " " << K << " " << D << " " << B << "\n";
     int sm_count = args.hw_info.sm_count;
-    // printf(" sm_count = %d\n", sm_count);
-    int max_splits = ceil_div(K, 128);
-    max_splits = min(16, max_splits);
-
-    // TODO: This avoids a hang when the batch size larger than 1 and
-    // there is more than 1 kv_splits.
-    // Discuss with NVIDIA how this can be fixed.
-    if (B > 1) {
-      max_splits = min(1, max_splits);
+    float seq_length_k = static_cast<float>(K) / 1024.0f;
+    int max_splits = 1;
+
+    if (B <= 4 && seq_length_k >= 16) {
+      max_splits = 16;
+    }
+    else if (B <= 8 && seq_length_k >= 4) {
+      max_splits = 8;
+    }
+    else if ((B <= 16 && seq_length_k >= 8) ||
+             (B == 48 && seq_length_k >= 32)) {
+      max_splits = 4;
+    }
+    else if ((B <= 32 && seq_length_k >= 16) ||
+             (B == 96 && seq_length_k >= 16)) {
+      max_splits = 2;
+    }
+    else {
+      max_splits = 1;
     }
-    // printf(" max_splits = %d\n", max_splits);
+
+    // Wave-aware scheduling: ensure integer number of waves in K dimension
     int sms_per_batch = max(1, sm_count / B);
-    // printf(" sms_per_batch = %d\n", sms_per_batch);
     int split_heur = min(max_splits, sms_per_batch);
     int waves = ceil_div(B * split_heur, sm_count);
     int k_waves = ceil_div(max_splits, split_heur);
     int split_wave_aware = ceil_div(max_splits, k_waves);
     args.split_kv = split_wave_aware;
-    // printf(" args.split_kv = %d\n", args.split_kv);
-
   }
 
   /// Determines whether the GEMM can execute the given problem.
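For intuition, the replacement heuristic can be exercised off-device. The standalone C++ sketch below (not from the diff; ceil_div is re-implemented here, and the SM count of 132 is an assumed, H100-like value) reproduces the split selection for a few batch sizes:

// Standalone replay of the split-KV heuristic above (illustration only).
#include <algorithm>
#include <cstdio>

static int ceil_div(int a, int b) { return (a + b - 1) / b; }

// B = batch size, K = KV sequence length, sm_count = number of SMs.
static int pick_split_kv(int B, int K, int sm_count) {
  float seq_length_k = static_cast<float>(K) / 1024.0f;
  int max_splits = 1;
  if (B <= 4 && seq_length_k >= 16) max_splits = 16;
  else if (B <= 8 && seq_length_k >= 4) max_splits = 8;
  else if ((B <= 16 && seq_length_k >= 8) ||
           (B == 48 && seq_length_k >= 32)) max_splits = 4;
  else if ((B <= 32 && seq_length_k >= 16) ||
           (B == 96 && seq_length_k >= 16)) max_splits = 2;

  // Wave-aware rounding: spread the splits over an integer number of waves.
  int sms_per_batch = std::max(1, sm_count / B);
  int split_heur = std::min(max_splits, sms_per_batch);
  int k_waves = ceil_div(max_splits, split_heur);
  return ceil_div(max_splits, k_waves);
}

int main() {
  for (int B : {1, 8, 32}) {  // K = 32768 tokens, 132 SMs (assumed)
    printf("B=%d K=32768 -> split_kv=%d\n", B, pick_split_kv(B, 32768, 132));
  }
}

Small batches with long sequences get up to 16 splits so the KV dimension fills otherwise idle SMs; large batches already saturate the device and stay at 1-2 splits.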
@@ -64,3 +64,11 @@ void indexer_k_quant_and_cache(
     torch::Tensor& slot_mapping,  // [num_tokens]
     int64_t quant_block_size,     // quantization block size
     const std::string& scale_fmt);
+
+// Extract function to gather quantized K cache
+void cp_gather_indexer_k_quant_cache(
+    const torch::Tensor& kv_cache,  // [num_blocks, block_size, cache_stride]
+    torch::Tensor& dst_k,           // [num_tokens, head_dim]
+    torch::Tensor& dst_scale,  // [num_tokens, head_dim / quant_block_size * 4]
+    const torch::Tensor& block_table,  // [batch_size, num_blocks]
+    const torch::Tensor& cu_seq_lens); // [batch_size + 1]
@@ -9,9 +9,9 @@
 #include "quantization/vectorization_utils.cuh"
 
 #ifdef USE_ROCM
-  #include "quantization/fp8/amd/quant_utils.cuh"
+  #include "quantization/w8a8/fp8/amd/quant_utils.cuh"
 #else
-  #include "quantization/fp8/nvidia/quant_utils.cuh"
+  #include "quantization/w8a8/fp8/nvidia/quant_utils.cuh"
 #endif
 
 #include <algorithm>
@@ -572,6 +572,70 @@ __global__ void indexer_k_quant_and_cache_kernel(
   }
 }
 
+template <int BLOCK_Y_SIZE>
+__global__ void cp_gather_indexer_k_quant_cache_kernel(
+    const char* __restrict__ kv_cache,    // [num_blocks, block_size,
+                                          //  cache_stride]
+    char* __restrict__ dst_k,      // [num_tokens, head_dim]
+    char* __restrict__ dst_scale,  // [num_tokens, head_dim / quant_block_size
+                                   //  * 4]
+    const int* __restrict__ block_table,  // [batch_size, num_blocks]
+    const int* __restrict__ cu_seq_lens,  // [batch_size + 1]
+    const int batch_size,                 // batch size
+    const int64_t token_stride,           // stride for each token in dst_k
+    const int64_t head_dim,               // dimension of each head
+    const int64_t block_stride,        // stride for each block in kv_cache
+    const int64_t cache_token_stride,  // stride for each token in kv_cache
+    const int64_t cache_block_size,  // num_tokens for each block in kv_cache
+    const int num_blocks,            // number of blocks
+    const int num_tokens,            // number of tokens
+    const int quant_block_size       // quantization block size
+) {
+  constexpr int VEC_SIZE = sizeof(float4) / sizeof(char);
+  const int token_idx = blockIdx.x * blockDim.y + threadIdx.y;
+  const int head_idx = (blockIdx.y * blockDim.x + threadIdx.x) * VEC_SIZE;
+  // Find batch index within a block
+  __shared__ int batch_idx[BLOCK_Y_SIZE];
+  for (int iter = 0; iter < cuda_utils::ceil_div(batch_size, int(blockDim.x));
+       iter++) {
+    int tid = iter * blockDim.x + threadIdx.x;
+    if (tid < batch_size) {
+      const int seq_start = cu_seq_lens[tid];
+      const int seq_end = cu_seq_lens[tid + 1];
+      if (token_idx >= seq_start && token_idx < seq_end) {
+        batch_idx[threadIdx.y] = tid;
+      }
+    }
+  }
+
+#ifndef USE_ROCM
+  __syncwarp();
+#endif
+
+  if (head_idx >= head_dim || token_idx >= num_tokens) {
+    return;
+  }
+  const int inbatch_seq_idx = token_idx - cu_seq_lens[batch_idx[threadIdx.y]];
+  const int block_idx = block_table[batch_idx[threadIdx.y] * num_blocks +
+                                    inbatch_seq_idx / cache_block_size];
+  const int64_t src_block_offset = block_idx * block_stride;
+  const int64_t cache_inblock_offset =
+      (inbatch_seq_idx % cache_block_size) * head_dim + head_idx;
+  const int64_t src_inblock_offset = src_block_offset + cache_inblock_offset;
+  const int64_t dst_inblock_offset = token_idx * token_stride + head_idx;
+
+  reinterpret_cast<float4*>(dst_k)[dst_inblock_offset / VEC_SIZE] =
+      reinterpret_cast<const float4*>(kv_cache)[src_inblock_offset / VEC_SIZE];
+  if (threadIdx.x == 0) {
+    const int64_t src_scale_offset =
+        src_block_offset + cache_block_size * head_dim +
+        cache_inblock_offset * 4 / quant_block_size;
+    reinterpret_cast<float*>(dst_scale)[dst_inblock_offset / quant_block_size] =
+        reinterpret_cast<const float*>(kv_cache)[src_scale_offset / 4];
+  }
+}
+
 }  // namespace vllm
 
 // KV_T is the data type of key and value tensors.
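As a sanity check on the addressing math in the new kernel, the host-side C++ snippet below replays the same offset arithmetic for one token. It is illustrative only: the shapes are made up, and the block_stride formula assumes the per-block fp32 scales sit directly after the quantized K bytes, which is what the kernel's src_scale_offset computation implies.

// Host-side replay of the kernel's offset arithmetic (illustration only).
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical shapes for illustration.
  const int64_t head_dim = 128, cache_block_size = 64, quant_block_size = 32;
  // Per block: quantized K bytes, then one fp32 scale per quant block.
  const int64_t block_stride =
      cache_block_size * (head_dim + head_dim / quant_block_size * 4);
  const int inbatch_seq_idx = 70;  // token 70 of its sequence
  const int block_idx = 5;         // looked up from block_table
  const int head_idx = 16;         // first byte this thread copies (VEC_SIZE-aligned)

  const int64_t src_block_offset = block_idx * block_stride;
  const int64_t cache_inblock_offset =
      (inbatch_seq_idx % cache_block_size) * head_dim + head_idx;
  // Scales follow all K bytes of the block, 4 bytes per quant block.
  const int64_t src_scale_offset = src_block_offset +
                                   cache_block_size * head_dim +
                                   cache_inblock_offset * 4 / quant_block_size;
  printf("k byte offset = %lld, scale byte offset = %lld\n",
         (long long)(src_block_offset + cache_inblock_offset),
         (long long)src_scale_offset);
}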
@@ -1173,3 +1237,59 @@ void indexer_k_quant_and_cache(
   DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3",
                              CALL_INDEXER_K_QUANT_AND_CACHE);
 }
+
+// Macro to dispatch the kernel based on the data amount.
+#define CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(BLOCK_Y_SIZE)                   \
+  vllm::cp_gather_indexer_k_quant_cache_kernel<BLOCK_Y_SIZE>                 \
+      <<<dim3((num_tokens + BLOCK_Y_SIZE - 1) / BLOCK_Y_SIZE,                \
+              (head_dim + 8 * vec_size - 1) / (8 * vec_size)),               \
+         dim3(8, BLOCK_Y_SIZE), 0, stream>>>(                                \
+          reinterpret_cast<char*>(kv_cache.data_ptr()),                      \
+          reinterpret_cast<char*>(dst_k.data_ptr()),                         \
+          reinterpret_cast<char*>(dst_scale.data_ptr()),                     \
+          block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(),  \
+          batch_size, dst_k.stride(0), dst_k.size(1), kv_cache.stride(0),    \
+          kv_cache.stride(1), kv_cache.size(1), block_table.size(1),         \
+          num_tokens, quant_block_size);
+
+void cp_gather_indexer_k_quant_cache(
+    const torch::Tensor& kv_cache,  // [num_blocks, block_size, cache_stride]
+    torch::Tensor& dst_k,           // [num_tokens, head_dim]
+    torch::Tensor& dst_scale,  // [num_tokens, head_dim / quant_block_size * 4]
+    const torch::Tensor& block_table,  // [batch_size, num_blocks]
+    const torch::Tensor& cu_seq_lens   // [batch_size + 1]
+) {
+  int batch_size = block_table.size(0);
+  int num_tokens = dst_k.size(0);
+  int head_dim = dst_k.size(1);
+  int quant_block_size = head_dim * 4 / dst_scale.size(1);
+
+  TORCH_CHECK(kv_cache.device() == dst_k.device(),
+              "kv_cache and dst_k must be on the same device");
+  TORCH_CHECK(kv_cache.device() == dst_scale.device(),
+              "kv_cache and dst_scale must be on the same device");
+  TORCH_CHECK(kv_cache.device() == block_table.device(),
+              "kv_cache and block_table must be on the same device");
+  TORCH_CHECK(kv_cache.device() == cu_seq_lens.device(),
+              "kv_cache and cu_seq_lens must be on the same device");
+  TORCH_CHECK(head_dim % quant_block_size == 0,
+              "head_dim must be divisible by quant_block_size");
+
+  constexpr int vec_size = 16;
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_cache));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  if (num_tokens < 32) {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(1);
+  } else if (num_tokens < 64) {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(2);
+  } else if (num_tokens < 128) {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(4);
+  } else if (num_tokens < 256) {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(8);
+  } else if (num_tokens < 512) {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(16);
+  } else {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32);
+  }
+}
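The dispatch macro picks BLOCK_Y_SIZE (tokens per thread block) from num_tokens so that small gathers do not launch mostly-idle blocks. The ladder is equivalent to the small helper below (a standalone sketch, not vLLM code):

// The num_tokens -> BLOCK_Y_SIZE ladder from the host function, as a
// plain function (illustration only).
#include <cstdio>

static int pick_block_y(int num_tokens) {
  // Doubling the block height each time num_tokens crosses 32 * size
  // reproduces the if/else chain: <32 -> 1, <64 -> 2, ..., <512 -> 16.
  for (int size = 1; size <= 16; size *= 2) {
    if (num_tokens < 32 * size) return size;
  }
  return 32;
}

int main() {
  for (int n : {10, 100, 1000}) {
    printf("num_tokens=%4d -> BLOCK_Y_SIZE=%d (grid.x=%d)\n", n,
           pick_block_y(n), (n + pick_block_y(n) - 1) / pick_block_y(n));
  }
}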
@@ -5,12 +5,15 @@
 
 namespace vllm {
 
-// vllm_kernel_override_batch_invariant(); returns true
-// if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
-inline bool vllm_kernel_override_batch_invariant() {
-  std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
-  const char* val = std::getenv(env_key.c_str());
-  return (val && std::atoi(val) != 0) ? 1 : 0;
+// vllm_is_batch_invariant(); returns true
+// if env VLLM_BATCH_INVARIANT=1
+inline bool vllm_is_batch_invariant() {
+  static bool cached = []() {
+    std::string env_key = "VLLM_BATCH_INVARIANT";
+    const char* val = std::getenv(env_key.c_str());
+    return (val && std::atoi(val) != 0) ? 1 : 0;
+  }();
+  return cached;
 }
 
 }  // namespace vllm
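Beyond the rename, the new version caches the getenv result in a function-local static initialized by an immediately-invoked lambda, so the environment is read once per process instead of on every call. The idiom in isolation (the MY_FLAG variable name is made up for illustration):

// Call-once env-flag caching via a static lambda (illustration only).
#include <cstdio>
#include <cstdlib>

inline bool my_flag_enabled() {
  // The initializer runs exactly once, on first call; C++11 guarantees
  // thread-safe initialization of function-local statics.
  static bool cached = []() {
    const char* val = std::getenv("MY_FLAG");  // hypothetical variable
    return val && std::atoi(val) != 0;
  }();
  return cached;
}

int main() { printf("MY_FLAG enabled: %d\n", my_flag_enabled()); }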
@@ -187,7 +187,8 @@ template <>
 struct hash<MatMulPrimitiveHandler::ClassMatmulCacheKey> {
   size_t operator()(
       const MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const {
-    return hash<dnnl_dim_t>()(val.b_n_size) ^ hash<dnnl_dim_t>()(val.b_k_size);
+    return hash<dnnl_dim_t>()(val.b_n_size) ^ hash<dnnl_dim_t>()(val.b_k_size) ^
+           hash<int>()(static_cast<int>(val.b_type));
   }
 };
 
@@ -216,7 +217,8 @@ bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l,
 
 bool operator==(const MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
                 const MatMulPrimitiveHandler::ClassMatmulCacheKey& r) {
-  return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size;
+  return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size &&
+         l.b_type == r.b_type;
 }
 
 bool operator==(const MatMulPrimitiveHandler::MSizeCacheKey& l,
@@ -493,8 +495,10 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) {
 dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache(
     const MSizeCacheKey& key) {
   if (m_size_cache_.get() == nullptr) {
-    ClassMatmulCacheKey key = {.b_n_size = b_n_size_, .b_k_size = b_k_size_};
-    m_size_cache_ = get_matul_class_primitive_cache(key, primitive_cache_size_);
+    ClassMatmulCacheKey class_key = {
+        .b_n_size = b_n_size_, .b_k_size = b_k_size_, .b_type = b_type_};
+    m_size_cache_ =
+        get_matul_class_primitive_cache(class_key, primitive_cache_size_);
   }
   return m_size_cache_->get_or_create(key, [&]() {
     dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
@@ -199,6 +199,7 @@ class MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler {
   struct ClassMatmulCacheKey {
     dnnl_dim_t b_n_size;
     dnnl_dim_t b_k_size;
+    dnnl::memory::data_type b_type;
 
     friend bool operator==(const ClassMatmulCacheKey& l,
                            const ClassMatmulCacheKey& r);
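The four DNNL hunks above move together for a reason: ClassMatmulCacheKey is used as an unordered-map key, so adding b_type to the struct requires extending both operator== and the std::hash specialization in lockstep, otherwise primitives that differ only in weight dtype would be looked up as the same cache entry. A self-contained sketch of the pattern (names are illustrative, not the vLLM types):

// Keeping operator== and std::hash in sync for a cache key (illustration).
#include <cstdint>
#include <cstdio>
#include <functional>
#include <unordered_map>

struct CacheKey {
  int64_t n, k;
  int dtype;  // stand-in for dnnl::memory::data_type
  bool operator==(const CacheKey& o) const {
    return n == o.n && k == o.k && dtype == o.dtype;  // every field compared...
  }
};

template <>
struct std::hash<CacheKey> {
  std::size_t operator()(const CacheKey& v) const {
    // ...and every field hashed, matching operator== exactly.
    return std::hash<int64_t>()(v.n) ^ std::hash<int64_t>()(v.k) ^
           std::hash<int>()(v.dtype);
  }
};

int main() {
  std::unordered_map<CacheKey, const char*> cache;
  cache[{4096, 4096, 0}] = "fp32 primitive";
  cache[{4096, 4096, 1}] = "bf16 primitive";  // distinct entry, no collision
  printf("%zu entries\n", cache.size());
}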
Some files were not shown because too many files have changed in this diff.