Compare commits: sampler-en...fp8_ep_dp
1457 Commits
| SHA1 | Author | Date | |
|---|---|---|---|
| 1236aebf0e | |||
| ca2f6b9c30 | |||
| 20133cfee2 | |||
| ebb1ec9318 | |||
| 5b168b6d7a | |||
| 9760fd8f6a | |||
| b9f61e1387 | |||
| d6fd3a33b8 | |||
| 432ec9926e | |||
| 2b102d51ad | |||
| aa54a7bf7b | |||
| 2ad6194a02 | |||
| c594cbf565 | |||
| a35ca765a5 | |||
| 6aa8f9a4e7 | |||
| 1bc86a3da1 | |||
| bbfa0c61d1 | |||
| 20079c6e36 | |||
| 9a1b9b99d7 | |||
| 8bf507d766 | |||
| 306d60401d | |||
| f2c3f66d59 | |||
| 0f5e0d567e | |||
| c55d804672 | |||
| 749f5bdd38 | |||
| 2a50ef5760 | |||
| b8b904795d | |||
| ba5111f237 | |||
| 1e123529d7 | |||
| dff80b0e42 | |||
| 7782464a17 | |||
| 0f71e24034 | |||
| 1dab4d5718 | |||
| 7f21e8052b | |||
| 5a8641638a | |||
| f49239cb45 | |||
| 2dbe8c0774 | |||
| 84ec470fca | |||
| b29ca5c4d5 | |||
| ec6833c5e9 | |||
| e1fadf1197 | |||
| 43ff405b90 | |||
| fba02e3bd1 | |||
| 4577fc9abb | |||
| 5f1d0c8118 | |||
| c3bb9f2331 | |||
| 8f8900cee9 | |||
| 6acb7a6285 | |||
| 4f4a6b844a | |||
| 4d0a1541be | |||
| 77b6e74fe2 | |||
| 5acf828d99 | |||
| 3987e2ae96 | |||
| 77164dad5e | |||
| 95c40f9b09 | |||
| 3de3eadf5b | |||
| 3132290a14 | |||
| 1aa2f81b43 | |||
| d54af615d5 | |||
| a0efd3106c | |||
| e69879996f | |||
| a1cc9f33a3 | |||
| a521ef06e5 | |||
| 922165cba3 | |||
| 12ea698498 | |||
| 64eaf5fe05 | |||
| d1d61f3351 | |||
| 32ce3cf7c9 | |||
| d58f9c7f7a | |||
| c29034037d | |||
| 1b7cfd5a36 | |||
| da4b69d0b4 | |||
| c9479b2920 | |||
| 6f2909405e | |||
| b169d5f7b6 | |||
| f8977c233f | |||
| f274581f44 | |||
| 0b1447f890 | |||
| 24d0ef8970 | |||
| 7fcfd954ff | |||
| e740d07f07 | |||
| a652e71dd0 | |||
| 34d6c447c4 | |||
| 972eddf7c9 | |||
| fd7bb88d72 | |||
| 3c49dbdd03 | |||
| 1661a9c28f | |||
| 8e882ffdc0 | |||
| 26b4fa45be | |||
| 515b413ebf | |||
| caca0b718a | |||
| d86e3f0172 | |||
| 3ca8322b74 | |||
| 03b41b6cad | |||
| cad6447664 | |||
| c169b05541 | |||
| 468d16654a | |||
| 909f234faa | |||
| f8510587c2 | |||
| 9cfebf51ba | |||
| 77f95b99a6 | |||
| bbe888d033 | |||
| 25ed6738d4 | |||
| e568e401da | |||
| 269d901734 | |||
| 7951d78738 | |||
| 6dbe5b5c93 | |||
| 643622ba46 | |||
| a09c7ca9f2 | |||
| 0e98964e94 | |||
| c68b5c63eb | |||
| fced756923 | |||
| 321331b8ae | |||
| 6e4cea1cc5 | |||
| 435fa95444 | |||
| 4c2b38ce9e | |||
| d781930f90 | |||
| ce75efeecb | |||
| aa42561e40 | |||
| de65fc8e1e | |||
| 0c492b7824 | |||
| 0f0926b43f | |||
| 7f2c1a87e9 | |||
| b78f844a67 | |||
| 5e13c07d00 | |||
| 774c5fde30 | |||
| 9a21e331ff | |||
| 3e9ce609bd | |||
| 794ae1f551 | |||
| d73a9457a5 | |||
| a3896c7f02 | |||
| 51e98e4ffd | |||
| e56f44d9ec | |||
| e0cbad4e30 | |||
| b48d5cca16 | |||
| 5873877241 | |||
| 696259ca01 | |||
| 6b6d496114 | |||
| aaa4ac1c95 | |||
| 06a0338015 | |||
| 4318c0559d | |||
| a68e293cb9 | |||
| 6881107948 | |||
| e0f0ff87b8 | |||
| c24b1572ac | |||
| 4693a3438c | |||
| bbd9a84dc5 | |||
| a547aeb828 | |||
| fc6d0c290f | |||
| 753944fa9b | |||
| 25a817f202 | |||
| d260f799a9 | |||
| b50602d5f0 | |||
| 1f1b1bc03b | |||
| 1f88dbd2bb | |||
| 0eebd74842 | |||
| 27bebcd897 | |||
| e7523c2e03 | |||
| a869baca73 | |||
| 82e2339b06 | |||
| 9553fdb41e | |||
| 243eb9199f | |||
| 0665e29998 | |||
| e76be06550 | |||
| 0877750029 | |||
| 6d68030f1c | |||
| 5a2c76cbe1 | |||
| 38b13dfe78 | |||
| 61a45e7a72 | |||
| 65523a0995 | |||
| 4b7740a105 | |||
| 4ea62c0ea0 | |||
| 561b77a0d6 | |||
| abd4030d94 | |||
| 8820821b59 | |||
| fba0642704 | |||
| 6071e989df | |||
| 57fd13a707 | |||
| 3a886bd58c | |||
| 35be8fad62 | |||
| f2faac745d | |||
| 279f854519 | |||
| 624b77a2b3 | |||
| 503f8487c2 | |||
| 44073a7ac3 | |||
| 63934543a0 | |||
| 75f81750f3 | |||
| 6ab681bcbe | |||
| cebc22f3b6 | |||
| 6c6dcd8611 | |||
| 7891fdf0c6 | |||
| 6825d9a998 | |||
| b554ab736e | |||
| 9ea7f1abf3 | |||
| 2807271c86 | |||
| b9018a3f9f | |||
| 4ceafb6299 | |||
| 2e6705784f | |||
| 1cb194a018 | |||
| 2cd4d58df4 | |||
| 6d166a8d35 | |||
| ef1dd6870f | |||
| e77dc4bad8 | |||
| 07458a51ce | |||
| c1e4a4052d | |||
| a859320575 | |||
| 441dc63ac7 | |||
| d55e446d13 | |||
| ec82c3e388 | |||
| 45ab403a1f | |||
| 2b10ba7491 | |||
| 4fc1bf813a | |||
| f2036734fb | |||
| 7d9216495c | |||
| 0ddf88e16e | |||
| 1645b60196 | |||
| 2628a69e35 | |||
| 371f7e4ca2 | |||
| 15b45ffb9a | |||
| 273cb3b4d9 | |||
| 8ddd1cf26a | |||
| 6550114c9c | |||
| 9520a989df | |||
| 3d28ad343f | |||
| 6a7988c55b | |||
| 022d8abe29 | |||
| 5221815a00 | |||
| 1068556b2c | |||
| 2cd1fa4556 | |||
| d4c2919760 | |||
| 6220f3c6b0 | |||
| 52fb23f47e | |||
| 6dd51c7ef1 | |||
| 2edb533af2 | |||
| 38a95cb4a8 | |||
| cd821ea5d2 | |||
| 7ab056c273 | |||
| 6526e05111 | |||
| e493e48524 | |||
| 4ce64e2df4 | |||
| fbb13a2c15 | |||
| a1fe24d961 | |||
| d0bc2f810b | |||
| b046cf792d | |||
| 54af915949 | |||
| 71ea614d4a | |||
| 4c611348a7 | |||
| 60cad94b86 | |||
| 9c1baa5bc6 | |||
| 4be2255c81 | |||
| ed5d408255 | |||
| 583507d130 | |||
| e44d8ce8c7 | |||
| 93ecb8139c | |||
| fae453f8ce | |||
| 4b0da7b60e | |||
| c6b636f9fb | |||
| 04eb88dc80 | |||
| 46791e1b4b | |||
| c32e249a23 | |||
| c91fe7b1b9 | |||
| a04720bc36 | |||
| 7b9d832c80 | |||
| 6e588da0f4 | |||
| f8d2cc5f55 | |||
| 721fb9b181 | |||
| 1f3a1200e4 | |||
| 54631f8262 | |||
| cb506ecb5a | |||
| 93f71673ce | |||
| 3f505233fd | |||
| 4e04eceb58 | |||
| 71075029f2 | |||
| ca86a7cf6e | |||
| a35a494745 | |||
| f6037d1907 | |||
| fa72f9a812 | |||
| ebed81fbf5 | |||
| e2d7d31244 | |||
| 23b67b37b2 | |||
| db5a29ba19 | |||
| 51797775c3 | |||
| cf5984b2fe | |||
| d022115cc6 | |||
| acb54ca8e1 | |||
| 6e0fd34d3c | |||
| 176d62e4ea | |||
| 20bd6f4d2e | |||
| 1f079540db | |||
| 94d8ec8d2b | |||
| bb0a311213 | |||
| dd5fa7e04f | |||
| 2b16104557 | |||
| 371376f996 | |||
| c6c10ca920 | |||
| c154d89306 | |||
| eca18691d2 | |||
| 61acfc45bc | |||
| 107f5fc4cb | |||
| 907f935de9 | |||
| 5d7f545204 | |||
| cd8dfc6dfc | |||
| d06dd72ba9 | |||
| ad0012a0ac | |||
| 92247c522e | |||
| 0c15c2e486 | |||
| 3b17ea26e4 | |||
| 23baa2180b | |||
| 980a172474 | |||
| e1f5a71ed7 | |||
| f4a8a37465 | |||
| 8f55962a7f | |||
| be48360c1f | |||
| 86847700d7 | |||
| d6c86d09ae | |||
| 6b35cb10a0 | |||
| 1b1e8e05ff | |||
| bca55b556f | |||
| d981396778 | |||
| 9609327fa4 | |||
| f07a673eb2 | |||
| d565e0976f | |||
| 258bf621d5 | |||
| dc1440cf9f | |||
| 8171221834 | |||
| 7937c2fd52 | |||
| e2ee1e8e9e | |||
| 20d8ce81eb | |||
| 84ab4feb7e | |||
| 6781af5608 | |||
| 1b15df2546 | |||
| 43b5f61dce | |||
| c5bb0ebdc6 | |||
| d637b96099 | |||
| 275c5daeb0 | |||
| 47fda6d089 | |||
| 27d0952600 | |||
| 221cfc2fea | |||
| 9da1095daf | |||
| d1211f8794 | |||
| b6a6e7a529 | |||
| 4fb349f66a | |||
| 908733aca7 | |||
| 1a8f68bb90 | |||
| 9ab2c02ff8 | |||
| 66e63e86ec | |||
| 9214e60631 | |||
| f880d42582 | |||
| dcfe95234c | |||
| 48ac2bed5b | |||
| 3e0d435027 | |||
| 4ee4826ede | |||
| 60017dc841 | |||
| 55f1a468d9 | |||
| fd195b194e | |||
| fabe89bbc4 | |||
| e73b7dfd69 | |||
| 7fdfa01530 | |||
| aef94c6d07 | |||
| 0ceaebf87b | |||
| 1db4f47f81 | |||
| d3d91b6f71 | |||
| 87d871470d | |||
| a5f8c111c2 | |||
| e23564cb70 | |||
| 390ec88905 | |||
| 541817670c | |||
| 67da5720d4 | |||
| 5c04bb8b86 | |||
| 3d2779c29a | |||
| 6b31c84aff | |||
| b18201fe06 | |||
| f4937a51c1 | |||
| ee659e3b60 | |||
| 4e1c6a0264 | |||
| c7852a6d9b | |||
| 8795eb9975 | |||
| 0b34593017 | |||
| e3f3aee6f4 | |||
| 92540529c0 | |||
| fadb8d5c2d | |||
| 2aa5470ac5 | |||
| 51ff154639 | |||
| 566ec04c3d | |||
| 01c22335ba | |||
| 451da4bcbd | |||
| 07ad27121f | |||
| a9944aabfa | |||
| a8f5aec20a | |||
| de71fec81b | |||
| 70f8b96724 | |||
| dd2a94596a | |||
| 420caf7557 | |||
| 4f07a64075 | |||
| e6b8e65d2d | |||
| 26d0419309 | |||
| 83f74c698f | |||
| 2dff093574 | |||
| afe3236e90 | |||
| 65334ef3b9 | |||
| e60f550b38 | |||
| f25e0d1125 | |||
| 09f106a91e | |||
| 2142035b51 | |||
| 78aa341d12 | |||
| 7974736740 | |||
| 2fc9075b82 | |||
| d93c976a0d | |||
| 749f792553 | |||
| 856865008e | |||
| f9c069c85e | |||
| 418d2f8bfb | |||
| 964472b966 | |||
| 59dd311cf5 | |||
| d066e52013 | |||
| c8ea982d9b | |||
| dc372b9c8a | |||
| 9b5b39b650 | |||
| 9ccc6ded42 | |||
| d62a076e84 | |||
| 259127f8b8 | |||
| 612c2edb4f | |||
| 38fe728d60 | |||
| 82e7f9bb03 | |||
| 63dc3426e0 | |||
| 8f5dc41481 | |||
| 63ad622233 | |||
| e7ef61c1f0 | |||
| d4154c35a2 | |||
| 6685890d11 | |||
| 33011318c2 | |||
| 4f8b373225 | |||
| 7b2f28deba | |||
| 2d912fb66f | |||
| 12e6c0b41c | |||
| 9a2a6357de | |||
| 6266c57bae | |||
| 754b699cbe | |||
| 6e27c6d86b | |||
| d5af47a149 | |||
| 65f0f74b66 | |||
| 176a95c670 | |||
| f2ae883b67 | |||
| 40de1ef455 | |||
| 0189a65a2e | |||
| 55aa7af994 | |||
| 0b217da646 | |||
| 19324d660c | |||
| fc407a1425 | |||
| 009d9e7590 | |||
| b922c2ebd2 | |||
| 00b14e0f16 | |||
| 54e467e6f8 | |||
| 79a1d25bbd | |||
| 9944011b30 | |||
| 8c946cecca | |||
| ff334ca1cd | |||
| 6223dd8114 | |||
| 906f0598fc | |||
| cb528d0585 | |||
| 98fcba1575 | |||
| 23b3134eb5 | |||
| ea6ae8cb45 | |||
| 2ff297dce9 | |||
| 8dd0671bac | |||
| f0d610a8ae | |||
| e57e4d6e9e | |||
| ee5be834e7 | |||
| 48545728d8 | |||
| dc1a821768 | |||
| 61e0a506a3 | |||
| 1df491c522 | |||
| d8487ef557 | |||
| c06af9a959 | |||
| 60f7624334 | |||
| f6518b2b48 | |||
| d67085c2c8 | |||
| 307939f299 | |||
| 9d7ea9dbbf | |||
| acee8f48aa | |||
| f065de4e88 | |||
| dc9905368d | |||
| ebab1ac37c | |||
| 2b0db9b0e2 | |||
| 195adb47c0 | |||
| 302f3aca7e | |||
| e9c730c9bd | |||
| 289199feb6 | |||
| b9fd0d7a69 | |||
| 72a3f6b898 | |||
| 98ea35601c | |||
| d19110204c | |||
| 05a4324f8e | |||
| 7ea6cb28b2 | |||
| 9fbf2bfbd5 | |||
| 3a5ea75129 | |||
| 891b9d33de | |||
| 430783018c | |||
| 19a3c78d1f | |||
| ada50aa295 | |||
| 08bf784078 | |||
| d45fe333fb | |||
| 021c16c7ca | |||
| 7de18d541b | |||
| a810b5b088 | |||
| 009b3d5382 | |||
| e4b8713380 | |||
| 06c0922a69 | |||
| cd3edfc908 | |||
| 9cea90eab4 | |||
| d1110f5b5a | |||
| 8132365b74 | |||
| eea22a56ab | |||
| 9112155283 | |||
| 90d0a74b60 | |||
| d74e5f37bc | |||
| ca66a1674c | |||
| 950751a987 | |||
| 4c31218f80 | |||
| 68311891f5 | |||
| fc4441a4ee | |||
| 246e3e0a36 | |||
| 7042cc96b0 | |||
| 0c0fdae84f | |||
| 3b602cdea7 | |||
| 4b2ed7926a | |||
| 7e3571134f | |||
| ea2236bf95 | |||
| 7d4aedae7c | |||
| 22481fbfa3 | |||
| 5c4c08f6f1 | |||
| c44c384b1c | |||
| 85b72cb7b1 | |||
| 6e5595ca39 | |||
| 200da9a517 | |||
| 9f64e93415 | |||
| ec61ea20a8 | |||
| c6798baa9c | |||
| 5b2dcbf0b8 | |||
| 6e4a93e3f7 | |||
| 217db4baa6 | |||
| ff8c400502 | |||
| 89a0315f4c | |||
| 3d1e387652 | |||
| d310e6de98 | |||
| 5e6f939484 | |||
| 760e3ecc8f | |||
| 3c9396a64f | |||
| 376786fac1 | |||
| 4f605a6de5 | |||
| 8342e3abd1 | |||
| a83a0f92b5 | |||
| 226a4272cf | |||
| ec54d73c31 | |||
| a944f8ede7 | |||
| 015815fe01 | |||
| e4ca6e3a99 | |||
| 53d0cb7423 | |||
| f50dcb7c21 | |||
| a1e19b635d | |||
| bb239a730f | |||
| a463555dee | |||
| ca04b97c93 | |||
| 0a9bbaa104 | |||
| 39956efb3f | |||
| 597051e56f | |||
| 96722aa81d | |||
| 843b222723 | |||
| e515668edf | |||
| 5a499e70d5 | |||
| 6930a41116 | |||
| 998eea4a0e | |||
| c747d84576 | |||
| b2da14a05a | |||
| 7ea2adb802 | |||
| 3d13ca0e24 | |||
| 66ab3b13c9 | |||
| a8238bbdb0 | |||
| d43f914d42 | |||
| ed5272cf21 | |||
| c20ef40fd0 | |||
| db593aa67f | |||
| f98e307588 | |||
| 646a31e51e | |||
| be8ff88e66 | |||
| 1a6af1453d | |||
| 32aa74c09c | |||
| 7377dd0307 | |||
| 98c89e16ff | |||
| 324a3119b0 | |||
| 8a15c2603a | |||
| 043e4c4955 | |||
| ba7703e659 | |||
| f80ae5bdcf | |||
| 1a45a61387 | |||
| c3e9d5060e | |||
| 822de7fb94 | |||
| 8d84d836d1 | |||
| 950b71186f | |||
| e50a1f1a9c | |||
| a17cef70ea | |||
| 18dd5e01f2 | |||
| 6de3e13413 | |||
| ed3a1d2106 | |||
| 022afbeb4e | |||
| 2f925e5777 | |||
| de906b95f9 | |||
| d456aea71f | |||
| 621ca2c0ab | |||
| 6115b11582 | |||
| 5b8c390747 | |||
| 7525d5f3d5 | |||
| aabcd2cae3 | |||
| 0d115460a7 | |||
| 175bda67a1 | |||
| cba31c47c4 | |||
| a6fed02068 | |||
| d419aa5dc4 | |||
| f9bc5a0693 | |||
| 05e1f96419 | |||
| 6eae34533a | |||
| 63ced7b43f | |||
| dc47ba32f8 | |||
| edbf2d609e | |||
| 999328be0d | |||
| 98834fefaa | |||
| 90bd2ae172 | |||
| 5941e0b7ea | |||
| 9765940824 | |||
| 5ea5c514da | |||
| d3efde8176 | |||
| aea302be6c | |||
| cc05b90d86 | |||
| 1d0c9d6b2d | |||
| f62cad6431 | |||
| 5394ad7387 | |||
| 68e1ee0072 | |||
| 2858830c39 | |||
| d6484ef3c3 | |||
| 46fae69cf0 | |||
| f66f1e0fa3 | |||
| 887d7af882 | |||
| a92842454c | |||
| c8386fa61d | |||
| 87baebebd8 | |||
| e3d0a1d190 | |||
| d47b605eca | |||
| 22c6f6397f | |||
| 3ec97e2cc5 | |||
| 9b103a1d76 | |||
| b90b0852e9 | |||
| 9352cdb56d | |||
| 182f40ea8b | |||
| 3e887d2e0c | |||
| 0f87d8f7b2 | |||
| 4c33d67321 | |||
| cb234955df | |||
| 3a500cd0b6 | |||
| 868c546da4 | |||
| 99404f53c7 | |||
| 785d75a03b | |||
| 6d1479ca4b | |||
| b8b0859b5c | |||
| d7543862bd | |||
| c777df79f7 | |||
| cc2a77d7f1 | |||
| 9e2de9b9e9 | |||
| 109e15a335 | |||
| f192ca90e6 | |||
| f89d0e11bf | |||
| b4003d11fc | |||
| 292fc59d61 | |||
| afcb3f8863 | |||
| afb12e4294 | |||
| 24aebae177 | |||
| 39c0813a7f | |||
| 9b70e2b4c1 | |||
| 173daac19d | |||
| 04f2cfc894 | |||
| 811a6c0972 | |||
| 9b1769dd9a | |||
| 61c299f81f | |||
| 4acfa3354a | |||
| 88c8304104 | |||
| 6768ff4a22 | |||
| f2e7af9b86 | |||
| 7423cf0a9b | |||
| 460a2b1100 | |||
| 28566d73b3 | |||
| 98060b001d | |||
| f5a3c655b2 | |||
| 7169f87ad0 | |||
| b74d888c63 | |||
| 2007d4d54f | |||
| 48e925fab5 | |||
| 1903c0b8a3 | |||
| 86a1f67a3b | |||
| a257d9bccc | |||
| 015069b017 | |||
| fbefc8a78d | |||
| 26bc4bbcd8 | |||
| 3c3d767201 | |||
| 13cf6b6236 | |||
| 90d0a54c4d | |||
| 7a0a146c54 | |||
| 7ab643e425 | |||
| afb4429b4f | |||
| aa4502e7f3 | |||
| 17b4d85f63 | |||
| 1144a8efe7 | |||
| 08fb5587b4 | |||
| dbc18e7816 | |||
| 02bd654846 | |||
| 200bbf92e8 | |||
| 81ecf425f0 | |||
| 42d9a2c4c7 | |||
| 2ac74d098e | |||
| 584f5fb4c6 | |||
| d586ddc691 | |||
| 0b7e701dd4 | |||
| 947f2f5375 | |||
| 739e03b344 | |||
| da4e7687b5 | |||
| 39317cf42b | |||
| 2990cee95b | |||
| 0be6d05b5e | |||
| 77073c77bc | |||
| a7d5b016bd | |||
| d803786731 | |||
| 1534d389af | |||
| ece5a8b0b6 | |||
| 54072f315f | |||
| be633fba0f | |||
| ed6cfb90c8 | |||
| 6ed9f6047e | |||
| a44c4f1d2f | |||
| 88fcf00dda | |||
| d1f569b1b9 | |||
| 13698db634 | |||
| 2c4f59afc3 | |||
| 1c2bc7ead0 | |||
| 4055130a85 | |||
| 34120f5acd | |||
| 7489ec0bab | |||
| 70788bdbdc | |||
| c9c1b59e59 | |||
| 0350809f3a | |||
| a6977dbd15 | |||
| 2fa2a50bf9 | |||
| 08e15defa9 | |||
| b37685afbb | |||
| 792595b59d | |||
| 0c1c788312 | |||
| 56d64fbe30 | |||
| 608968b7c5 | |||
| 06ffc7e1d3 | |||
| d3cf61b89b | |||
| a39203f99e | |||
| 24e6ad3f16 | |||
| 2ef5d106bb | |||
| 0ed27ef66c | |||
| 900edfa8d4 | |||
| 88ad9ec6b2 | |||
| 40896bdf3f | |||
| 00ee37efa2 | |||
| 890f104cdf | |||
| 4a5e13149a | |||
| 97cc8729f0 | |||
| 4464109219 | |||
| 193e78e35d | |||
| bdb2cddafc | |||
| ebb3930d28 | |||
| cde384cd92 | |||
| 96e06e3cb7 | |||
| 17eb306fcc | |||
| 165cb56329 | |||
| d6da8a8ff2 | |||
| b4ac4fa04d | |||
| e136000595 | |||
| 86d9fc29cb | |||
| 506475de5f | |||
| cfe4532093 | |||
| 8fc88d63f1 | |||
| 6e74fd4945 | |||
| dcbac4cb4b | |||
| ed2462030f | |||
| cc5befbced | |||
| 2c89cd96a8 | |||
| a0304dc504 | |||
| c7941cca18 | |||
| b6dd32aa07 | |||
| f94886946e | |||
| 72dfe4c74f | |||
| 8b464d9660 | |||
| 889ebb2638 | |||
| 3ad986c28b | |||
| 344e193b7d | |||
| fb1c933ade | |||
| 72c5b97231 | |||
| fa93cd9f60 | |||
| aec9674dbe | |||
| 7fcc4223dc | |||
| 8262a3e23b | |||
| f211331c48 | |||
| 9053d0b134 | |||
| cb3f2d8d10 | |||
| c12df53b60 | |||
| d1aeea7553 | |||
| d8bccde686 | |||
| 20e489eaa1 | |||
| 4213475ec7 | |||
| d92879baf6 | |||
| 690fe019f0 | |||
| ed7a29d9f8 | |||
| 756848e79e | |||
| 18445edd0f | |||
| 30215ca61f | |||
| 838cedade7 | |||
| 4283a28c2f | |||
| 93a126fbc7 | |||
| 8e4b351a0c | |||
| 9869453c42 | |||
| 3642c59aa8 | |||
| 43eea2953b | |||
| de7eb10ce4 | |||
| fd11a325b8 | |||
| 4d17e20310 | |||
| 10fd1d7380 | |||
| 52b4f4a8d7 | |||
| e782e0a170 | |||
| dc2ceca5c5 | |||
| f8acd01ff7 | |||
| c48334d405 | |||
| 909fdaf152 | |||
| 8c1c926d00 | |||
| df6f3ce883 | |||
| 513f074766 | |||
| b07bf83c7d | |||
| 53e8cf53a4 | |||
| 54271bb766 | |||
| 9e96f56efb | |||
| b278911229 | |||
| 7bd0c7745c | |||
| 1cf0719ebd | |||
| 537d5ee025 | |||
| c8e5be35f7 | |||
| a6e72e1e4f | |||
| 5e83a7277f | |||
| 68af5f6c5c | |||
| 8de2901fea | |||
| c53e0730cb | |||
| a0e619e62a | |||
| 70116459c3 | |||
| 65e262b93b | |||
| 43faa0461a | |||
| 48cb2109b6 | |||
| a5450f11c9 | |||
| 9d98ab5ec6 | |||
| df5c879527 | |||
| 423e9f1cbe | |||
| 0bd7f8fca5 | |||
| d5615af9ae | |||
| 19dcc02a72 | |||
| 7feae92c1f | |||
| f851b84266 | |||
| fc966e9cc6 | |||
| ef19e67d2c | |||
| a41351f363 | |||
| 6aae216b4e | |||
| b22980a1dc | |||
| 881f735827 | |||
| 2f54045508 | |||
| 5aa6efb9a5 | |||
| 6ca0234478 | |||
| 649818995f | |||
| 7a0a9da72b | |||
| 69bff9bc89 | |||
| 41ca7eb491 | |||
| eef364723c | |||
| 0d6e187e88 | |||
| 9420a1fc30 | |||
| 583e900996 | |||
| 05e1fbfc52 | |||
| fe92176321 | |||
| 6d0df0ebeb | |||
| 0fa939e2d1 | |||
| 0422ce109f | |||
| 47bdee409c | |||
| 49f189439d | |||
| 5adf6f6b7f | |||
| 4115f19958 | |||
| 340d7b1b21 | |||
| 1bcbcbf574 | |||
| 82e43b2d7e | |||
| 67309a1cb5 | |||
| b724afe343 | |||
| 21f4f1c9a4 | |||
| b0c1f6202d | |||
| c0dfd97519 | |||
| a9138e85b1 | |||
| 0a05ed57e6 | |||
| 14288d1332 | |||
| b411418ff0 | |||
| 2bc0f72ae5 | |||
| 9c1244de57 | |||
| db2f8d915c | |||
| 6167c0e5d2 | |||
| ed2e464653 | |||
| 2c8ed8ee48 | |||
| ed50f46641 | |||
| 46e678bcff | |||
| 6b2427f995 | |||
| b07d741661 | |||
| 41fb013d29 | |||
| 32d4b669d0 | |||
| 3cde34a4a4 | |||
| bdb3660312 | |||
| f3a21e9c68 | |||
| 8e630d680e | |||
| af869f6dff | |||
| 53c0fa1e25 | |||
| f7912cba3d | |||
| 6317a5174a | |||
| aa72d9a4ea | |||
| ce17db8085 | |||
| 8c87a9ad46 | |||
| ec69124eb4 | |||
| d0da99fb70 | |||
| b2f195c429 | |||
| 047797ef90 | |||
| eb8ef4224d | |||
| 56a735261c | |||
| e1cf90e099 | |||
| 6bc1e30ef9 | |||
| 7e081ba7ca | |||
| 1e013fa388 | |||
| bc7c4d206b | |||
| f67e9e9f22 | |||
| 36fe78769f | |||
| 83d933718c | |||
| 5175b884f7 | |||
| 5536b30a4c | |||
| 7f58fb9718 | |||
| 30bc3e0f66 | |||
| f34410715f | |||
| 68d4c33202 | |||
| f961d7f6ef | |||
| d059110498 | |||
| 571e8dd65e | |||
| 4b91c927f6 | |||
| 0e237f0035 | |||
| 8f7bace7c3 | |||
| e4d6144232 | |||
| 8d32dc603d | |||
| c4ab9f3e71 | |||
| 2689d5c027 | |||
| acba33a0f1 | |||
| a114bf20a3 | |||
| 3097ce3a32 | |||
| d6da9322c8 | |||
| 71ce44047f | |||
| 188b7f9b8c | |||
| b9b4746950 | |||
| 7b8a2ab76f | |||
| c9acbf1141 | |||
| 5b794cae8d | |||
| 0e4254492f | |||
| 1311913f55 | |||
| 29f395c97c | |||
| fa3bba2a53 | |||
| 986537f1c3 | |||
| 210207525e | |||
| 71eda0bb76 | |||
| 471fe65630 | |||
| 3a0fba5cf4 | |||
| 299ebb62b2 | |||
| f728ab8e35 | |||
| 63e26fff78 | |||
| fe3462c774 | |||
| 3b34fd5273 | |||
| 55d6d3fdb8 | |||
| 7272bfae77 | |||
| d9ac9e3dc5 | |||
| d41faaf9df | |||
| b34f33438a | |||
| 26c0406555 | |||
| 4c41278b77 | |||
| bb3605db85 | |||
| fe742aef5a | |||
| 4b07d36891 | |||
| 87aaadef73 | |||
| 682e0b6d2f | |||
| d6195a748b | |||
| 205d84aaa9 | |||
| 5124f5bf51 | |||
| 83f3c3bd91 | |||
| d9737ca1c6 | |||
| 9d4ca19d50 | |||
| 2ef0dc53b8 | |||
| 1d4680fad2 | |||
| 2c1bd848a6 | |||
| 5c9121203c | |||
| 490b1698a5 | |||
| 5a5e29de88 | |||
| 3d3ab3689f | |||
| 686623c5e7 | |||
| aadb656562 | |||
| 87e067de41 | |||
| 26507f8973 | |||
| 9c1d5b456d | |||
| e31045f95c | |||
| aaec845f8e | |||
| 7bdfd29a35 | |||
| e78587a64c | |||
| 7eb4255628 | |||
| 6a0f547561 | |||
| 30ed81b7ca | |||
| 7a4a5de729 | |||
| c16fb5dae8 | |||
| e37073efd7 | |||
| 183dad7a85 | |||
| 3408e47159 | |||
| 0377b8310b | |||
| e4755f7fac | |||
| 92edf35826 | |||
| eb5819b2d9 | |||
| 5989f4684d | |||
| 5125d72f02 | |||
| a018e555fd | |||
| 6211b92273 | |||
| 05fcd1b430 | |||
| 7c02d6a137 | |||
| 11c3b98491 | |||
| dbe7f07001 | |||
| c69bf4ee06 | |||
| d27ea94034 | |||
| 99ed526101 | |||
| 207da28186 | |||
| 5b1aca2ae3 | |||
| d8e557b5e5 | |||
| 61a44a0b22 | |||
| a6481525b8 | |||
| 8cac35ba43 | |||
| 9dbf7a2dc1 | |||
| 607029e515 | |||
| cb072ce93b | |||
| 95aca283b4 | |||
| 2b05b8ce69 | |||
| 3c776dcefb | |||
| 2cbd4d2999 | |||
| 3092375e27 | |||
| 3cd91dc955 | |||
| 8a7368e069 | |||
| 93e561ec4d | |||
| e1b004839a | |||
| ee378f3d49 | |||
| e82ee40de3 | |||
| facbe2a114 | |||
| 7168920491 | |||
| 21378a2323 | |||
| 976711d9db | |||
| 44fa4d556c | |||
| 3ac98edcb1 | |||
| 966c742ed2 | |||
| 0d7d05f4b6 | |||
| 96bb8aa68b | |||
| 3badb0213b | |||
| fdcb850f14 | |||
| 54a66e5fee | |||
| 280d62b8a2 | |||
| 1666e66443 | |||
| 1575c1701a | |||
| 6ae996a873 | |||
| b590adfdc1 | |||
| b4fe16c75b | |||
| bc5dd4f669 | |||
| dbb036cf61 | |||
| 70e7ed841d | |||
| d06ba4ed3f | |||
| 6b40996ae8 | |||
| d2020acac7 | |||
| 1eb3c2ed48 | |||
| c64ee87267 | |||
| b1308b84a3 | |||
| 7b5ecf79bd | |||
| 9883a18859 | |||
| b3f2fddd17 | |||
| aa29841ede | |||
| 6bf27affb6 | |||
| 1dd23386ec | |||
| 7cbfc10943 | |||
| ce4ddd2d1a | |||
| e51929ebca | |||
| dc1b4a6f13 | |||
| 63d2705edb | |||
| d085a44082 | |||
| f49e5aff11 | |||
| 6c11ecf8d3 | |||
| 93e5f3c5fb | |||
| 70363bccfa | |||
| 3cdc57669f | |||
| 68bb122eb4 | |||
| d9fc8cd9da | |||
| f069f3ea74 | |||
| c5bc0e7fcc | |||
| 4a3a518722 | |||
| fbf722c6e6 | |||
| e92d7085bf | |||
| bd6028d6b0 | |||
| 802329dee9 | |||
| 41cc883c29 | |||
| 57504a4bcf | |||
| ed4792c990 | |||
| 87b836ba77 | |||
| 56c76c2e0e | |||
| c09632a66c | |||
| a3bf8d4a2b | |||
| 16eda8c43a | |||
| cd77382ac1 | |||
| 71b9cde010 | |||
| 5285589f37 | |||
| f41647ee6b | |||
| 4d022cbc75 | |||
| 70de35a881 | |||
| 34b2cf3b33 | |||
| 9e90c9f73f | |||
| e9528f6dc6 | |||
| 51baa9c333 | |||
| 35e076b3a8 | |||
| a26f59ccbc | |||
| aa3b3d76e0 | |||
| f7030df3be | |||
| 905e91e9ac | |||
| f8f9c0ba62 | |||
| dda811021a | |||
| 93195146ea | |||
| ed37599544 | |||
| 99ef59cf7f | |||
| d544d141ec | |||
| 3e397a9484 | |||
| 268c325078 | |||
| 3cc9af88ff | |||
| 7cd0bd7212 | |||
| 56d4aefa33 | |||
| dd143ef541 | |||
| daefed052c | |||
| 5fbab20e02 | |||
| e8224f3dca | |||
| 9665313c39 | |||
| 0c54fc7273 | |||
| c1b57855ec | |||
| 83b824c8b4 | |||
| 7678fcd5b6 | |||
| 8661c0241d | |||
| ce8d6b75fc | |||
| 61de3ef74b | |||
| ec1f9c8c91 | |||
| 65e09094c4 | |||
| c70cf0fe06 | |||
| a5d11a54dc | |||
| 3d4c87758e | |||
| a9bd832fc5 | |||
| 417bcefbae | |||
| baada0e737 | |||
| 82eb61dd4c | |||
| 0d4d06fe2f | |||
| 4aed0ca6a2 | |||
| 1621b25288 | |||
| a564797151 | |||
| 1da6a09274 | |||
| 1e44ffc3ff | |||
| a454748544 | |||
| 1bff42c4b7 | |||
| cb391d85dc | |||
| fee5b8d37f | |||
| b2ce859bd2 | |||
| 566f10a929 | |||
| c3b5189137 | |||
| a25866ac8d | |||
| 098900d7c2 | |||
| 98d01d3ce2 | |||
| d55244df31 | |||
| 04149cce27 | |||
| 24834f4894 | |||
| ec7da6fcf3 | |||
| 819d548e8a | |||
| 477d2a8aa2 | |||
| e484e02857 | |||
| 24f6b9a713 | |||
| 9cdde47289 | |||
| b1eb4ca152 | |||
| 87b4ac56c2 | |||
| cb84e45ac7 | |||
| 4716377fbc | |||
| 4e9cf8c1dd | |||
| 2976dc27e9 | |||
| 102bf967f0 | |||
| 1f4b09b525 | |||
| 86c3369eb8 | |||
| 2755c34a8f | |||
| db10422184 | |||
| e1a2c699dd | |||
| 0115ccd5c0 | |||
| 40b4284fe3 | |||
| 4ebc0b9640 | |||
| dc96fd54c6 | |||
| 1f5d13ab9f | |||
| 90cb44eb02 | |||
| e11880deea | |||
| 9351f91be9 | |||
| 5a1e1c8353 | |||
| 69ecaa7c79 | |||
| 7f00899ff7 | |||
| 995e3d1f41 | |||
| b4ac449a83 | |||
| 8e5314a468 | |||
| 87918e40c4 | |||
| f6b32efb7f | |||
| b99733d092 | |||
| 05a015d6a5 | |||
| ad971af8c7 | |||
| f2ebb6f541 | |||
| 1d01211264 | |||
| f94ab12f79 | |||
| a865bc1ca6 | |||
| 21802c4b6d | |||
| 652907b354 | |||
| 24f1c01e0f | |||
| fad6e2538e | |||
| 7f6d47c1a2 | |||
| 3147586ebd | |||
| ed636d99ca | |||
| 090c856d76 | |||
| ad434d4cfe | |||
| 66d433b94f | |||
| 027b204ff1 | |||
| 55dcce91df | |||
| 8017c8db7f | |||
| dc3529dbf6 | |||
| 7699258ef0 | |||
| e9ba99f296 | |||
| 7c80368710 | |||
| 95d63f38c0 | |||
| bb8dab821e | |||
| fc0f87768a | |||
| 0a57386721 | |||
| 3749e28774 | |||
| 86fc2321ff | |||
| 2549c0dfef | |||
| b10e519895 | |||
| 9bde5ba127 | |||
| 72c8f1ad04 | |||
| da224daaa9 | |||
| 3a100b9278 | |||
| 242a637aea | |||
| c2a9671510 | |||
| d5ae4f7f42 | |||
| b6c502a150 | |||
| 9ca710e525 | |||
| eb07c8cb5b | |||
| ba10801961 | |||
| 620fc2d09e | |||
| 29283eaa7e | |||
| 2fa66ef713 | |||
| 13affc432d | |||
| d8f094a92a | |||
| 97ae6d777f | |||
| 6baeee70d1 | |||
| d2517a4939 | |||
| 6342adc438 | |||
| 0adba91547 | |||
| 4285e423a6 | |||
| 63375f0cdb | |||
| 70ad3f9e98 | |||
| d6fc629f4d | |||
| af51d80fa1 | |||
| f5722a5052 | |||
| 651cf0fec1 | |||
| 4dc52e1c53 | |||
| 4708f13a9c | |||
| a6d042df0a | |||
| 40a36ccfeb | |||
| ef608c37a7 | |||
| 2386803f2a | |||
| 95862f7b4d | |||
| 230b131b54 | |||
| 0812d8dd41 | |||
| bf7e3c51ae | |||
| a35a8a8392 | |||
| 4ef0bb1fcf | |||
| fadc59c0e6 | |||
| 86cbd2eee9 | |||
| 092475f738 | |||
| dcc56d62da | |||
| f15e70d906 | |||
| b6be6f8d1e | |||
| 03a70eacaf | |||
| 45b1ff7a25 | |||
| 15ba07ef25 | |||
| d2b58ca203 | |||
| 82e7e19a6e | |||
| 421c462948 | |||
| 84884cd9ac | |||
| a43aa183dc | |||
| 463bbb1835 | |||
| 5e125e74d1 | |||
| 06f21ce7a5 | |||
| 57a810db9c | |||
| 8b664706aa | |||
| 37bfee92bf | |||
| e73ff24e31 | |||
| bd7599d34a | |||
| 01b6113659 | |||
| 1b84eff03a | |||
| 55acf86bf8 | |||
| f021b97993 | |||
| 1cab43c2d2 | |||
| 8bd651b318 | |||
| 58e234a754 | |||
| e86c414d6a | |||
| 550b2801ad | |||
| cefb9e5a28 | |||
| 98d7367b61 | |||
| 594a8b9030 | |||
| 44f990515b | |||
| 252937806c | |||
| 51826d51fa | |||
| 14e53ed11f | |||
| ddb94c2605 | |||
| 90969fb39a | |||
| 101f1481f9 | |||
| 2edc87b161 | |||
| 4203926f10 | |||
| cdb57015a7 | |||
| aa557e6422 | |||
| 0e00d40e4f | |||
| c920e01242 | |||
| 274d8e8818 | |||
| 2039c6305b | |||
| 6efb195a6e | |||
| 24b7fb455a | |||
| 58f5a59769 | |||
| db9dfcfa6a | |||
| 9ef98d527e | |||
| 93491aefc7 | |||
| 7acd539cd7 | |||
| e75a6301bd | |||
| a79cc68b3a | |||
| 7e3f7a4ee7 | |||
| 9ec8257914 | |||
| 38327cf454 | |||
| dfa82e2a3d | |||
| e59ca942f5 | |||
| a57a3044aa | |||
| 4e5a0f6ae2 | |||
| b63bd14999 | |||
| 2041c0e360 | |||
| 085cbc4f9f | |||
| 2b93162fb0 | |||
| 2e45bd29fe | |||
| 51d7c6a2b2 | |||
| f3aca1ee30 | |||
| 8dd41d6bcc | |||
| 0a298ea418 | |||
| d330558bab | |||
| 656fd72976 | |||
| 79455cf421 | |||
| 30d6a015e0 | |||
| 8af5a5c4e5 | |||
| 3a5f0afcd2 | |||
| c7e63aa4d8 | |||
| 4a9ce1784c | |||
| 7e4e709b43 | |||
| 63d8eabed0 | |||
| e830b01383 | |||
| ff6473980d | |||
| a164aea35d | |||
| a76f547e11 | |||
| b7b7676d67 | |||
| e6e3c55ef2 | |||
| f98a4920f9 | |||
| d4bfc23ef0 | |||
| 9a2160fa55 | |||
| 2de4118243 | |||
| 239b7befdd | |||
| 09e974d483 | |||
| e5ef4fa99a | |||
| 037bcd942c | |||
| c2e7507ad4 | |||
| 3aa2b6a637 | |||
| 555aa21905 | |||
| e7ae3bf3d6 | |||
| b932c048ac | |||
| e85829450d | |||
| effc5d24fa | |||
| 18ed3132d2 | |||
| 9b459eca88 | |||
| 70fedd0f79 | |||
| bb103b29bf | |||
| 248e76c4df | |||
| 803d5c35f3 | |||
| 7fd8c0f85c | |||
| 44c3a5abc3 | |||
| 6909a76201 | |||
| 045533716b | |||
| 3c0ff914ac | |||
| 2bc4be4e32 | |||
| c67abd614f | |||
| 6fa7cd3dbc | |||
| 94744ba41a | |||
| 4965ec42d2 | |||
| 73aa7041bf | |||
| 7c1f760024 | |||
| da461f3cbf | |||
| 5b800f0932 | |||
| 8427f70493 | |||
| 7a7992085b | |||
| 1286211f57 | |||
| 6d531ad7b8 | |||
| 762b424a52 | |||
| de1cb38769 | |||
| c802f5430d | |||
| cff8991a50 | |||
| f3f8d8fff4 | |||
| 26df46ee59 | |||
| c3f687ac22 | |||
| 04437e313d | |||
| 038bededba | |||
| d03308be0c | |||
| c6bc0034d0 | |||
| 70e132244a | |||
| 47e9038d23 | |||
| 432cf22a6a | |||
| 2914006fe0 | |||
| 7329ff5468 | |||
| 541d1df486 | |||
| 3b00ff9138 | |||
| 91276c5721 | |||
| 0b4167526d | |||
| fd5fd26902 | |||
| 3bbaacbe15 | |||
| a10314c6b3 | |||
| 70f2c2a709 | |||
| 280d074103 | |||
| 32b14baf8a | |||
| 2d9045fce8 | |||
| 355f66348c | |||
| 8693e47e6a | |||
| cec8c7d7f8 | |||
| 4d0ec37267 | |||
| e7f720ea56 | |||
| 4ae17bf1e2 | |||
| 8a49eea74b | |||
| b4245a48df | |||
| 4e0f6076be | |||
| 726efc6a32 | |||
| bd45912b99 |
Files changed:

```diff
@@ -8,12 +8,12 @@ import zipfile
 # Note that we have 400 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/3792 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
 
 
 def print_top_10_largest_files(zip_file):
     """Print the top 10 largest files in the given zip file."""
-    with zipfile.ZipFile(zip_file, 'r') as z:
+    with zipfile.ZipFile(zip_file, "r") as z:
         file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
         file_sizes.sort(key=lambda x: x[1], reverse=True)
         for f, size in file_sizes[:10]:
@@ -28,14 +28,18 @@ def check_wheel_size(directory):
                 wheel_path = os.path.join(root, file_name)
                 wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
                 if wheel_size_mb > VLLM_MAX_SIZE_MB:
-                    print(f"Not allowed: Wheel {wheel_path} is larger "
-                          f"({wheel_size_mb:.2f} MB) than the limit "
-                          f"({VLLM_MAX_SIZE_MB} MB).")
+                    print(
+                        f"Not allowed: Wheel {wheel_path} is larger "
+                        f"({wheel_size_mb:.2f} MB) than the limit "
+                        f"({VLLM_MAX_SIZE_MB} MB)."
+                    )
                     print_top_10_largest_files(wheel_path)
                     return 1
                 else:
-                    print(f"Wheel {wheel_path} is within the allowed size "
-                          f"({wheel_size_mb:.2f} MB).")
+                    print(
+                        f"Wheel {wheel_path} is within the allowed size "
+                        f"({wheel_size_mb:.2f} MB)."
+                    )
                     return 0
 
 
@@ -45,4 +49,4 @@ if __name__ == "__main__":
         sys.exit(1)
 
     directory = sys.argv[1]
     sys.exit(check_wheel_size(directory))
```
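The wheel-size check above reads the target directory from `sys.argv[1]`, honors the `VLLM_MAX_SIZE_MB` environment variable, and returns a non-zero status when a wheel exceeds the limit. Below is a minimal sketch of driving it locally, assuming the script lives at `.buildkite/check-wheel-size.py` and the freshly built wheels sit in `dist/` (both paths are assumptions, not shown in this diff):

```python
# Sketch of a local run of the wheel-size check shown above.
# The script path and the "dist" wheel directory are assumptions.
import os
import subprocess

env = dict(os.environ, VLLM_MAX_SIZE_MB="400")  # quota noted in the file's comments
subprocess.run(
    ["python", ".buildkite/check-wheel-size.py", "dist"],
    env=env,
    check=True,  # raises CalledProcessError if any wheel is over the limit
)
```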
```diff
@@ -22,5 +22,5 @@ with open("index.html", "w") as f:
     print(f"Generated index.html for {args.wheel}")
     # cloudfront requires escaping the '+' character
     f.write(
-        template.format(wheel=filename,
-                        wheel_html_escaped=filename.replace("+", "%2B")))
+        template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
+    )
```
```diff
@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
 model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
 tasks:

@@ -1,3 +1,4 @@
+# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
 model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
 tasks:

@@ -1,3 +1,4 @@
+# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
 model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
 model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
 tasks:

@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
 model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
 tasks:
 - name: "gsm8k"

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
 model_name: "HandH1998/QQQ-Llama-3-8b-g128"
 tasks:

@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
+model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.335
+  - name: "exact_match,flexible-extract"
+    value: 0.323
+limit: 1319
+num_fewshot: 5

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
 model_name: "mgoin/Minitron-4B-Base-FP8"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
 model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
 model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
 tasks:

@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
 model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
 tasks:
 - name: "gsm8k"

@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
+model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.30
+  - name: "exact_match,flexible-extract"
+    value: 0.465
+limit: 1319
+num_fewshot: 5

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
 model_name: "Qwen/Qwen2-57B-A14B-Instruct"
 tasks:

@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
+model_name: "Qwen/Qwen2.5-1.5B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.54
+  - name: "exact_match,flexible-extract"
+    value: 0.59
+limit: 1319
+num_fewshot: 5

@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
+model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.47
+  - name: "exact_match,flexible-extract"
+    value: 0.64
+limit: 1319
+num_fewshot: 5

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
 model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
 tasks:

@@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
+Meta-Llama-3-8B-QQQ.yaml

@@ -1,10 +1,6 @@
-Meta-Llama-3-8B-Instruct.yaml
-Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
+Qwen2.5-1.5B-Instruct.yaml
 Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
-Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
-Minitron-4B-Base-FP8.yaml
-Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
-Qwen2-1.5B-Instruct-FP8W8.yaml
-Meta-Llama-3-8B-QQQ.yaml
+Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+Qwen1.5-MoE-W4A16-compressed-tensors.yaml
```
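All of these eval configs share one small schema: `model_name`, a `tasks` list whose metrics carry ground-truth values, plus `limit` and `num_fewshot`. Below is a minimal sketch of parsing one of them, using the values from the new Qwen2.5-1.5B-Instruct entry above (PyYAML assumed available, as it is already used by the test harness in this diff):

```python
# Parse one of the eval configs above and pull out the gsm8k baselines.
# The YAML text mirrors the new Qwen2.5-1.5B-Instruct config verbatim.
import yaml

config_text = """\
model_name: "Qwen/Qwen2.5-1.5B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.54
  - name: "exact_match,flexible-extract"
    value: 0.59
limit: 1319
num_fewshot: 5
"""

eval_config = yaml.safe_load(config_text)
for task in eval_config["tasks"]:
    for metric in task["metrics"]:
        print(task["name"], metric["name"], metric["value"])
# gsm8k exact_match,strict-match 0.54
# gsm8k exact_match,flexible-extract 0.59
```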
.buildkite/lm-eval-harness/conftest.py (new file, 43 lines)

```diff
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: Apache-2.0
+from pathlib import Path
+
+import pytest
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--config-list-file",
+        action="store",
+        help="Path to the file listing model config YAMLs (one per line)",
+    )
+    parser.addoption(
+        "--tp-size",
+        action="store",
+        default="1",
+        help="Tensor parallel size to use for evaluation",
+    )
+
+
+@pytest.fixture(scope="session")
+def config_list_file(pytestconfig, config_dir):
+    rel_path = pytestconfig.getoption("--config-list-file")
+    return config_dir / rel_path
+
+
+@pytest.fixture(scope="session")
+def tp_size(pytestconfig):
+    return pytestconfig.getoption("--tp-size")
+
+
+def pytest_generate_tests(metafunc):
+    if "config_filename" in metafunc.fixturenames:
+        rel_path = metafunc.config.getoption("--config-list-file")
+        config_list_file = Path(rel_path).resolve()
+        config_dir = config_list_file.parent
+        with open(config_list_file, encoding="utf-8") as f:
+            configs = [
+                config_dir / line.strip()
+                for line in f
+                if line.strip() and not line.startswith("#")
+            ]
+        metafunc.parametrize("config_filename", configs)
```
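With this conftest.py, the model list and tensor-parallel size move from environment variables to pytest options, e.g. `pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1` (the updated test docstring further down shows the same invocation). A small sketch of the parametrization step in isolation, with an assumed list-file path:

```python
# Sketch of what pytest_generate_tests above does with --config-list-file:
# every non-comment, non-blank line becomes one parametrized config path.
from pathlib import Path

list_file = Path("configs/models-small.txt").resolve()  # assumed relative path
config_dir = list_file.parent
configs = [
    config_dir / line.strip()
    for line in list_file.read_text(encoding="utf-8").splitlines()
    if line.strip() and not line.startswith("#")
]
print(configs)  # one YAML per model; each becomes a separate test case
```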
```diff
@@ -1,59 +0,0 @@
-#!/bin/bash
-
-usage() {
-    echo``
-    echo "Runs lm eval harness on GSM8k using vllm and compares to "
-    echo "precomputed baseline (measured by HF transformers.)"
-    echo
-    echo "usage: ${0} <options>"
-    echo
-    echo "  -c    - path to the test data config (e.g. configs/small-models.txt)"
-    echo "  -t    - tensor parallel size"
-    echo
-}
-
-SUCCESS=0
-
-while getopts "c:t:" OPT; do
-  case ${OPT} in
-    c )
-      CONFIG="$OPTARG"
-      ;;
-    t )
-      TP_SIZE="$OPTARG"
-      ;;
-    \? )
-      usage
-      exit 1
-      ;;
-  esac
-done
-
-# Parse list of configs.
-IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
-
-for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
-do
-    LOCAL_SUCCESS=0
-
-    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
-
-    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
-    export LM_EVAL_TP_SIZE=$TP_SIZE
-    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
-
-    if [[ $LOCAL_SUCCESS == 0 ]]; then
-        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
-    else
-        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
-    fi
-
-    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
-
-done
-
-if [ "${SUCCESS}" -eq "0" ]; then
-    exit 0
-else
-    exit 1
-fi
```
```diff
@@ -3,67 +3,52 @@
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
 
-* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
-* export LM_EVAL_TP_SIZE=4
-* pytest -s test_lm_eval_correctness.py
+pytest -s -v test_lm_eval_correctness.py \
+    --config-list-file=configs/models-small.txt \
+    --tp-size=1
 """
 
-import os
-from pathlib import Path
-
 import lm_eval
-import numpy
-import pytest
+import numpy as np
 import yaml
 
-RTOL = 0.05
-TEST_DATA_FILE = os.environ.get(
-    "LM_EVAL_TEST_DATA_FILE",
-    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
-
-TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
+RTOL = 0.08
 
 
-def launch_lm_eval(eval_config):
-    trust_remote_code = eval_config.get('trust_remote_code', False)
-    model_args = f"pretrained={eval_config['model_name']}," \
-                 f"tensor_parallel_size={TP_SIZE}," \
-                 f"add_bos_token=true," \
-                 f"trust_remote_code={trust_remote_code}"
+def launch_lm_eval(eval_config, tp_size):
+    trust_remote_code = eval_config.get("trust_remote_code", False)
+    model_args = (
+        f"pretrained={eval_config['model_name']},"
+        f"tensor_parallel_size={tp_size},"
+        f"enforce_eager=true,"
+        f"add_bos_token=true,"
+        f"trust_remote_code={trust_remote_code}"
+    )
     results = lm_eval.simple_evaluate(
         model="vllm",
         model_args=model_args,
         tasks=[task["name"] for task in eval_config["tasks"]],
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
-        batch_size="auto")
+        batch_size="auto",
+    )
     return results
 
 
-def test_lm_eval_correctness():
-    eval_config = yaml.safe_load(
-        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
-
-    if eval_config[
-            "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform":  #noqa: E501
-        pytest.skip("FBGEMM is currently failing on main.")
-
-    # Launch eval requests.
-    results = launch_lm_eval(eval_config)
-
-    # Confirm scores match ground truth.
+def test_lm_eval_correctness_param(config_filename, tp_size):
+    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
+
+    results = launch_lm_eval(eval_config, tp_size)
+
     success = True
     for task in eval_config["tasks"]:
         for metric in task["metrics"]:
             ground_truth = metric["value"]
             measured_value = results["results"][task["name"]][metric["name"]]
-            print(f'{task["name"]} | {metric["name"]}: '
-                  f'ground_truth={ground_truth} | measured={measured_value}')
-            success = success and numpy.isclose(
-                ground_truth, measured_value, rtol=RTOL)
+            print(
+                f"{task['name']} | {metric['name']}: "
+                f"ground_truth={ground_truth} | measured={measured_value}"
+            )
+            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
 
-    # Assert at the end, print all scores even on failure for debugging.
     assert success
```
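One behavioural change in the rewritten test above: the relative tolerance is relaxed from `RTOL = 0.05` to `RTOL = 0.08`, and the comparison now goes through `np.isclose`. A small worked example using the strict-match baseline from the new Qwen2.5-1.5B-Instruct config (the measured score here is hypothetical):

```python
# np.isclose(a, b, rtol) passes when |a - b| <= atol + rtol * |b| (atol ~ 1e-8).
import numpy as np

RTOL = 0.08
ground_truth = 0.54      # exact_match,strict-match value from the new config above
measured_value = 0.58    # hypothetical measured score

print(np.isclose(ground_truth, measured_value, rtol=RTOL))  # True: 0.04 <= 0.08 * 0.58
print(np.isclose(ground_truth, measured_value, rtol=0.05))  # False under the old 0.05 tolerance
```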
```diff
@@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do
 
 ### Visualizing the results
 
-The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
+The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
 You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
 If you do not see the table, please wait till the benchmark finish running.
 The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
```
@ -65,18 +65,18 @@ def read_markdown(file):
|
|||||||
|
|
||||||
|
|
||||||
def results_to_json(latency, throughput, serving):
|
def results_to_json(latency, throughput, serving):
|
||||||
return json.dumps({
|
return json.dumps(
|
||||||
'latency': latency.to_dict(),
|
{
|
||||||
'throughput': throughput.to_dict(),
|
"latency": latency.to_dict(),
|
||||||
'serving': serving.to_dict()
|
"throughput": throughput.to_dict(),
|
||||||
})
|
"serving": serving.to_dict(),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
# collect results
|
# collect results
|
||||||
for test_file in results_folder.glob("*.json"):
|
for test_file in results_folder.glob("*.json"):
|
||||||
|
|
||||||
with open(test_file) as f:
|
with open(test_file) as f:
|
||||||
raw_result = json.loads(f.read())
|
raw_result = json.loads(f.read())
|
||||||
|
|
||||||
@ -120,7 +120,8 @@ if __name__ == "__main__":
|
|||||||
for perc in [10, 25, 50, 75, 90, 99]:
|
for perc in [10, 25, 50, 75, 90, 99]:
|
||||||
# Multiply 1000 to convert the time unit from s to ms
|
# Multiply 1000 to convert the time unit from s to ms
|
||||||
raw_result.update(
|
raw_result.update(
|
||||||
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
|
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
|
||||||
|
)
|
||||||
raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
|
raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
|
||||||
|
|
||||||
# add the result to raw_result
|
# add the result to raw_result
|
||||||
@@ -153,26 +154,27 @@ if __name__ == "__main__":
     serving_results = pd.DataFrame.from_dict(serving_results)
     throughput_results = pd.DataFrame.from_dict(throughput_results)

-    raw_results_json = results_to_json(latency_results, throughput_results,
-                                       serving_results)
+    raw_results_json = results_to_json(
+        latency_results, throughput_results, serving_results
+    )

     # remapping the key, for visualization purpose
     if not latency_results.empty:
-        latency_results = latency_results[list(
-            latency_column_mapping.keys())].rename(
-                columns=latency_column_mapping)
+        latency_results = latency_results[list(latency_column_mapping.keys())].rename(
+            columns=latency_column_mapping
+        )
     if not serving_results.empty:
-        serving_results = serving_results[list(
-            serving_column_mapping.keys())].rename(
-                columns=serving_column_mapping)
+        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
+            columns=serving_column_mapping
+        )
     if not throughput_results.empty:
-        throughput_results = throughput_results[list(
-            throughput_results_column_mapping.keys())].rename(
-                columns=throughput_results_column_mapping)
+        throughput_results = throughput_results[
+            list(throughput_results_column_mapping.keys())
+        ].rename(columns=throughput_results_column_mapping)

-    processed_results_json = results_to_json(latency_results,
-                                             throughput_results,
-                                             serving_results)
+    processed_results_json = results_to_json(
+        latency_results, throughput_results, serving_results
+    )

     for df in [latency_results, serving_results, throughput_results]:
         if df.empty:
@@ -184,38 +186,39 @@ if __name__ == "__main__":
         # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
         # we want to turn it into "8xGPUTYPE"
         df["GPU"] = df["GPU"].apply(
-            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
+            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
+        )

     # get markdown tables
-    latency_md_table = tabulate(latency_results,
-                                headers='keys',
-                                tablefmt='pipe',
-                                showindex=False)
-    serving_md_table = tabulate(serving_results,
-                                headers='keys',
-                                tablefmt='pipe',
-                                showindex=False)
-    throughput_md_table = tabulate(throughput_results,
-                                   headers='keys',
-                                   tablefmt='pipe',
-                                   showindex=False)
+    latency_md_table = tabulate(
+        latency_results, headers="keys", tablefmt="pipe", showindex=False
+    )
+    serving_md_table = tabulate(
+        serving_results, headers="keys", tablefmt="pipe", showindex=False
+    )
+    throughput_md_table = tabulate(
+        throughput_results, headers="keys", tablefmt="pipe", showindex=False
+    )

     # document the result
     with open(results_folder / "benchmark_results.md", "w") as f:
-        results = read_markdown("../.buildkite/nightly-benchmarks/" +
-                                "performance-benchmarks-descriptions.md")
+        results = read_markdown(
+            "../.buildkite/nightly-benchmarks/"
+            + "performance-benchmarks-descriptions.md"
+        )
         results = results.format(
             latency_tests_markdown_table=latency_md_table,
             throughput_tests_markdown_table=throughput_md_table,
             serving_tests_markdown_table=serving_md_table,
-            benchmarking_results_in_json_string=processed_results_json)
+            benchmarking_results_in_json_string=processed_results_json,
+        )
         f.write(results)

     # document benchmarking results in json
     with open(results_folder / "benchmark_results.json", "w") as f:
-        results = latency_results.to_dict(
-            orient='records') + throughput_results.to_dict(
-                orient='records') + serving_results.to_dict(orient='records')
+        results = (
+            latency_results.to_dict(orient="records")
+            + throughput_results.to_dict(orient="records")
+            + serving_results.to_dict(orient="records")
+        )
         f.write(json.dumps(results))
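For readers skimming the hunk above, the GPU-name normalization is the least obvious part: `nvidia-smi` can report one line per device, and the lambda collapses that into a count-prefixed name. A minimal standalone illustration (the device name is an assumption):

```python
# nvidia-smi-style field listing the same GPU once per device.
gpu_field = "\n".join(["NVIDIA H100"] * 8)

# Same transformation as the lambda in the diff, split out for clarity.
parts = gpu_field.split("\n")
normalized = f"{len(parts)}x{parts[0]}"
print(normalized)  # 8xNVIDIA H100
```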
@@ -14,15 +14,12 @@ def main(model, cachedir):

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="Download and save Hugging Face tokenizer")
-    parser.add_argument("--model",
-                        type=str,
-                        required=True,
-                        help="Name of the model")
-    parser.add_argument("--cachedir",
-                        type=str,
-                        required=True,
-                        help="Directory to save the tokenizer")
+        description="Download and save Hugging Face tokenizer"
+    )
+    parser.add_argument("--model", type=str, required=True, help="Name of the model")
+    parser.add_argument(
+        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
+    )

     args = parser.parse_args()
     main(args.model, args.cachedir)
@@ -11,33 +11,33 @@ from tabulate import tabulate

 def parse_arguments():
     parser = argparse.ArgumentParser(
-        description=
-        'Parse command line arguments for summary-nightly-results script.')
-    parser.add_argument('--results-folder',
-                        type=str,
-                        required=True,
-                        help='The folder where the results are stored.')
-    parser.add_argument('--description',
-                        type=str,
-                        required=True,
-                        help='Description of the results.')
+        description="Parse command line arguments for summary-nightly-results script."
+    )
+    parser.add_argument(
+        "--results-folder",
+        type=str,
+        required=True,
+        help="The folder where the results are stored.",
+    )
+    parser.add_argument(
+        "--description", type=str, required=True, help="Description of the results."
+    )

     args = parser.parse_args()
     return args


 def get_perf(df, method, model, metric):

     means = []

     for qps in [2, 4, 8, 16, "inf"]:
-        target = df['Test name'].str.contains(model)
-        target = target & df['Engine'].str.contains(method)
-        target = target & df['Test name'].str.contains("qps_" + str(qps))
+        target = df["Test name"].str.contains(model)
+        target = target & df["Engine"].str.contains(method)
+        target = target & df["Test name"].str.contains("qps_" + str(qps))
         filtered_df = df[target]

         if filtered_df.empty:
-            means.append(0.)
+            means.append(0.0)
         else:
             means.append(filtered_df[metric].values[0])

@@ -45,7 +45,6 @@ def get_perf(df, method, model, metric):


 def get_perf_w_std(df, method, model, metric):
-
     if metric in ["TTFT", "ITL"]:
         mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
         mean = mean.tolist()
@@ -60,7 +59,8 @@ def get_perf_w_std(df, method, model, metric):
     else:
         assert metric == "Tput"
         mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
-            df, method, model, "Output Tput (tok/s)")
+            df, method, model, "Output Tput (tok/s)"
+        )
         mean = mean.tolist()
         std = None

@@ -80,18 +80,17 @@ def main(args):
     # generate markdown table
     df = pd.DataFrame.from_dict(results)

-    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
+    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)

     with open(args.description) as f:
         description = f.read()

-    description = description.format(
-        nightly_results_benchmarking_table=md_table)
+    description = description.format(nightly_results_benchmarking_table=md_table)

     with open("nightly_results.md", "w") as f:
         f.write(description)


-if __name__ == '__main__':
+if __name__ == "__main__":
     args = parse_arguments()
     main(args)
@@ -10,15 +10,24 @@ set -x
 set -o pipefail

 check_gpus() {
-  # check the number of GPUs and GPU type.
-  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if command -v nvidia-smi; then
+    # check the number of GPUs and GPU type.
+    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  elif command -v amd-smi; then
+    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
+  fi

   if [[ $gpu_count -gt 0 ]]; then
     echo "GPU found."
   else
     echo "Need at least 1 GPU to run benchmarking."
     exit 1
   fi
-  declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
+  if command -v nvidia-smi; then
+    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
+  elif command -v amd-smi; then
+    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
+  fi
   echo "GPU type is $gpu_type"
 }

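The shell hunk above prefers `nvidia-smi` and falls back to `amd-smi` on ROCm hosts. A hedged Python sketch of the same detection order, offered only as a reading aid (the `amd-smi list` output parsing is an assumption, not taken from the script):

```python
import shutil
import subprocess


def count_gpus() -> int:
    """Count GPUs, preferring nvidia-smi and falling back to amd-smi."""
    if shutil.which("nvidia-smi"):
        out = subprocess.run(
            ["nvidia-smi", "--list-gpus"], capture_output=True, text=True, check=True
        )
        return len(out.stdout.splitlines())
    if shutil.which("amd-smi"):
        out = subprocess.run(
            ["amd-smi", "list"], capture_output=True, text=True, check=True
        )
        return sum("GPU" in line for line in out.stdout.splitlines())
    return 0


print(count_gpus())
```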
@@ -90,9 +99,15 @@ kill_gpu_processes() {


   # wait until GPU memory usage smaller than 1GB
-  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
-    sleep 1
-  done
+  if command -v nvidia-smi; then
+    while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
+      sleep 1
+    done
+  elif command -v amd-smi; then
+    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
+      sleep 1
+    done
+  fi

   # remove vllm config file
   rm -rf ~/.config/vllm
@@ -34,10 +34,8 @@ serving_column_mapping = {
 }

 if __name__ == "__main__":

     # collect results
     for test_file in results_folder.glob("*.json"):

         with open(test_file) as f:
             raw_result = json.loads(f.read())

@@ -56,17 +54,16 @@ if __name__ == "__main__":
     serving_results = pd.DataFrame.from_dict(serving_results)

     if not serving_results.empty:
-        serving_results = serving_results[list(
-            serving_column_mapping.keys())].rename(
-                columns=serving_column_mapping)
+        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
+            columns=serving_column_mapping
+        )

-    serving_md_table_with_headers = tabulate(serving_results,
-                                             headers='keys',
-                                             tablefmt='pipe',
-                                             showindex=False)
+    serving_md_table_with_headers = tabulate(
+        serving_results, headers="keys", tablefmt="pipe", showindex=False
+    )
     # remove the first line of header
-    serving_md_table_lines = serving_md_table_with_headers.split('\n')
-    serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
+    serving_md_table_lines = serving_md_table_with_headers.split("\n")
+    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])

     prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
     prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
@@ -76,10 +73,9 @@ if __name__ == "__main__":
         # document results with header.
         # for those who wants to reproduce our benchmark.
         f.write(serving_md_table_with_headers)
-        f.write('\n')
+        f.write("\n")

     # document benchmarking results in json
     with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
-        results = serving_results.to_dict(orient='records')
+        results = serving_results.to_dict(orient="records")
         f.write(json.dumps(results))
@@ -63,10 +63,12 @@
       "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
       "disable_log_requests": "",
       "tensor_parallel_size": 4,
       "swap_space": 16,
-      "speculative_model": "turboderp/Qwama-0.5B-Instruct",
-      "num_speculative_tokens": 4,
-      "speculative_draft_tensor_parallel_size": 1
+      "speculative_config": {
+        "model": "turboderp/Qwama-0.5B-Instruct",
+        "num_speculative_tokens": 4,
+        "draft_tensor_parallel_size": 1
+      }
     },
     "client_parameters": {
       "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
.buildkite/pyproject.toml (new file, 46 lines)
@@ -0,0 +1,46 @@
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed

[tool.ruff]
line-length = 88

[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]

[tool.ruff.lint]
select = [
    # pycodestyle
    "E",
    # Pyflakes
    "F",
    # pyupgrade
    "UP",
    # flake8-bugbear
    "B",
    # flake8-simplify
    "SIM",
    # isort
    "I",
    # flake8-logging-format
    "G",
]
ignore = [
    # star imports
    "F405", "F403",
    # lambda expression assignment
    "E731",
    # Loop control variable not used within loop body
    "B007",
    # f-string format
    "UP032",
    # Can remove once 3.10+ is the minimum Python version
    "UP007",
]

[tool.ruff.format]
docstring-code-format = true
@@ -1,23 +1,23 @@
 steps:
-  - label: "Build wheel - CUDA 12.4"
+  - label: "Build wheel - CUDA 12.8"
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/upload-wheels.sh"
+      - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"

-  - label: "Build wheel - CUDA 12.1"
+  - label: "Build wheel - CUDA 12.6"
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/upload-wheels.sh"
+      - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"

@@ -31,10 +31,10 @@ steps:
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/upload-wheels.sh"
+      - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"

@@ -48,7 +48,7 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

   - label: "Build and publish TPU release image"
@@ -57,12 +57,14 @@ steps:
     agents:
       queue: tpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
+      - "yes | docker system prune -a"
+      - "git fetch --all"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
       - "docker push vllm/vllm-tpu:nightly"
       - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
     plugins:
       - docker-login#v3.0.0:
-          username: vllm
+          username: vllmbot
           password-env: DOCKERHUB_TOKEN
     env:
       DOCKER_BUILDKIT: "1"
@@ -82,7 +84,22 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
     env:
       DOCKER_BUILDKIT: "1"

+  - block: "Build Neuron release image"
+    key: block-neuron-release-image-build
+    depends_on: ~
+
+  - label: "Build and publish Neuron release image"
+    depends_on: block-neuron-release-image-build
+    agents:
+      queue: neuron-postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
+    env:
+      DOCKER_BUILDKIT: "1"
@@ -1,42 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# Build the docker image.
-docker build -f Dockerfile.tpu -t vllm-tpu .
-
-# Set up cleanup.
-remove_docker_container() { docker rm -f tpu-test || true; }
-trap remove_docker_container EXIT
-# Remove the container that might not be cleaned up in the previous run.
-remove_docker_container
-
-# For HF_TOKEN.
-source /etc/environment
-# Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it \
-    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
-    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
-    && python3 -m pip install pytest \
-    && python3 -m pip install lm_eval[api]==0.4.4 \
-    && export VLLM_USE_V1=1 \
-    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
-    && echo TEST_1 \
-    && pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
-    && echo TEST_2 \
-    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
-    && echo TEST_3 \
-    && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
-    && echo TEST_4 \
-    && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
-    && echo TEST_5 \
-    && python3 /workspace/vllm/examples/offline_inference/tpu.py \
-    && echo TEST_6 \
-    && pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py \
-    && echo TEST_7 \
-    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" \
-
-
-# TODO: This test fails because it uses RANDOM_SEED sampling
-# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
@@ -3,6 +3,9 @@
 # This script runs test inside the corresponding ROCm docker container.
 set -o pipefail

+# Export Python path
+export PYTHONPATH=".."
+
 # Print ROCm version
 echo "--- Confirming Clean Initial State"
 while true; do
@@ -74,50 +77,102 @@ HF_MOUNT="/root/.cache/huggingface"

 commands=$@
 echo "Commands:$commands"

+if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
+  commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
+fi
+
+if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
+  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
+fi
+
+if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
+  commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
+fi
+
+if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
+  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
+fi
+
 #ignore certain kernels tests
-if [[ $commands == *" kernels "* ]]; then
+if [[ $commands == *" kernels/core"* ]]; then
   commands="${commands} \
-  --ignore=kernels/test_attention_selector.py \
-  --ignore=kernels/test_blocksparse_attention.py \
-  --ignore=kernels/test_causal_conv1d.py \
-  --ignore=kernels/test_cutlass.py \
-  --ignore=kernels/test_encoder_decoder_attn.py \
-  --ignore=kernels/test_flash_attn.py \
-  --ignore=kernels/test_flashinfer.py \
-  --ignore=kernels/test_int8_quant.py \
-  --ignore=kernels/test_machete_gemm.py \
-  --ignore=kernels/test_mamba_ssm.py \
-  --ignore=kernels/test_marlin_gemm.py \
-  --ignore=kernels/test_moe.py \
-  --ignore=kernels/test_prefix_prefill.py \
-  --ignore=kernels/test_rand.py \
-  --ignore=kernels/test_sampler.py \
-  --ignore=kernels/test_cascade_flash_attn.py \
-  --ignore=kernels/test_mamba_mixer2.py \
-  --ignore=kernels/test_aqlm.py \
-  --ignore=kernels/test_machete_mm.py \
-  --ignore=kernels/test_mha_attn.py \
-  --ignore=kernels/test_block_fp8.py \
-  --ignore=kernels/test_permute_cols.py"
+  --ignore=kernels/core/test_fused_quant_layernorm.py \
+  --ignore=kernels/core/test_permute_cols.py"
+fi
+
+if [[ $commands == *" kernels/attention"* ]]; then
+  commands="${commands} \
+  --ignore=kernels/attention/stest_attention_selector.py \
+  --ignore=kernels/attention/test_blocksparse_attention.py \
+  --ignore=kernels/attention/test_encoder_decoder_attn.py \
+  --ignore=kernels/attention/test_attention_selector.py \
+  --ignore=kernels/attention/test_flash_attn.py \
+  --ignore=kernels/attention/test_flashinfer.py \
+  --ignore=kernels/attention/test_prefix_prefill.py \
+  --ignore=kernels/attention/test_cascade_flash_attn.py \
+  --ignore=kernels/attention/test_mha_attn.py \
+  --ignore=kernels/attention/test_lightning_attn.py \
+  --ignore=kernels/attention/test_attention.py"
+fi
+
+if [[ $commands == *" kernels/quantization"* ]]; then
+  commands="${commands} \
+  --ignore=kernels/quantization/test_int8_quant.py \
+  --ignore=kernels/quantization/test_aqlm.py \
+  --ignore=kernels/quantization/test_machete_mm.py \
+  --ignore=kernels/quantization/test_block_fp8.py \
+  --ignore=kernels/quantization/test_block_int8.py \
+  --ignore=kernels/quantization/test_marlin_gemm.py \
+  --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
+  --ignore=kernels/quantization/test_int8_kernel.py"
+fi
+
+if [[ $commands == *" kernels/mamba"* ]]; then
+  commands="${commands} \
+  --ignore=kernels/mamba/test_mamba_mixer2.py \
+  --ignore=kernels/mamba/test_causal_conv1d.py \
+  --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
+fi
+
+if [[ $commands == *" kernels/moe"* ]]; then
+  commands="${commands} \
+  --ignore=kernels/moe/test_moe.py \
+  --ignore=kernels/moe/test_cutlass_moe.py \
+  --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
 fi

 #ignore certain Entrypoints/openai tests
 if [[ $commands == *" entrypoints/openai "* ]]; then
   commands=${commands//" entrypoints/openai "/" entrypoints/openai \
   --ignore=entrypoints/openai/test_audio.py \
-  --ignore=entrypoints/openai/test_chat.py \
   --ignore=entrypoints/openai/test_shutdown.py \
   --ignore=entrypoints/openai/test_completion.py \
   --ignore=entrypoints/openai/test_sleep.py \
   --ignore=entrypoints/openai/test_models.py \
+  --ignore=entrypoints/openai/test_lora_adapters.py \
+  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
+  --ignore=entrypoints/openai/test_root_path.py \
+  --ignore=entrypoints/openai/test_tokenization.py \
   --ignore=entrypoints/openai/test_prompt_validation.py "}
 fi

 #ignore certain Entrypoints/llm tests
-if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
-  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
+if [[ $commands == *" entrypoints/llm "* ]]; then
+  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
+  --ignore=entrypoints/llm/test_chat.py \
+  --ignore=entrypoints/llm/test_accuracy.py \
+  --ignore=entrypoints/llm/test_init.py \
+  --ignore=entrypoints/llm/test_generate_multiple_loras.py \
+  --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi

+#Obsolete currently
+##ignore certain Entrypoints/llm tests
+#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
+#  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
+#fi
+
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
@@ -126,6 +181,8 @@ fi


 PARALLEL_JOB_COUNT=8
+MYPYTHONPATH=".."
+
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then
   # assign job count as the number of shards used
@@ -146,6 +203,7 @@ if [[ $commands == *"--shard-id="* ]]; then
       -e AWS_SECRET_ACCESS_KEY \
       -v "${HF_CACHE}:${HF_MOUNT}" \
       -e "HF_HOME=${HF_MOUNT}" \
+      -e "PYTHONPATH=${MYPYTHONPATH}" \
       --name "${container_name}_${GPU}" \
       "${image_name}" \
       /bin/bash -c "${commands_gpu}" \
@@ -176,6 +234,7 @@ else
       -e AWS_SECRET_ACCESS_KEY \
       -v "${HF_CACHE}:${HF_MOUNT}" \
       -e "HF_HOME=${HF_MOUNT}" \
+      -e "PYTHONPATH=${MYPYTHONPATH}" \
       --name "${container_name}" \
       "${image_name}" \
       /bin/bash -c "${commands}"
.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh (new executable file, 48 lines)
@@ -0,0 +1,48 @@
#!/bin/bash

# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex

# Setup cleanup
remove_docker_container() {
  if [[ -n "$container_id" ]]; then
    podman rm -f "$container_id" || true
  fi
  podman system prune -f
}
trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .

# Run the image
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)

function cpu_tests() {

  # offline inference
  podman exec -it "$container_id" bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

  # Run basic model test
  podman exec -it "$container_id" bash -c "
    set -e
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
    pip install sentence-transformers datamodel_code_generator
    pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
    pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
}

# All of CPU tests are expected to be finished less than 40 mins.

export container_id
export -f cpu_tests
timeout 40m bash -c cpu_tests
@@ -10,5 +10,4 @@ trap remove_docker_container EXIT
 remove_docker_container
-
 # Try building the docker image
-docker build -t cpu-test -f Dockerfile.ppc64le .
+docker build -t cpu-test -f docker/Dockerfile.s390x .

@@ -8,15 +8,19 @@ set -ex
 CORE_RANGE=${CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}

-# Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
-
 # Setup cleanup
-remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
+remove_docker_container() {
+  set -e;
+  docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
+  docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
+}
 trap remove_docker_container EXIT
 remove_docker_container

+# Try building the docker image
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
   --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
@@ -36,8 +40,6 @@ function cpu_tests() {
   # Run basic model test
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
-    pip install -r vllm/requirements/test.txt
-    pip install -r vllm/requirements/cpu.txt
     pytest -v -s tests/kernels/test_cache.py -m cpu_model
     pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
     pytest -v -s tests/models/decoder_only/language -m cpu_model
@@ -9,6 +9,7 @@ python3 use_existing_torch.py

 # Try building the docker image
 DOCKER_BUILDKIT=1 docker build . \
+  --file docker/Dockerfile \
   --target vllm-openai \
   --platform "linux/arm64" \
   -t gh200-test \
@@ -5,20 +5,22 @@
 set -ex

 # Try building the docker image
-docker build -t hpu-test-env -f Dockerfile.hpu .
+docker build -t hpu-test-env -f docker/Dockerfile.hpu .

 # Setup cleanup
 # certain versions of HPU software stack have a bug that can
 # override the exit code of the script, so we need to use
-# separate remove_docker_container and remove_docker_container_and_exit
+# separate remove_docker_containers and remove_docker_containers_and_exit
 # functions, while other platforms only need one remove_docker_container
 # function.
 EXITCODE=1
-remove_docker_container() { docker rm -f hpu-test || true; }
-remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
-trap remove_docker_container_and_exit EXIT
-remove_docker_container
+remove_docker_containers() { docker rm -f hpu-test || true; docker rm -f hpu-test-tp2 || true; }
+remove_docker_containers_and_exit() { remove_docker_containers; exit $EXITCODE; }
+trap remove_docker_containers_and_exit EXIT
+remove_docker_containers

 # Run the image and launch offline inference
 docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+docker run --runtime=habana --name=hpu-test-tp2 --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --tensor-parallel-size 2
+
 EXITCODE=$?
@@ -11,13 +11,14 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
+HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
+
 NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
 mkdir -p "${NEURON_COMPILE_CACHE_URL}"
 NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"

 # Try building the docker image
-aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws

 # prune old image and containers to save disk space, and only once a day
 # by using a timestamp file in tmp.
@@ -35,7 +36,7 @@ else
   date "+%s" > /tmp/neuron-docker-build-timestamp
 fi

-docker build -t "${image_name}" -f Dockerfile.neuron .
+docker build -t "${image_name}" -f docker/Dockerfile.neuron .

 # Setup cleanup
 remove_docker_container() {
|
|||||||
docker run --rm -it --device=/dev/neuron0 --network bridge \
|
docker run --rm -it --device=/dev/neuron0 --network bridge \
|
||||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||||
-e "HF_HOME=${HF_MOUNT}" \
|
-e "HF_HOME=${HF_MOUNT}" \
|
||||||
|
-e "HF_TOKEN=${HF_TOKEN}" \
|
||||||
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
|
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
|
||||||
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
|
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
|
||||||
--name "${container_name}" \
|
--name "${container_name}" \
|
||||||
${image_name} \
|
${image_name} \
|
||||||
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
|
/bin/bash -c "
|
||||||
|
python3 /workspace/vllm/examples/offline_inference/neuron.py;
|
||||||
|
python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
|
||||||
|
for f in /workspace/vllm/tests/neuron/2_core/*.py; do
|
||||||
|
echo 'Running test file: '$f;
|
||||||
|
python3 -m pytest \$f -v --capture=tee-sys;
|
||||||
|
done
|
||||||
|
"
|
||||||
.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh (new executable file, 181 lines)
@@ -0,0 +1,181 @@
#!/bin/bash
|
||||||
|
|
||||||
|
set -xu
|
||||||
|
|
||||||
|
|
||||||
|
remove_docker_container() {
|
||||||
|
docker rm -f tpu-test || true;
|
||||||
|
docker rm -f vllm-tpu || true;
|
||||||
|
}
|
||||||
|
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
|
||||||
|
# Remove the container that might not be cleaned up in the previous run.
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
|
# Build the docker image.
|
||||||
|
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
||||||
|
|
||||||
|
# Set up cleanup.
|
||||||
|
cleanup_docker() {
|
||||||
|
# Get Docker's root directory
|
||||||
|
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||||
|
if [ -z "$docker_root" ]; then
|
||||||
|
echo "Failed to determine Docker root directory."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "Docker root directory: $docker_root"
|
||||||
|
# Check disk usage of the filesystem where Docker's root directory is located
|
||||||
|
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||||
|
# Define the threshold
|
||||||
|
threshold=70
|
||||||
|
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||||
|
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||||
|
# Remove dangling images (those that are not tagged and not used by any container)
|
||||||
|
docker image prune -f
|
||||||
|
# Remove unused volumes / force the system prune for old images as well.
|
||||||
|
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||||
|
echo "Docker images and volumes cleanup completed."
|
||||||
|
else
|
||||||
|
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
cleanup_docker
|
||||||
|
|
||||||
|
# For HF_TOKEN.
|
||||||
|
source /etc/environment
|
||||||
|
|
||||||
|
docker run --privileged --net host --shm-size=16G -it \
|
||||||
|
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
||||||
|
vllm-tpu /bin/bash -c '
|
||||||
|
set -e # Exit immediately if a command exits with a non-zero status.
|
||||||
|
set -u # Treat unset variables as an error.
|
||||||
|
|
||||||
|
echo "--- Starting script inside Docker container ---"
|
||||||
|
|
||||||
|
# Create results directory
|
||||||
|
RESULTS_DIR=$(mktemp -d)
|
||||||
|
# If mktemp fails, set -e will cause the script to exit.
|
||||||
|
echo "Results will be stored in: $RESULTS_DIR"
|
||||||
|
|
||||||
|
# Install dependencies
|
||||||
|
echo "--- Installing Python dependencies ---"
|
||||||
|
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||||
|
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||||
|
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
|
||||||
|
echo "--- Python dependencies installed ---"
|
||||||
|
export VLLM_USE_V1=1
|
||||||
|
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||||
|
export VLLM_XLA_CACHE_PATH=
|
||||||
|
echo "Using VLLM V1"
|
||||||
|
|
||||||
|
echo "--- Hardware Information ---"
|
||||||
|
tpu-info
|
||||||
|
echo "--- Starting Tests ---"
|
||||||
|
set +e
|
||||||
|
overall_script_exit_code=0
|
||||||
|
|
||||||
|
# --- Test Definitions ---
|
||||||
|
# If a test fails, this function will print logs and will not cause the main script to exit.
|
||||||
|
run_test() {
|
||||||
|
local test_num=$1
|
||||||
|
local test_name=$2
|
||||||
|
local test_command=$3
|
||||||
|
local log_file="$RESULTS_DIR/test_${test_num}.log"
|
||||||
|
local actual_exit_code
|
||||||
|
|
||||||
|
echo "--- TEST_$test_num: Running $test_name ---"
|
||||||
|
|
||||||
|
# Execute the test command.
|
||||||
|
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
|
||||||
|
actual_exit_code=$?
|
||||||
|
|
||||||
|
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
|
||||||
|
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
|
||||||
|
|
||||||
|
if [ "$actual_exit_code" -ne 0 ]; then
|
||||||
|
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
|
||||||
|
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
|
||||||
|
if [ -f "$log_file" ]; then
|
||||||
|
cat "$log_file" >&2
|
||||||
|
else
|
||||||
|
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
|
||||||
|
fi
|
||||||
|
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
|
||||||
|
return "$actual_exit_code" # Return the failure code
|
||||||
|
else
|
||||||
|
echo "TEST_$test_num ($test_name) PASSED."
|
||||||
|
return 0 # Return success
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# Helper function to call run_test and update the overall script exit code
|
||||||
|
run_and_track_test() {
|
||||||
|
local test_num_arg="$1"
|
||||||
|
local test_name_arg="$2"
|
||||||
|
local test_command_arg="$3"
|
||||||
|
|
||||||
|
# Run the test
|
||||||
|
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
|
||||||
|
local test_specific_exit_code=$?
|
||||||
|
|
||||||
|
# If the test failed, set the overall script exit code to 1
|
||||||
|
if [ "$test_specific_exit_code" -ne 0 ]; then
|
||||||
|
# No need for extra echo here, run_test already logged the failure.
|
||||||
|
overall_script_exit_code=1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# --- Actual Test Execution ---
|
||||||
|
run_and_track_test 0 "test_perf.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
|
||||||
|
run_and_track_test 1 "test_compilation.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
|
||||||
|
run_and_track_test 2 "test_basic.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
|
||||||
|
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
|
||||||
|
run_and_track_test 4 "test_quantization_accuracy.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
|
||||||
|
run_and_track_test 5 "examples/offline_inference/tpu.py" \
|
||||||
|
"python3 /workspace/vllm/examples/offline_inference/tpu.py"
|
||||||
|
run_and_track_test 6 "test_tpu_model_runner.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
|
||||||
|
run_and_track_test 7 "test_sampler.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
|
||||||
|
run_and_track_test 8 "test_topk_topp_sampler.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
|
||||||
|
run_and_track_test 9 "test_multimodal.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
|
||||||
|
run_and_track_test 10 "test_pallas.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
|
||||||
|
run_and_track_test 11 "test_struct_output_generate.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py"
|
||||||
|
run_and_track_test 12 "test_moe_pallas.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
|
||||||
|
run_and_track_test 13 "test_lora.py" \
|
||||||
|
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
|
||||||
|
|
||||||
|
# After all tests have been attempted, exit with the overall status.
|
||||||
|
if [ "$overall_script_exit_code" -ne 0 ]; then
|
||||||
|
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
|
||||||
|
else
|
||||||
|
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
|
||||||
|
fi
|
||||||
|
exit "$overall_script_exit_code"
|
||||||
|
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
|
||||||
|
|
||||||
|
# Capture the exit code of the docker run command
|
||||||
|
DOCKER_RUN_EXIT_CODE=$?
|
||||||
|
|
||||||
|
# The trap will run for cleanup.
|
||||||
|
# Exit the main script with the Docker run command's exit code.
|
||||||
|
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
|
||||||
|
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
|
||||||
|
exit "$DOCKER_RUN_EXIT_CODE"
|
||||||
|
else
|
||||||
|
echo "Docker run command completed successfully."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
# TODO: This test fails because it uses RANDOM_SEED sampling
|
||||||
|
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||||
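Note: the lone `'` above is not stray; it closes a `bash -c '...'` payload whose opening `docker run` appears earlier in the script, outside this excerpt. A minimal sketch of how such a wrapper is usually structured — the container name, image tag, and docker flags here are illustrative assumptions, not the values used by this CI script:

```bash
# Sketch only: run the test payload inside a container and propagate its exit code.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT

docker run --privileged --net host --shm-size=16G --name tpu-test \
    example/vllm-tpu-ci:latest /bin/bash -c '
set -e
# ... the per-test body shown above runs here ...
'
DOCKER_RUN_EXIT_CODE=$?   # captured immediately after docker run, as in the script above
```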
@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 
 # Try building the docker image
-docker build -t ${image_name} -f Dockerfile.xpu .
+docker build -t ${image_name} -f docker/Dockerfile.xpu .
 
 # Setup cleanup
 remove_docker_container() {
@@ -5,8 +5,8 @@
 set -ex
 set -o pipefail
 
-# cd into parent directory of this file
-cd "$(dirname "${BASH_SOURCE[0]}")/.."
+# cd 2 levels into the working directory
+cd "$(dirname "${BASH_SOURCE[0]}")/../.."
 
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
 
@@ -3,7 +3,7 @@
 set -euox pipefail
 
 if [[ $# -lt 4 ]]; then
-    echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
+    echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
     exit 1
 fi
 
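The updated usage string implies invocations of the following shape; every value below (working directory, node and GPU counts, image, and per-node commands) is a placeholder rather than something taken from the CI config:

```bash
# Hypothetical example: 2 nodes, 4 GPUs per node, one command per node.
bash .buildkite/scripts/run-multi-node-test.sh \
    /vllm-workspace/tests 2 4 \
    example.registry/vllm-ci:latest \
    "echo 'command for node 0'" \
    "echo 'command for node 1'"
```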
@@ -50,11 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 if [[ $normal_wheel == *"cu118"* ]]; then
     # if $normal_wheel matches cu118, do not upload the index.html
     echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu121"* ]]; then
-    # if $normal_wheel matches cu121, do not upload the index.html
-    echo "Skipping index files for cu121 wheels"
+elif [[ $normal_wheel == *"cu126"* ]]; then
+    # if $normal_wheel matches cu126, do not upload the index.html
+    echo "Skipping index files for cu126 wheels"
 else
-    # only upload index.html for cu124 wheels (default wheels)
+    # only upload index.html for cu128 wheels (default wheels)
     aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
     aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 fi
@@ -66,12 +66,13 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
 if [[ $normal_wheel == *"cu118"* ]]; then
     # if $normal_wheel matches cu118, do not upload the index.html
     echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu121"* ]]; then
-    # if $normal_wheel matches cu121, do not upload the index.html
-    echo "Skipping index files for cu121 wheels"
+elif [[ $normal_wheel == *"cu126"* ]]; then
+    # if $normal_wheel matches cu126, do not upload the index.html
+    echo "Skipping index files for cu126 wheels"
 else
-    # only upload index.html for cu124 wheels (default wheels)
+    # only upload index.html for cu128 wheels (default wheels)
     aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 fi
 
 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
+aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
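Both hunks above branch the same way on the CUDA suffix of the wheel name; a compact sketch of that pattern, with the helper name and destination prefix as assumptions (the actual script keeps the logic inline):

```bash
# Sketch only: publish index.html alongside the wheel for the default CUDA build.
upload_index_if_default_cuda() {
    local wheel_file="$1" dest_prefix="$2"
    case "$wheel_file" in
        *cu118*|*cu126*)
            echo "Skipping index files for $wheel_file" ;;          # non-default variants
        *)
            aws s3 cp index.html "${dest_prefix}/vllm/index.html" ;; # default (cu128) wheels
    esac
}
```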
@@ -8,6 +8,7 @@
 # Documentation
 # label(str): the name of the test. emoji allowed.
 # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
+# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
 # fast_check_only(bool): run this test on fastcheck pipeline only
 # optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
 # command(str): the single command to run for tests. incompatible with commands.
@ -31,16 +32,17 @@ steps:
|
|||||||
##### fast check tests #####
|
##### fast check tests #####
|
||||||
|
|
||||||
- label: Documentation Build # 2min
|
- label: Documentation Build # 2min
|
||||||
working_dir: "/vllm-workspace/test_docs/docs"
|
mirror_hardwares: [amdexperimental]
|
||||||
|
working_dir: "/vllm-workspace/test_docs"
|
||||||
fast_check: true
|
fast_check: true
|
||||||
no_gpu: True
|
no_gpu: True
|
||||||
commands:
|
commands:
|
||||||
- pip install -r ../../requirements/docs.txt
|
- pip install -r ../requirements/docs.txt
|
||||||
- SPHINXOPTS=\"-W\" make html
|
# TODO: add `--strict` once warnings in docstrings are fixed
|
||||||
# Check API reference (if it fails, you may have missing mock imports)
|
- mkdocs build
|
||||||
- grep \"sig sig-object py\" build/html/api/inference_params.html
|
|
||||||
|
|
||||||
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/mq_llm_engine
|
- tests/mq_llm_engine
|
||||||
@ -56,11 +58,13 @@ steps:
|
|||||||
- pytest -v -s async_engine # AsyncLLMEngine
|
- pytest -v -s async_engine # AsyncLLMEngine
|
||||||
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
|
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
|
||||||
- pytest -v -s test_inputs.py
|
- pytest -v -s test_inputs.py
|
||||||
|
- pytest -v -s test_outputs.py
|
||||||
- pytest -v -s multimodal
|
- pytest -v -s multimodal
|
||||||
- pytest -v -s test_utils.py # Utils
|
- pytest -v -s test_utils.py # Utils
|
||||||
- pytest -v -s worker # Worker
|
- pytest -v -s worker # Worker
|
||||||
|
|
||||||
- label: Python-only Installation Test
|
- label: Python-only Installation Test
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- tests/standalone_tests/python_only_compile.sh
|
- tests/standalone_tests/python_only_compile.sh
|
||||||
- setup.py
|
- setup.py
|
||||||
@ -68,8 +72,9 @@ steps:
|
|||||||
- bash standalone_tests/python_only_compile.sh
|
- bash standalone_tests/python_only_compile.sh
|
||||||
|
|
||||||
- label: Basic Correctness Test # 30min
|
- label: Basic Correctness Test # 30min
|
||||||
#mirror_hardwares: [amd]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
fast_check: true
|
fast_check: true
|
||||||
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/basic_correctness/test_basic_correctness
|
- tests/basic_correctness/test_basic_correctness
|
||||||
@ -84,6 +89,7 @@ steps:
|
|||||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||||
|
|
||||||
- label: Chunked Prefill Test
|
- label: Chunked Prefill Test
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/basic_correctness/test_chunked_prefill
|
- tests/basic_correctness/test_chunked_prefill
|
||||||
@ -92,7 +98,7 @@ steps:
|
|||||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
|
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||||
|
|
||||||
- label: Core Test # 10min
|
- label: Core Test # 10min
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
fast_check: true
|
fast_check: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/core
|
- vllm/core
|
||||||
@ -102,9 +108,10 @@ steps:
|
|||||||
- pytest -v -s core
|
- pytest -v -s core
|
||||||
|
|
||||||
- label: Entrypoints Test # 40min
|
- label: Entrypoints Test # 40min
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
fast_check: true
|
fast_check: true
|
||||||
mirror_hardwares: [amd]
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/entrypoints/llm
|
- tests/entrypoints/llm
|
||||||
@ -118,11 +125,12 @@ steps:
|
|||||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
|
||||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
|
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/
|
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
|
||||||
- pytest -v -s entrypoints/test_chat_utils.py
|
- pytest -v -s entrypoints/test_chat_utils.py
|
||||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||||
|
|
||||||
- label: Distributed Tests (4 GPUs) # 10min
|
- label: Distributed Tests (4 GPUs) # 10min
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@ -130,6 +138,7 @@ steps:
|
|||||||
- vllm/core/
|
- vllm/core/
|
||||||
- tests/distributed/test_utils
|
- tests/distributed/test_utils
|
||||||
- tests/distributed/test_pynccl
|
- tests/distributed/test_pynccl
|
||||||
|
- tests/distributed/test_events
|
||||||
- tests/spec_decode/e2e/test_integration_dist_tp4
|
- tests/spec_decode/e2e/test_integration_dist_tp4
|
||||||
- tests/compile/test_basic_correctness
|
- tests/compile/test_basic_correctness
|
||||||
- examples/offline_inference/rlhf.py
|
- examples/offline_inference/rlhf.py
|
||||||
@ -140,21 +149,25 @@ steps:
|
|||||||
# test with tp=2 and external_dp=2
|
# test with tp=2 and external_dp=2
|
||||||
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||||
|
# test with tp=2 and pp=2
|
||||||
|
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||||
# test with internal dp
|
# test with internal dp
|
||||||
- python3 ../examples/offline_inference/data_parallel.py
|
- python3 ../examples/offline_inference/data_parallel.py
|
||||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||||
- pytest -v -s distributed/test_utils.py
|
- pytest -v -s distributed/test_utils.py
|
||||||
- pytest -v -s compile/test_basic_correctness.py
|
- pytest -v -s compile/test_basic_correctness.py
|
||||||
- pytest -v -s distributed/test_pynccl.py
|
- pytest -v -s distributed/test_pynccl.py
|
||||||
|
- pytest -v -s distributed/test_events.py
|
||||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
||||||
# TODO: create a dedicated test section for multi-GPU example tests
|
# TODO: create a dedicated test section for multi-GPU example tests
|
||||||
# when we have multiple distributed example tests
|
# when we have multiple distributed example tests
|
||||||
- pushd ../examples/offline_inference
|
- pushd ../examples/offline_inference
|
||||||
- VLLM_ENABLE_V1_MULTIPROCESSING=0 python3 rlhf.py
|
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||||
- VLLM_ENABLE_V1_MULTIPROCESSING=0 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||||
- popd
|
- popd
|
||||||
|
|
||||||
- label: Metrics, Tracing Test # 10min
|
- label: Metrics, Tracing Test # 10min
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
@ -162,18 +175,13 @@ steps:
|
|||||||
- tests/tracing
|
- tests/tracing
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s metrics
|
- pytest -v -s metrics
|
||||||
- "pip install \
|
|
||||||
'opentelemetry-sdk>=1.26.0,<1.27.0' \
|
|
||||||
'opentelemetry-api>=1.26.0,<1.27.0' \
|
|
||||||
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
|
|
||||||
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
|
|
||||||
- pytest -v -s tracing
|
- pytest -v -s tracing
|
||||||
|
|
||||||
##### fast check tests #####
|
##### fast check tests #####
|
||||||
##### 1 GPU test #####
|
##### 1 GPU test #####
|
||||||
|
|
||||||
- label: Regression Test # 5min
|
- label: Regression Test # 5min
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/test_regression
|
- tests/test_regression
|
||||||
@ -183,7 +191,7 @@ steps:
|
|||||||
working_dir: "/vllm-workspace/tests" # optional
|
working_dir: "/vllm-workspace/tests" # optional
|
||||||
|
|
||||||
- label: Engine Test # 10min
|
- label: Engine Test # 10min
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/engine
|
- tests/engine
|
||||||
@ -191,28 +199,31 @@ steps:
|
|||||||
- tests/test_sequence
|
- tests/test_sequence
|
||||||
- tests/test_config
|
- tests/test_config
|
||||||
- tests/test_logger
|
- tests/test_logger
|
||||||
|
- tests/test_vllm_port
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
|
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||||
# OOM in the CI unless we run this separately
|
# OOM in the CI unless we run this separately
|
||||||
- pytest -v -s tokenization
|
- pytest -v -s tokenization
|
||||||
|
|
||||||
- label: V1 Test
|
- label: V1 Test
|
||||||
#mirror_hardwares: [amd]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/v1
|
- tests/v1
|
||||||
commands:
|
commands:
|
||||||
# split the test to avoid interference
|
# split the test to avoid interference
|
||||||
- pytest -v -s v1/core
|
- pytest -v -s v1/core
|
||||||
- pytest -v -s v1/entrypoints
|
|
||||||
- pytest -v -s v1/engine
|
- pytest -v -s v1/engine
|
||||||
- pytest -v -s v1/entrypoints
|
- pytest -v -s v1/entrypoints
|
||||||
- pytest -v -s v1/sample
|
- pytest -v -s v1/sample
|
||||||
- pytest -v -s v1/worker
|
- pytest -v -s v1/worker
|
||||||
- pytest -v -s v1/structured_output
|
- pytest -v -s v1/structured_output
|
||||||
- pytest -v -s v1/test_stats.py
|
- pytest -v -s v1/spec_decode
|
||||||
|
- pytest -v -s v1/kv_connector/unit
|
||||||
|
- pytest -v -s v1/test_serial_utils.py
|
||||||
- pytest -v -s v1/test_utils.py
|
- pytest -v -s v1/test_utils.py
|
||||||
- pytest -v -s v1/test_oracle.py
|
- pytest -v -s v1/test_oracle.py
|
||||||
|
- pytest -v -s v1/test_metrics_reader.py
|
||||||
# TODO: accuracy does not match, whether setting
|
# TODO: accuracy does not match, whether setting
|
||||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||||
- pytest -v -s v1/e2e
|
- pytest -v -s v1/e2e
|
||||||
@ -221,8 +232,8 @@ steps:
|
|||||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||||
|
|
||||||
- label: Examples Test # 25min
|
- label: Examples Test # 25min
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
working_dir: "/vllm-workspace/examples"
|
working_dir: "/vllm-workspace/examples"
|
||||||
#mirror_hardwares: [amd]
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/entrypoints
|
- vllm/entrypoints
|
||||||
- examples/
|
- examples/
|
||||||
@ -237,7 +248,7 @@ steps:
|
|||||||
- python3 offline_inference/vision_language.py --seed 0
|
- python3 offline_inference/vision_language.py --seed 0
|
||||||
- python3 offline_inference/vision_language_embedding.py --seed 0
|
- python3 offline_inference/vision_language_embedding.py --seed 0
|
||||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||||
- VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||||
- python3 offline_inference/encoder_decoder.py
|
- python3 offline_inference/encoder_decoder.py
|
||||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||||
- python3 offline_inference/basic/classify.py
|
- python3 offline_inference/basic/classify.py
|
||||||
@ -246,7 +257,7 @@ steps:
|
|||||||
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
|
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
|
||||||
|
|
||||||
- label: Prefix Caching Test # 9min
|
- label: Prefix Caching Test # 9min
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/prefix_caching
|
- tests/prefix_caching
|
||||||
@ -254,6 +265,7 @@ steps:
|
|||||||
- pytest -v -s prefix_caching
|
- pytest -v -s prefix_caching
|
||||||
|
|
||||||
- label: Samplers Test # 36min
|
- label: Samplers Test # 36min
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor/layers
|
- vllm/model_executor/layers
|
||||||
- vllm/sampling_metadata.py
|
- vllm/sampling_metadata.py
|
||||||
@ -263,18 +275,8 @@ steps:
|
|||||||
- pytest -v -s samplers
|
- pytest -v -s samplers
|
||||||
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
||||||
|
|
||||||
- label: LogitsProcessor Test # 5min
|
|
||||||
mirror_hardwares: [amd]
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/model_executor/layers
|
|
||||||
- vllm/model_executor/guided_decoding
|
|
||||||
- tests/test_logits_processor
|
|
||||||
- tests/model_executor/test_guided_processors
|
|
||||||
commands:
|
|
||||||
- pytest -v -s test_logits_processor.py
|
|
||||||
- pytest -v -s model_executor/test_guided_processors.py
|
|
||||||
|
|
||||||
- label: Speculative decoding tests # 40min
|
- label: Speculative decoding tests # 40min
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/spec_decode
|
- vllm/spec_decode
|
||||||
- tests/spec_decode
|
- tests/spec_decode
|
||||||
@ -285,14 +287,29 @@ steps:
|
|||||||
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
|
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
|
||||||
|
|
||||||
- label: LoRA Test %N # 15min each
|
- label: LoRA Test %N # 15min each
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/lora
|
- vllm/lora
|
||||||
- tests/lora
|
- tests/lora
|
||||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
|
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
|
||||||
parallelism: 4
|
parallelism: 4
|
||||||
|
|
||||||
|
- label: PyTorch Compilation Unit Tests
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
torch_nightly: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/compile
|
||||||
|
commands:
|
||||||
|
- pytest -v -s compile/test_pass_manager.py
|
||||||
|
- pytest -v -s compile/test_fusion.py
|
||||||
|
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
||||||
|
- pytest -v -s compile/test_sequence_parallelism.py
|
||||||
|
- pytest -v -s compile/test_async_tp.py
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Smoke Test # 9min
|
- label: PyTorch Fullgraph Smoke Test # 9min
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
@ -301,61 +318,123 @@ steps:
|
|||||||
# these tests need to be separated, cannot combine
|
# these tests need to be separated, cannot combine
|
||||||
- pytest -v -s compile/piecewise/test_simple.py
|
- pytest -v -s compile/piecewise/test_simple.py
|
||||||
- pytest -v -s compile/piecewise/test_toy_llama.py
|
- pytest -v -s compile/piecewise/test_toy_llama.py
|
||||||
- pytest -v -s compile/test_pass_manager.py
|
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Test # 18min
|
- label: PyTorch Fullgraph Test # 18min
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s compile/test_full_graph.py
|
- pytest -v -s compile/test_full_graph.py
|
||||||
|
|
||||||
- label: Kernels Test %N # 1h each
|
- label: Kernels Core Operation Test
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- vllm/attention
|
- tests/kernels/core
|
||||||
- tests/kernels
|
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
- pytest -v -s kernels/core
|
||||||
parallelism: 4
|
|
||||||
|
- label: Kernels Attention Test %N
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/attention/
|
||||||
|
- vllm/attention
|
||||||
|
- vllm/v1/attention
|
||||||
|
- tests/kernels/attention
|
||||||
|
commands:
|
||||||
|
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||||
|
parallelism: 2
|
||||||
|
|
||||||
|
- label: Kernels Quantization Test %N
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/
|
||||||
|
- vllm/model_executor/layers/quantization
|
||||||
|
- tests/kernels/quantization
|
||||||
|
commands:
|
||||||
|
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||||
|
parallelism: 2
|
||||||
|
|
||||||
|
- label: Kernels MoE Test
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/moe/
|
||||||
|
- tests/kernels/moe
|
||||||
|
- vllm/model_executor/layers/fused_moe/
|
||||||
|
commands:
|
||||||
|
- pytest -v -s kernels/moe
|
||||||
|
|
||||||
|
- label: Kernels Mamba Test
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/mamba/
|
||||||
|
- tests/kernels/mamba
|
||||||
|
commands:
|
||||||
|
- pytest -v -s kernels/mamba
|
||||||
|
|
||||||
- label: Tensorizer Test # 11min
|
- label: Tensorizer Test # 11min
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
soft_fail: true
|
soft_fail: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor/model_loader
|
- vllm/model_executor/model_loader
|
||||||
- tests/tensorizer_loader
|
- tests/tensorizer_loader
|
||||||
|
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||||
commands:
|
commands:
|
||||||
- apt-get update && apt-get install -y curl libsodium23
|
- apt-get update && apt-get install -y curl libsodium23
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s tensorizer_loader
|
- pytest -v -s tensorizer_loader
|
||||||
|
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||||
|
|
||||||
|
- label: Model Executor Test
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
soft_fail: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/model_executor
|
||||||
|
- tests/model_executor
|
||||||
|
commands:
|
||||||
|
- apt-get update && apt-get install -y curl libsodium23
|
||||||
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
|
- pytest -v -s model_executor
|
||||||
|
|
||||||
- label: Benchmarks # 9min
|
- label: Benchmarks # 9min
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
working_dir: "/vllm-workspace/.buildkite"
|
working_dir: "/vllm-workspace/.buildkite"
|
||||||
mirror_hardwares: [amd]
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- benchmarks/
|
- benchmarks/
|
||||||
commands:
|
commands:
|
||||||
- bash run-benchmarks.sh
|
- bash scripts/run-benchmarks.sh
|
||||||
|
|
||||||
- label: Quantization Test # 33min
|
- label: Benchmarks CLI Test # 10min
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/benchmarks/
|
||||||
|
commands:
|
||||||
|
- pytest -v -s benchmarks/
|
||||||
|
|
||||||
|
- label: Quantization Test
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
- tests/quantization
|
- tests/quantization
|
||||||
command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
commands:
|
||||||
|
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||||
|
|
||||||
- label: LM Eval Small Models # 53min
|
- label: LM Eval Small Models # 53min
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- bash ./run-tests.sh -c configs/models-small.txt -t 1
|
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||||
|
|
||||||
- label: OpenAI API correctness
|
- label: OpenAI API correctness
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- vllm/entrypoints/openai/
|
- vllm/entrypoints/openai/
|
||||||
@ -364,6 +443,7 @@ steps:
|
|||||||
- pytest -s entrypoints/openai/correctness/
|
- pytest -s entrypoints/openai/correctness/
|
||||||
|
|
||||||
- label: Encoder Decoder tests # 5min
|
- label: Encoder Decoder tests # 5min
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/encoder_decoder
|
- tests/encoder_decoder
|
||||||
@ -371,98 +451,117 @@ steps:
|
|||||||
- pytest -v -s encoder_decoder
|
- pytest -v -s encoder_decoder
|
||||||
|
|
||||||
- label: OpenAI-Compatible Tool Use # 20 min
|
- label: OpenAI-Compatible Tool Use # 20 min
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
fast_check: false
|
fast_check: false
|
||||||
mirror_hardwares: [ amd ]
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/tool_use
|
- tests/tool_use
|
||||||
|
- tests/mistral_tool_use
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s tool_use
|
- pytest -v -s tool_use
|
||||||
|
- pytest -v -s mistral_tool_use
|
||||||
|
|
||||||
##### models test #####
|
##### models test #####
|
||||||
|
|
||||||
- label: Basic Models Test # 24min
|
- label: Basic Models Test # 24min
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models
|
- tests/models
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s models/test_transformers.py
|
- pytest -v -s models/test_transformers.py
|
||||||
- pytest -v -s models/test_registry.py
|
- pytest -v -s models/test_registry.py
|
||||||
# V1 Test: https://github.com/vllm-project/vllm/issues/14531
|
- pytest -v -s models/test_utils.py
|
||||||
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
|
- pytest -v -s models/test_vision.py
|
||||||
|
- pytest -v -s models/test_initialization.py
|
||||||
|
|
||||||
- label: Language Models Test (Standard) # 32min
|
- label: Language Models Test (Standard)
|
||||||
#mirror_hardwares: [amd]
|
mirror_hardwares: [amdexperimental]
|
||||||
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/decoder_only/language
|
- tests/models/language
|
||||||
- tests/models/embedding/language
|
|
||||||
- tests/models/encoder_decoder/language
|
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
|
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
||||||
- pytest -v -s models/embedding/language -m core_model
|
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
||||||
|
- pip freeze | grep -E 'torch'
|
||||||
|
- pytest -v -s models/language -m core_model
|
||||||
|
|
||||||
- label: Language Models Test (Extended) # 1h10min
|
- label: Language Models Test (Extended Generation) # 1hr20min
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
optional: true
|
optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/decoder_only/language
|
- tests/models/language/generation
|
||||||
- tests/models/embedding/language
|
|
||||||
- tests/models/encoder_decoder/language
|
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
|
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
||||||
- pytest -v -s models/embedding/language -m 'not core_model'
|
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
||||||
|
- pytest -v -s models/language/generation -m 'not core_model'
|
||||||
|
|
||||||
- label: Multi-Modal Models Test (Standard) # 40min
|
- label: Language Models Test (Extended Pooling) # 36min
|
||||||
#mirror_hardwares: [amd]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/models/decoder_only/audio_language
|
|
||||||
- tests/models/decoder_only/vision_language
|
|
||||||
- tests/models/embedding/vision_language
|
|
||||||
- tests/models/encoder_decoder/audio_language
|
|
||||||
- tests/models/encoder_decoder/vision_language
|
|
||||||
commands:
|
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
|
||||||
- pytest -v -s models/multimodal
|
|
||||||
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
|
|
||||||
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
|
|
||||||
- pytest -v -s models/embedding/vision_language -m core_model
|
|
||||||
- pytest -v -s models/encoder_decoder/audio_language -m core_model
|
|
||||||
- pytest -v -s models/encoder_decoder/language -m core_model
|
|
||||||
- pytest -v -s models/encoder_decoder/vision_language -m core_model
|
|
||||||
|
|
||||||
- label: Multi-Modal Models Test (Extended) 1 # 48m
|
|
||||||
optional: true
|
optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/decoder_only/audio_language
|
- tests/models/language/pooling
|
||||||
- tests/models/decoder_only/vision_language
|
commands:
|
||||||
- tests/models/embedding/vision_language
|
- pytest -v -s models/language/pooling -m 'not core_model'
|
||||||
- tests/models/encoder_decoder/vision_language
|
|
||||||
|
- label: Multi-Modal Models Test (Standard)
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
|
torch_nightly: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/models/multimodal
|
||||||
commands:
|
commands:
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
|
- pip freeze | grep -E 'torch'
|
||||||
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
|
- pytest -v -s models/multimodal/processing
|
||||||
# HACK - run phi3v tests separately to sidestep this transformers bug
|
- pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
|
||||||
# https://github.com/huggingface/transformers/issues/34307
|
- cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||||
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
|
|
||||||
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
|
|
||||||
- pytest -v -s models/embedding/vision_language -m 'not core_model'
|
|
||||||
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
|
|
||||||
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
|
|
||||||
|
|
||||||
- label: Multi-Modal Models Test (Extended) 2 # 38m
|
- label: Multi-Modal Models Test (Extended) 1
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
optional: true
|
optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/decoder_only/vision_language
|
- tests/models/multimodal
|
||||||
commands:
|
commands:
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
|
- pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
|
||||||
|
|
||||||
|
- label: Multi-Modal Models Test (Extended) 2
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
|
optional: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/models/multimodal
|
||||||
|
commands:
|
||||||
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
|
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
|
||||||
|
|
||||||
|
- label: Multi-Modal Models Test (Extended) 3
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
optional: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/models/multimodal
|
||||||
|
commands:
|
||||||
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
|
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
||||||
|
|
||||||
|
- label: Quantized Models Test
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/model_executor/layers/quantization
|
||||||
|
- tests/models/quantization
|
||||||
|
commands:
|
||||||
|
- pytest -v -s models/quantization
|
||||||
|
|
||||||
# This test is used only in PR development phase to test individual models and should never run on main
|
# This test is used only in PR development phase to test individual models and should never run on main
|
||||||
- label: Custom Models Test
|
- label: Custom Models Test
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
optional: true
|
optional: true
|
||||||
commands:
|
commands:
|
||||||
- echo 'Testing custom models...'
|
- echo 'Testing custom models...'
|
||||||
@ -474,6 +573,7 @@ steps:
|
|||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
|
|
||||||
- label: Distributed Comm Ops Test # 7min
|
- label: Distributed Comm Ops Test # 7min
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@ -484,6 +584,7 @@ steps:
|
|||||||
- pytest -v -s distributed/test_shm_broadcast.py
|
- pytest -v -s distributed/test_shm_broadcast.py
|
||||||
|
|
||||||
- label: 2 Node Tests (4 GPUs in total) # 16min
|
- label: 2 Node Tests (4 GPUs in total) # 16min
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
num_nodes: 2
|
num_nodes: 2
|
||||||
@ -502,7 +603,7 @@ steps:
|
|||||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
|
|
||||||
- label: Distributed Tests (2 GPUs) # 40min
|
- label: Distributed Tests (2 GPUs) # 40min
|
||||||
#mirror_hardwares: [amd]
|
mirror_hardwares: [amdexperimental]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@ -517,33 +618,38 @@ steps:
|
|||||||
- vllm/worker/model_runner.py
|
- vllm/worker/model_runner.py
|
||||||
- entrypoints/llm/test_collective_rpc.py
|
- entrypoints/llm/test_collective_rpc.py
|
||||||
- tests/v1/test_async_llm_dp.py
|
- tests/v1/test_async_llm_dp.py
|
||||||
|
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||||
- vllm/v1/engine/
|
- vllm/v1/engine/
|
||||||
commands:
|
commands:
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||||
- VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py
|
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||||
|
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||||
- pytest -v -s ./compile/test_basic_correctness.py
|
- pytest -v -s ./compile/test_basic_correctness.py
|
||||||
- pytest -v -s ./compile/test_wrapper.py
|
- pytest -v -s ./compile/test_wrapper.py
|
||||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||||
# Avoid importing model tests that cause CUDA reinitialization error
|
# Avoid importing model tests that cause CUDA reinitialization error
|
||||||
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
|
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
|
||||||
- pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
|
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
|
||||||
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
|
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
|
||||||
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
|
# test sequence parallel
|
||||||
|
- pytest -v -s distributed/test_sequence_parallel.py
|
||||||
# this test fails consistently.
|
# this test fails consistently.
|
||||||
# TODO: investigate and fix
|
# TODO: investigate and fix
|
||||||
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
||||||
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||||
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
|
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
|
||||||
|
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||||
|
|
||||||
- label: Plugin Tests (2 GPUs) # 40min
|
- label: Plugin Tests (2 GPUs) # 40min
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/plugins/
|
- vllm/plugins/
|
||||||
- tests/plugins/
|
- tests/plugins/
|
||||||
commands:
|
commands:
|
||||||
# begin platform plugin tests, all the code in-between runs on dummy platform
|
# begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
|
||||||
- pip install -e ./plugins/vllm_add_dummy_platform
|
- pip install -e ./plugins/vllm_add_dummy_platform
|
||||||
- pytest -v -s plugins_tests/test_platform_plugins.py
|
- pytest -v -s plugins_tests/test_platform_plugins.py
|
||||||
- pip uninstall vllm_add_dummy_platform -y
|
- pip uninstall vllm_add_dummy_platform -y
|
||||||
@ -554,8 +660,10 @@ steps:
|
|||||||
- pytest -v -s distributed/test_distributed_oot.py
|
- pytest -v -s distributed/test_distributed_oot.py
|
||||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
||||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||||
|
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
||||||
|
|
||||||
- label: Multi-step Tests (4 GPUs) # 36min
|
- label: Multi-step Tests (4 GPUs) # 36min
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@ -576,6 +684,7 @@ steps:
|
|||||||
- pytest -v -s multi_step/test_correctness_llm.py
|
- pytest -v -s multi_step/test_correctness_llm.py
|
||||||
|
|
||||||
- label: Pipeline Parallelism Test # 45min
|
- label: Pipeline Parallelism Test # 45min
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@ -589,6 +698,7 @@ steps:
|
|||||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
|
|
||||||
- label: LoRA TP Test (Distributed)
|
- label: LoRA TP Test (Distributed)
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/lora
|
- vllm/lora
|
||||||
@ -601,11 +711,10 @@ steps:
|
|||||||
# requires multi-GPU testing for validation.
|
# requires multi-GPU testing for validation.
|
||||||
- pytest -v -s -x lora/test_chatglm3_tp.py
|
- pytest -v -s -x lora/test_chatglm3_tp.py
|
||||||
- pytest -v -s -x lora/test_llama_tp.py
|
- pytest -v -s -x lora/test_llama_tp.py
|
||||||
- pytest -v -s -x lora/test_minicpmv_tp.py
|
|
||||||
- pytest -v -s -x lora/test_transfomers_model.py
|
|
||||||
|
|
||||||
|
|
||||||
- label: Weight Loading Multiple GPU Test # 33min
|
- label: Weight Loading Multiple GPU Test # 33min
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@ -615,6 +724,7 @@ steps:
|
|||||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
|
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
|
||||||
|
|
||||||
- label: Weight Loading Multiple GPU Test - Large Models # optional
|
- label: Weight Loading Multiple GPU Test - Large Models # optional
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
gpu: a100
|
gpu: a100
|
||||||
@ -653,4 +763,4 @@ steps:
|
|||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- bash ./run-tests.sh -c configs/models-large.txt -t 4
|
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||||
|
|||||||
7
.github/CODEOWNERS
vendored
7
.github/CODEOWNERS
vendored
@ -12,6 +12,8 @@
|
|||||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
|
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
|
||||||
/vllm/model_executor/guided_decoding @mgoin @russellb
|
/vllm/model_executor/guided_decoding @mgoin @russellb
|
||||||
/vllm/multimodal @DarkLight1337 @ywang96
|
/vllm/multimodal @DarkLight1337 @ywang96
|
||||||
|
/vllm/vllm_flash_attn @LucasWilkinson
|
||||||
|
/vllm/lora @jeejeelee
|
||||||
CMakeLists.txt @tlrmchlsmth
|
CMakeLists.txt @tlrmchlsmth
|
||||||
|
|
||||||
# vLLM V1
|
# vLLM V1
|
||||||
@ -39,3 +41,8 @@ CMakeLists.txt @tlrmchlsmth
|
|||||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
|
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
|
||||||
/tests/v1/structured_output @mgoin @russellb
|
/tests/v1/structured_output @mgoin @russellb
|
||||||
/tests/weight_loading @mgoin @youkaichao
|
/tests/weight_loading @mgoin @youkaichao
|
||||||
|
/tests/lora @jeejeelee
|
||||||
|
|
||||||
|
# Docs
|
||||||
|
/docs @hmellor
|
||||||
|
mkdocs.yaml @hmellor
|
||||||
2
.github/ISSUE_TEMPLATE/200-installation.yml
vendored
2
.github/ISSUE_TEMPLATE/200-installation.yml
vendored
@ -14,7 +14,7 @@ body:
|
|||||||
description: |
|
description: |
|
||||||
Please run the following and paste the output below.
|
Please run the following and paste the output below.
|
||||||
```sh
|
```sh
|
||||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
|
||||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||||
python collect_env.py
|
python collect_env.py
|
||||||
```
|
```
|
||||||
|
|||||||
2
.github/ISSUE_TEMPLATE/300-usage.yml
vendored
2
.github/ISSUE_TEMPLATE/300-usage.yml
vendored
@ -14,7 +14,7 @@ body:
|
|||||||
description: |
|
description: |
|
||||||
Please run the following and paste the output below.
|
Please run the following and paste the output below.
|
||||||
```sh
|
```sh
|
||||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
|
||||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||||
python collect_env.py
|
python collect_env.py
|
||||||
```
|
```
|
||||||
|
|||||||
14
.github/ISSUE_TEMPLATE/400-bug-report.yml
vendored
14
.github/ISSUE_TEMPLATE/400-bug-report.yml
vendored
@ -14,19 +14,19 @@ body:
|
|||||||
description: |
|
description: |
|
||||||
Please run the following and paste the output below.
|
Please run the following and paste the output below.
|
||||||
```sh
|
```sh
|
||||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
|
||||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||||
python collect_env.py
|
python collect_env.py
|
||||||
```
|
```
|
||||||
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
|
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
|
||||||
value: |
|
value: |
|
||||||
<details>
|
<details>
|
||||||
<summary>The output of `python collect_env.py`</summary>
|
<summary>The output of <code>python collect_env.py</code></summary>
|
||||||
|
|
||||||
```text
|
```text
|
||||||
Your output of `python collect_env.py` here
|
Your output of `python collect_env.py` here
|
||||||
```
|
```
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
validations:
|
validations:
|
||||||
required: true
|
required: true
|
||||||
@ -75,20 +75,20 @@ body:
|
|||||||
```
|
```
|
||||||
|
|
||||||
```
|
```
|
||||||
The error message you got, with the full traceback.
|
The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
|
||||||
```
|
```
|
||||||
validations:
|
validations:
|
||||||
required: true
|
required: true
|
||||||
- type: markdown
|
- type: markdown
|
||||||
attributes:
|
attributes:
|
||||||
value: >
|
value: |
|
||||||
⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
|
⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output:
|
||||||
|
|
||||||
- Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
|
- Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
|
||||||
|
|
||||||
- If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
|
- If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
|
||||||
|
|
||||||
Thanks for contributing 🎉!
|
Thanks for reporting 🙏!
|
||||||
- type: checkboxes
|
- type: checkboxes
|
||||||
id: askllm
|
id: askllm
|
||||||
attributes:
|
attributes:
|
||||||
|
|||||||
.github/ISSUE_TEMPLATE/450-ci-failure.yml (vendored, new file, 69 lines)
@@ -0,0 +1,69 @@
+name: 🧪 CI failure report
+description: Report a failing test.
+title: "[CI Failure]: "
+labels: ["ci-failure"]
+
+body:
+- type: markdown
+attributes:
+value: >
+#### Include the name of the failing Buildkite step and test file in the title.
+- type: input
+attributes:
+label: Name of failing test
+description: |
+Paste in the fully-qualified name of the failing test from the logs.
+placeholder: |
+`path/to/test_file.py::test_name[params]`
+validations:
+required: true
+- type: checkboxes
+attributes:
+label: Basic information
+description: Select all items that apply to the failing test.
+options:
+- label: Flaky test
+- label: Can reproduce locally
+- label: Caused by external libraries (e.g. bug in `transformers`)
+- type: textarea
+attributes:
+label: 🧪 Describe the failing test
+description: |
+Please provide a clear and concise description of the failing test.
+placeholder: |
+A clear and concise description of the failing test.
+
+```
+The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
+```
+validations:
+required: true
+- type: textarea
+attributes:
+label: 📝 History of failing test
+description: |
+Since when did the test start to fail?
+You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).
+
+If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:
+
+- Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.
+
+- Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.
+
+- Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
+placeholder: |
+Approximate timeline and/or problematic PRs
+
+A link to the Buildkite analytics of the failing test (if available)
+validations:
+required: true
+- type: textarea
+attributes:
+label: CC List.
+description: >
+The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
+- type: markdown
+attributes:
+value: >
+Thanks for reporting 🙏!
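The template above points reporters at `git bisect` for pinning down the first commit on main where a test started failing. As a rough illustration of that workflow (not part of vLLM; the test id and the good/bad SHAs below are placeholders, and the sketch assumes the test flips from passing to failing exactly once), a binary search over `git rev-list` output might look like this:

```python
# Illustrative helper, not part of the vLLM repository.
import subprocess

TEST = "path/to/test_file.py::test_name"  # placeholder test id


def test_passes(commit: str) -> bool:
    """Check out a commit and run the single failing test."""
    subprocess.run(["git", "checkout", "--quiet", commit], check=True)
    return subprocess.run(["pytest", "-q", TEST]).returncode == 0


def first_bad(good: str, bad: str) -> str:
    """Binary-search the commits between good (exclusive) and bad (inclusive)."""
    revs = subprocess.run(
        ["git", "rev-list", "--reverse", f"{good}..{bad}"],
        capture_output=True, text=True, check=True,
    ).stdout.split()
    lo, hi = 0, len(revs) - 1
    while lo < hi:
        mid = (lo + hi) // 2
        if test_passes(revs[mid]):
            lo = mid + 1  # failure was introduced later
        else:
            hi = mid      # failure is at mid or earlier
    return revs[lo]


if __name__ == "__main__":
    # Replace the placeholders before running.
    print(first_bad("<last-known-good-sha>", "main"))
```

In practice `git bisect run pytest -q <test>` does the same search with less code; the sketch just makes the mechanics explicit.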
.github/ISSUE_TEMPLATE/600-new-model.yml (vendored, 2 changed lines)
@@ -9,7 +9,7 @@ body:
 value: >
 #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).

-#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
+#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
 - type: textarea
 attributes:
 label: The model to consider.
@@ -35,7 +35,7 @@ body:
 description: |
 Please run the following and paste the output below.
 ```sh
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
 # For security purposes, please feel free to check the contents of collect_env.py before running it.
 python collect_env.py
 ```
.github/PULL_REQUEST_TEMPLATE.md (vendored, 2 changed lines)
@@ -3,4 +3,4 @@ FILL IN THE PR DESCRIPTION HERE
 FIX #xxxx (*link existing issues this PR will resolve*)

 <!--- pyml disable-next-line no-emphasis-as-heading -->
-**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**
+**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
.github/mergify.yml (vendored, 45 changed lines)
@@ -19,7 +19,7 @@ pull_request_rules:
 - files~=\.buildkite/
 - files~=^cmake/
 - files=CMakeLists.txt
-- files~=^Dockerfile
+- files~=^docker/Dockerfile
 - files~=^requirements.*\.txt
 - files=setup.py
 actions:
@@ -55,11 +55,19 @@ pull_request_rules:
 description: Automatically apply structured-output label
 conditions:
 - or:
+- files~=^benchmarks/structured_schemas/
+- files=benchmarks/benchmark_serving_structured_output.py
+- files=benchmarks/run_structured_output_benchmark.sh
+- files=docs/features/structured_outputs.md
+- files=examples/offline_inference/structured_outputs.py
+- files=examples/online_serving/openai_chat_completion_structured_outputs.py
+- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
 - files~=^vllm/model_executor/guided_decoding/
 - files=tests/model_executor/test_guided_processors.py
 - files=tests/entrypoints/llm/test_guided_generate.py
-- files=benchmarks/benchmark_serving_guided.py
+- files~=^tests/v1/structured_output/
-- files=benchmarks/benchmark_guided.py
+- files=tests/v1/entrypoints/llm/test_guided_generate.py
+- files~=^vllm/v1/structured_output/
 actions:
 label:
 add:
@@ -118,6 +126,26 @@ pull_request_rules:
 remove:
 - tpu

+- name: label-tool-calling
+description: Automatically add tool-calling label
+conditions:
+- or:
+- files~=^tests/tool_use/
+- files~=^tests/mistral_tool_use/
+- files~=^tests/entrypoints/openai/tool_parsers/
+- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
+- files~=^vllm/entrypoints/openai/tool_parsers/
+- files=docs/features/tool_calling.md
+- files~=^examples/tool_chat_*
+- files=examples/offline_inference/chat_with_tools.py
+- files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
+- files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
+- files=examples/online_serving/openai_chat_completion_client_with_tools.py
+actions:
+label:
+add:
+- tool-calling
+
 - name: ping author on conflicts and add 'needs-rebase' label
 conditions:
 - conflict
@@ -133,6 +161,17 @@ pull_request_rules:

 https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork

+- name: assign reviewer for tensorizer changes
+conditions:
+- files~=^vllm/model_executor/model_loader/tensorizer.py
+- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
+- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
+- files~=^tests/tensorizer_loader/
+actions:
+assign:
+users:
+- "sangstar"
+
 - name: remove 'needs-rebase' label when conflict is resolved
 conditions:
 - -conflict
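For context on the rules above: Mergify's `files~=` conditions are regular-expression tests against the paths a PR touches, while `files=` conditions are exact path matches; when any condition under an `or:` block matches, the listed label is applied. A minimal sketch of that matching logic (illustrative only, not Mergify's implementation; the rule subset and the example path are made up for the demo):

```python
import re

# A small subset of the label rules from the mergify.yml change above.
LABEL_RULES = {
    "structured-output": [
        r"^benchmarks/structured_schemas/",
        r"^tests/v1/structured_output/",
        r"^vllm/v1/structured_output/",
    ],
    "tool-calling": [
        r"^tests/tool_use/",
        r"^vllm/entrypoints/openai/tool_parsers/",
    ],
}


def labels_for(changed_files: list[str]) -> set[str]:
    """Return every label whose path patterns match at least one changed file."""
    return {
        label
        for label, patterns in LABEL_RULES.items()
        for path in changed_files
        if any(re.search(pattern, path) for pattern in patterns)
    }


print(labels_for(["vllm/v1/structured_output/example.py"]))
# -> {'structured-output'}
```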
.github/scripts/cleanup_pr_body.sh (vendored, 2 changed lines)
@@ -26,7 +26,7 @@ sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"

 # Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
 python3 - <<EOF
-import re
+import regex as re

 with open("${NEW}", "r") as file:
 content = file.read()
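The embedded script switches from the standard-library `re` to the third-party `regex` package (installed by the companion workflow change further down); `regex` is an API-compatible superset of `re`, which is why the `import regex as re` alias works without touching the rest of the script. The `<details>` stripping it performs is along these lines; the pattern and the `pr_body.md` file name below are illustrative, not the ones in the actual script:

```python
import regex as re  # drop-in superset of the stdlib `re` API

pr_body = open("pr_body.md").read()  # placeholder input file

# Remove the whole "PR Checklist (Click to Expand)" <details> block, if present.
pattern = re.compile(
    r"<details>.*?<summary>[^<]*PR Checklist \(Click to Expand\)[^<]*</summary>"
    r".*?</details>",
    flags=re.DOTALL | re.IGNORECASE,
)
cleaned = pattern.sub("", pr_body)

with open("pr_body.md", "w") as f:
    f.write(cleaned)
```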
.github/workflows/add_label_automerge.yml (vendored, 2 changed lines)
@@ -1,4 +1,6 @@
 name: Add label on auto-merge enabled
+permissions:
+pull-requests: write
 on:
 pull_request_target:
 types:
.github/workflows/cleanup_pr_body.yml (vendored, 7 changed lines)
@@ -20,7 +20,12 @@ jobs:
 with:
 python-version: '3.12'

+- name: Install Python dependencies
+run: |
+python3 -m pip install --upgrade pip
+python3 -m pip install regex
+
 - name: Update PR description
 env:
 GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
+run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
.github/workflows/lint-and-deploy.yaml (vendored, 9 changed lines)
@@ -2,6 +2,9 @@ name: Lint and Deploy Charts

 on: pull_request

+permissions:
+contents: read
+
 jobs:
 lint-and-deploy:
 runs-on: ubuntu-latest
@@ -50,7 +53,7 @@ jobs:
 uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0

 - name: Build the Docker image vllm cpu
-run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
+run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .

 - name: Configuration of docker images, network and namespace for the kind cluster
 run: |
@@ -66,7 +69,7 @@ jobs:
 export AWS_SECRET_ACCESS_KEY=minioadmin
 sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
 helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"

 - name: curl test
 run: |
 kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
@@ -79,4 +82,4 @@ jobs:
 "max_tokens": 7,
 "temperature": 0
 }'):$CODE"
 echo "$CODE"
.github/workflows/pre-commit.yml (vendored, 3 changed lines)
@@ -5,6 +5,9 @@ on:
 push:
 branches: [main]

+permissions:
+contents: read
+
 jobs:
 pre-commit:
 runs-on: ubuntu-latest
.github/workflows/reminder_comment.yml (vendored, 2 changed lines)
@@ -1,4 +1,6 @@
 name: PR Reminder Comment Bot
+permissions:
+pull-requests: write
 on:
 pull_request_target:
 types: [opened]
.gitignore (vendored, 9 changed lines)
@@ -3,7 +3,6 @@

 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/*
-!vllm/vllm_flash_attn/fa_utils.py

 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -78,10 +77,6 @@ instance/
 # Scrapy stuff:
 .scrapy

-# Sphinx documentation
-docs/_build/
-docs/source/getting_started/examples/
-
 # PyBuilder
 .pybuilder/
 target/
@@ -151,6 +146,7 @@ venv.bak/

 # mkdocs documentation
 /site
+docs/examples

 # mypy
 .mypy_cache/
@@ -203,3 +199,6 @@ benchmarks/**/*.json
 # Linting
 actionlint
 shellcheck*/
+
+# Ingore moe/marlin_moe gen code
+csrc/moe/marlin_moe_wna16/kernel_*
.pre-commit-config.yaml
@@ -1,3 +1,6 @@
+default_install_hook_types:
+- pre-commit
+- commit-msg
 default_stages:
 - pre-commit # Run locally
 - manual # Run in CI
@@ -8,43 +11,45 @@ repos:
 hooks:
 - id: yapf
 args: [--in-place, --verbose]
-additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
 - repo: https://github.com/astral-sh/ruff-pre-commit
-rev: v0.9.3
+rev: v0.11.7
 hooks:
 - id: ruff
 args: [--output-format, github, --fix]
+- id: ruff-format
+files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/codespell-project/codespell
-rev: v2.4.0
+rev: v2.4.1
 hooks:
 - id: codespell
 additional_dependencies: ['tomli']
 args: ['--toml', 'pyproject.toml']
 - repo: https://github.com/PyCQA/isort
-rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
+rev: 6.0.1
 hooks:
 - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
-rev: v19.1.7
+rev: v20.1.3
 hooks:
 - id: clang-format
 exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
 types_or: [c++, cuda]
 args: [--style=file, --verbose]
 - repo: https://github.com/jackdewinter/pymarkdown
-rev: v0.9.27
+rev: v0.9.29
 hooks:
 - id: pymarkdown
+exclude: '.*\.inc\.md'
 args: [fix]
 - repo: https://github.com/rhysd/actionlint
 rev: v1.7.7
 hooks:
 - id: actionlint
 - repo: https://github.com/astral-sh/uv-pre-commit
-rev: 0.6.2
+rev: 0.6.17
 hooks:
 - id: pip-compile
-args: [requirements/test.in, -o, requirements/test.txt]
+args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
 files: ^requirements/test\.(in|txt)$
 - repo: local
 hooks:
@@ -53,7 +58,7 @@ repos:
 entry: tools/mypy.sh 0 "local"
 language: python
 types: [python]
-additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
+additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
 stages: [pre-commit] # Don't run in CI
 - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
 name: Run mypy for Python 3.9
@@ -99,8 +104,8 @@ repos:
 args:
 - -c
 - |
-if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
+if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
-printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
+printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
 fi
 language: system
 verbose: true
@@ -119,6 +124,25 @@ repos:
 language: system
 always_run: true
 pass_filenames: false
+- id: update-dockerfile-graph
+name: Update Dockerfile dependency graph
+entry: tools/update-dockerfile-graph.sh
+language: script
+- id: enforce-import-regex-instead-of-re
+name: Enforce import regex as re
+entry: python tools/enforce_regex_import.py
+language: python
+types: [python]
+pass_filenames: false
+additional_dependencies: [regex]
+# forbid directly import triton
+- id: forbid-direct-triton-import
+name: "Forbid direct 'import triton'"
+entry: python tools/check_triton_import.py
+language: python
+types: [python]
+pass_filenames: false
+additional_dependencies: [regex]
 # Keep `suggestion` last
 - id: suggestion
 name: Suggestion
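The two new local hooks (`enforce-import-regex-instead-of-re` and `forbid-direct-triton-import`) are plain Python scripts that scan the tree and fail the commit when they find a forbidden import. Their real implementations are not shown in this diff; a hypothetical checker with the same contract (print the offenders and exit non-zero) could look like this:

```python
# Hypothetical sketch -- the actual tools/enforce_regex_import.py may differ.
import pathlib
import sys

import regex as re  # the hook declares the third-party `regex` package as a dependency

# Flag bare `import re` or `from re import ...`, which the project wants
# replaced with `import regex as re`.
BANNED = re.compile(r"^\s*(?:import\s+re\s*$|from\s+re\s+import\s)", re.MULTILINE)


def main() -> int:
    offenders = []
    for path in pathlib.Path(".").rglob("*.py"):
        text = path.read_text(encoding="utf-8", errors="ignore")
        if BANNED.search(text):
            offenders.append(path)
    for path in offenders:
        print(f"{path}: use `import regex as re` instead of `import re`")
    return 1 if offenders else 0


if __name__ == "__main__":
    sys.exit(main())
```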
.readthedocs.yaml
@@ -8,12 +8,8 @@ build:
 tools:
 python: "3.12"

-sphinx:
-configuration: docs/source/conf.py
-fail_on_warning: true
-
-# If using Sphinx, optionally build your docs in additional formats such as PDF
-formats: []
+mkdocs:
+configuration: mkdocs.yaml

 # Optionally declare the Python requirements required to build your docs
 python:
CMakeLists.txt (196 changed lines)
@@ -15,7 +15,6 @@ project(vllm_extensions LANGUAGES CXX)

 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
 set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")

 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")

@@ -24,17 +23,17 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 # Suppress potential warnings about unused manually-specified variables
 set(ignoreMe "${VLLM_PYTHON_PATH}")

+# Prevent installation of dependencies (cutlass) by default.
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
+
 #
 # Supported python versions. These versions will be searched in order, the
 # first match will be selected. These should be kept in sync with setup.py.
 #
 set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")

-# Supported NVIDIA architectures.
-set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
-
 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")

 #
 # Supported/expected torch versions for CUDA/ROCm.
@@ -44,10 +43,10 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
 #
 # Note: the CUDA torch version is derived from pyproject.toml and various
 # requirements.txt files and should be kept consistent. The ROCm torch
-# versions are derived from Dockerfile.rocm
+# versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")

 #
 # Try to find python package with an executable that exactly matches
@@ -80,6 +79,15 @@ endif()
 #
 find_package(Torch REQUIRED)

+# Supported NVIDIA architectures.
+# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
+if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
+set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
+else()
+set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
+endif()
+
 #
 # Forward the non-CUDA device extensions to external CMake scripts.
 #
@@ -227,28 +235,34 @@ endif()
 #

 set(VLLM_EXT_SRC
+"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
+"csrc/mamba/causal_conv1d/causal_conv1d.cu"
 "csrc/cache_kernels.cu"
 "csrc/attention/paged_attention_v1.cu"
 "csrc/attention/paged_attention_v2.cu"
+"csrc/attention/merge_attn_states.cu"
+"csrc/attention/vertical_slash_index.cu"
 "csrc/pos_encoding_kernels.cu"
 "csrc/activation_kernels.cu"
 "csrc/layernorm_kernels.cu"
 "csrc/layernorm_quant_kernels.cu"
+"csrc/cuda_view.cu"
 "csrc/quantization/gptq/q_gemm.cu"
 "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
 "csrc/quantization/fp8/common.cu"
 "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
 "csrc/quantization/gguf/gguf_kernel.cu"
+"csrc/quantization/activation_kernels.cu"
 "csrc/cuda_utils_kernels.cu"
 "csrc/prepare_inputs/advance_step.cu"
+"csrc/custom_all_reduce.cu"
 "csrc/torch_bindings.cpp")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
 SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

-# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
-# Please keep this in sync with FetchContent_Declare line below.
-set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")
+# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
+set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")

 # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
 if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -266,7 +280,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 cutlass
 GIT_REPOSITORY https://github.com/nvidia/cutlass.git
 # Please keep this in sync with CUTLASS_REVISION line above.
-GIT_TAG v3.8.0
+GIT_TAG ${CUTLASS_REVISION}
 GIT_PROGRESS TRUE

 # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@@ -278,17 +292,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 FetchContent_MakeAvailable(cutlass)

 list(APPEND VLLM_EXT_SRC
-"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
-"csrc/mamba/causal_conv1d/causal_conv1d.cu"
 "csrc/quantization/aqlm/gemm_kernels.cu"
 "csrc/quantization/awq/gemm_kernels.cu"
-"csrc/custom_all_reduce.cu"
 "csrc/permute_cols.cu"
 "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
 "csrc/quantization/fp4/nvfp4_quant_entry.cu"
 "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
+"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
 "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
-"csrc/cutlass_extensions/common.cpp")
+"csrc/cutlass_extensions/common.cpp"
+"csrc/attention/mla/cutlass_mla_entry.cu")

 set_gencode_flags_for_srcs(
 SRCS "${VLLM_EXT_SRC}"
@@ -297,10 +310,55 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 # Only build Marlin kernels if we are building for at least some compatible archs.
 # Keep building Marlin for 9.0 as there are some group sizes and shapes that
 # are not supported by Machete yet.
-cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
+# 9.0 for latest bf16 atomicAdd PTX
+cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
 if (MARLIN_ARCHS)
+
+#
+# For the Marlin kernels we automatically generate sources for various
+# preselected input type pairs and schedules.
+# Generate sources:
+set(MARLIN_GEN_SCRIPT
+${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
+file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
+
+message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
+message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
+
+if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
+OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
+execute_process(
+COMMAND ${CMAKE_COMMAND} -E env
+PYTHONPATH=$PYTHONPATH
+${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
+RESULT_VARIABLE marlin_generation_result
+OUTPUT_VARIABLE marlin_generation_result
+OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
+ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
+)
+
+if (NOT marlin_generation_result EQUAL 0)
+message(FATAL_ERROR "Marlin generation failed."
+" Result: \"${marlin_generation_result}\""
+"\nCheck the log for details: "
+"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
+else()
+set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
+CACHE STRING "Last run Marlin generate script hash" FORCE)
+message(STATUS "Marlin generation completed successfully.")
+endif()
+else()
+message(STATUS "Marlin generation script has not changed, skipping generation.")
+endif()
+
+file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
+set_gencode_flags_for_srcs(
+SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
+CUDA_ARCHS "${MARLIN_ARCHS}")
+
+list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
+
 set(MARLIN_SRCS
-"csrc/quantization/fp8/fp8_marlin.cu"
 "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
 "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
 "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
@@ -372,6 +430,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 set(SRCS
 "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
 "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
+"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
 )
 set_gencode_flags_for_srcs(
 SRCS "${SRCS}"
@@ -396,8 +455,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 #
 # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
 # kernels for the remaining archs that are not already built for 3x.
+# (Build 8.9 for FP8)
 cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-"7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
+"7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
 # subtract out the archs that are already built for 3x
 list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
 if (SCALED_MM_2X_ARCHS)
@@ -448,7 +508,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
 set(SRCS
 "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
-"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu")
+"csrc/quantization/fp4/nvfp4_experts_quant.cu"
+"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
+"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
 set_gencode_flags_for_srcs(
 SRCS "${SRCS}"
 CUDA_ARCHS "${FP4_ARCHS}")
@@ -461,13 +523,32 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 set(FP4_ARCHS)
 endif()

-#
+# CUTLASS MLA Archs and flags
+cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
+set(SRCS
+"csrc/attention/mla/cutlass_mla_kernels.cu")
+set_gencode_flags_for_srcs(
+SRCS "${SRCS}"
+CUDA_ARCHS "${MLA_ARCHS}")
+list(APPEND VLLM_EXT_SRC "${SRCS}")
+list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
+# Add MLA-specific include directories only to MLA source files
+set_source_files_properties(${SRCS}
+PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
+message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
+else()
+message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
+# clear MLA_ARCHS
+set(MLA_ARCHS)
+endif()
+
 # CUTLASS MoE kernels

 # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
 # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
 # to compile MoE kernels that use its output.
-cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
+cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
 set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
 "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
@@ -605,23 +686,54 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 CUDA_ARCHS "${CUDA_ARCHS}")

 list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
-cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
+# 9.0 for latest bf16 atomicAdd PTX
+cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
 if (MARLIN_MOE_ARCHS)
-set(MARLIN_MOE_SRC
-"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
-"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
-"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
-"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
-"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
-"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
-"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
-"csrc/moe/marlin_moe_ops.cu")
+
+#
+# For the Marlin MOE kernels we automatically generate sources for various
+# preselected input type pairs and schedules.
+# Generate sources:
+set(MOE_MARLIN_GEN_SCRIPT
+${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
+file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
+
+message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
+message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
+
+if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
+OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
+execute_process(
+COMMAND ${CMAKE_COMMAND} -E env
+PYTHONPATH=$PYTHONPATH
+${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
+RESULT_VARIABLE moe_marlin_generation_result
+OUTPUT_VARIABLE moe_marlin_generation_output
+OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
+ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
+)
+
+if (NOT moe_marlin_generation_result EQUAL 0)
+message(FATAL_ERROR "Marlin MOE generation failed."
+" Result: \"${moe_marlin_generation_result}\""
+"\nCheck the log for details: "
+"${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
+else()
+set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
+CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
+message(STATUS "Marlin MOE generation completed successfully.")
+endif()
+else()
+message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
+endif()
+
+file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
 set_gencode_flags_for_srcs(
-SRCS "${MARLIN_MOE_SRC}"
+SRCS "${MOE_WNAA16_MARLIN_SRC}"
 CUDA_ARCHS "${MARLIN_MOE_ARCHS}")

-list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
+list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})

 message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
 else()
 message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
@@ -629,6 +741,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 endif()
 endif()

+if(VLLM_GPU_LANG STREQUAL "CUDA")
+set(MOE_PERMUTE_SRC
+"csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
+"csrc/moe/moe_permute_unpermute_op.cu")
+
+set_gencode_flags_for_srcs(
+SRCS "${MARLIN_PERMUTE_SRC}"
+CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
+
+list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
+endif()
 message(STATUS "Enabling moe extension.")
 define_gpu_extension_target(
 _moe_C
@@ -637,6 +760,8 @@ define_gpu_extension_target(
 SOURCES ${VLLM_MOE_EXT_SRC}
 COMPILE_FLAGS ${VLLM_GPU_FLAGS}
 ARCHITECTURES ${VLLM_GPU_ARCHES}
+INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
+INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
 USE_SABI 3
 WITH_SOABI)

@@ -646,6 +771,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
 #
 set(VLLM_ROCM_EXT_SRC
 "csrc/rocm/torch_bindings.cpp"
+"csrc/rocm/skinny_gemms.cu"
 "csrc/rocm/attention.cu")

 define_gpu_extension_target(
@@ -662,5 +788,7 @@ endif()
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
 include(cmake/external_projects/flashmla.cmake)
+
+# vllm-flash-attn should be last as it overwrites some CMake functions
 include(cmake/external_projects/vllm_flash_attn.cmake)
 endif ()
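Both Marlin blocks in the CMakeLists.txt change above gate kernel generation on an MD5 hash of the generator script: the hash is stored in the CMake cache, and `execute_process` re-runs `generate_kernels.py` only when the hash differs from the cached value. The same caching idea, expressed in a few lines of Python (a sketch under stated assumptions, not the project's code; the stamp-file path is a placeholder standing in for the CMake cache entry):

```python
# Sketch of hash-gated code generation (illustrative only).
import hashlib
import pathlib
import subprocess
import sys

GENERATOR = pathlib.Path("csrc/quantization/gptq_marlin/generate_kernels.py")
STAMP = pathlib.Path("build/marlin_gen_script.md5")  # placeholder for $CACHE{...}

current = hashlib.md5(GENERATOR.read_bytes()).hexdigest()
previous = STAMP.read_text() if STAMP.exists() else ""

if current != previous:
    # Regenerate the kernel_*.cu sources, then remember this script version.
    subprocess.run([sys.executable, str(GENERATOR)], check=True)
    STAMP.parent.mkdir(parents=True, exist_ok=True)
    STAMP.write_text(current)
else:
    print("Generator unchanged, skipping kernel generation.")
```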
CONTRIBUTING.md
@@ -1,3 +1,3 @@
 # Contributing to vLLM

-You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).
+You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing).

Dockerfile.cpu (deleted, 69 lines)
@@ -1,69 +0,0 @@
-# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
-
-FROM ubuntu:22.04 AS cpu-test-1
-
-ENV CCACHE_DIR=/root/.cache/ccache
-
-ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
-
-RUN --mount=type=cache,target=/var/cache/apt \
-apt-get update -y \
-&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
-&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
-&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
-
-# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
-# intel-openmp provides additional performance improvement vs. openmp
-# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
-RUN --mount=type=cache,target=/root/.cache/pip \
-pip install intel-openmp==2025.0.1
-
-ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
-
-RUN echo 'ulimit -c 0' >> ~/.bashrc
-
-RUN pip install intel_extension_for_pytorch==2.6.0
-
-WORKDIR /workspace
-
-ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
-ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
-RUN --mount=type=cache,target=/root/.cache/pip \
---mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
-pip install --upgrade pip && \
-pip install -r requirements/build.txt
-
-FROM cpu-test-1 AS build
-
-WORKDIR /workspace/vllm
-
-RUN --mount=type=cache,target=/root/.cache/pip \
---mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
---mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
-pip install -v -r requirements/cpu.txt
-
-COPY . .
-ARG GIT_REPO_CHECK=0
-RUN --mount=type=bind,source=.git,target=.git \
-if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
-
-# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
-ARG VLLM_CPU_DISABLE_AVX512
-ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
-
-RUN --mount=type=cache,target=/root/.cache/pip \
---mount=type=cache,target=/root/.cache/ccache \
---mount=type=bind,source=.git,target=.git \
-VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
-pip install dist/*.whl && \
-rm -rf dist
-
-WORKDIR /workspace/
-
-RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
-
-# install development dependencies (for testing)
-RUN --mount=type=cache,target=/root/.cache/pip \
-pip install -e tests/vllm_test_utils
-
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
36
README.md
36
README.md
@ -1,7 +1,7 @@
|
|||||||
<p align="center">
|
<p align="center">
|
||||||
<picture>
|
<picture>
|
||||||
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png">
|
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-dark.png">
|
||||||
<img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png" width=55%>
|
<img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-light.png" width=55%>
|
||||||
</picture>
|
</picture>
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
@ -10,29 +10,26 @@ Easy, fast, and cheap LLM serving for everyone
|
|||||||
</h3>
|
</h3>
|
||||||
|
|
||||||
<p align="center">
|
<p align="center">
|
||||||
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
|
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center!
|
|
||||||
|
|
||||||
[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
*Latest News* 🔥
|
*Latest News* 🔥
|
||||||
|
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
|
||||||
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
|
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
|
||||||
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
|
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
|
||||||
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
|
|
||||||
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
|
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
|
||||||
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
|
|
||||||
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
|
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary>Previous News</summary>
|
<summary>Previous News</summary>
|
||||||
|
|
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and from AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and from the Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
- [2024/12] vLLM joins the [PyTorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and from the Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
- [2024/10] We have just created a developer Slack ([slack.vllm.ai](https://slack.vllm.ai)) focused on coordinating contributions and discussing features. Please feel free to join us there!
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
@@ -61,7 +58,7 @@ vLLM is fast with:
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
- Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8.
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
- Speculative decoding
- Chunked prefill
@@ -77,7 +74,7 @@ vLLM is flexible and easy to use with:
- OpenAI-compatible API server
- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
- Prefix caching support
- Multi-LoRA support

vLLM seamlessly supports most popular open-source models on HuggingFace, including:

- Transformer-like LLMs (e.g., Llama)
@@ -103,14 +100,14 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
## Contributing

We welcome and value any contributions and collaborations.
Please check out [Contributing to vLLM](https://docs.vllm.ai/en/latest/contributing/index.html) for how to get involved.

## Sponsors

vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!

<!-- Note: Please sort them in alphabetical order. -->
<!-- Note: Please keep these consistent with docs/community/sponsors.md -->
Cash Donations:

- a16z
- Dropbox
@@ -126,6 +123,7 @@ Compute Resources:
- Databricks
- DeepInfra
- Google Cloud
- Intel
- Lambda Lab
- Nebius
- Novita AI
@@ -8,4 +8,6 @@ Please report security issues privately using [the vulnerability submission form
---

Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.

Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
@@ -41,29 +41,45 @@ become available.
<td><code>synthetic</code></td>
</tr>
<tr>
<td><strong>HuggingFace-VisionArena</strong></td>
<td style="text-align: center;">✅</td>
<td style="text-align: center;">✅</td>
<td><code>lmarena-ai/VisionArena-Chat</code></td>
</tr>
<tr>
<td><strong>HuggingFace-InstructCoder</strong></td>
<td style="text-align: center;">✅</td>
<td style="text-align: center;">✅</td>
<td><code>likaixin/InstructCoder</code></td>
</tr>
<tr>
<td><strong>HuggingFace-AIMO</strong></td>
<td style="text-align: center;">✅</td>
<td style="text-align: center;">✅</td>
<td><code>AI-MO/aimo-validation-aime</code>, <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
</tr>
<tr>
<td><strong>HuggingFace-Other</strong></td>
<td style="text-align: center;">✅</td>
<td style="text-align: center;">✅</td>
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
</tr>
<tr>
<td><strong>Custom</strong></td>
<td style="text-align: center;">✅</td>
<td style="text-align: center;">✅</td>
<td>Local file: <code>data.jsonl</code></td>
</tr>
</tbody>
</table>

✅: supported

🟡: Partial support

🚧: to be supported

**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
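For example, pointing the serving benchmark at a HuggingFace-hosted dataset only requires `--dataset-name hf` plus the repository id in `--dataset-path`. A minimal sketch mirroring the VisionArena example further below (adjust the model, endpoint, and prompt count to your setup):

```bash
python3 vllm/benchmarks/benchmark_serving.py \
    --backend openai-chat \
    --model Qwen/Qwen2-VL-7B-Instruct \
    --endpoint /v1/chat/completions \
    --dataset-name hf \
    --dataset-path lmarena-ai/VisionArena-Chat \
    --hf-split train \
    --num-prompts 10
```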
---
## Example - Online Benchmark
@@ -71,8 +87,7 @@ If you need support for other dataset formats, please consider contributing.
First start serving your model

```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
```

Then run the benchmarking script
@@ -80,12 +95,13 @@ Then run the benchmarking script
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 vllm/benchmarks/benchmark_serving.py \
    --backend vllm \
    --model NousResearch/Hermes-3-Llama-3.1-8B \
    --endpoint /v1/completions \
    --dataset-name sharegpt \
    --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
    --num-prompts 10
```

If successful, you will see the following output
@@ -114,6 +130,38 @@ P99 ITL (ms): 8.39
==================================================
```

### Custom Dataset

If the dataset you want to benchmark is not yet supported in vLLM, you can still benchmark it using `CustomDataset`. Your data needs to be in `.jsonl` format, with a "prompt" field per entry, e.g. `data.jsonl`:

```
{"prompt": "What is the capital of India?"}
{"prompt": "What is the capital of Iran?"}
{"prompt": "What is the capital of China?"}
```

```bash
# start server
VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
```

```bash
# run benchmarking script
python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
    --backend vllm \
    --model meta-llama/Llama-3.1-8B-Instruct \
    --endpoint /v1/completions \
    --dataset-name custom \
    --dataset-path <path-to-your-data-jsonl> \
    --custom-skip-chat-template \
    --num-prompts 80 \
    --max-concurrency 1 \
    --temperature=0.3 \
    --top-p=0.75 \
    --result-dir "./log/"
```

You can skip applying the chat template if your data already has it by using `--custom-skip-chat-template`.
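If your prompts already exist as a plain-text file with one prompt per line, converting it into this layout is a one-liner. A minimal sketch (it assumes `jq` is installed; `prompts.txt` is a placeholder name, not something vLLM provides):

```bash
# Wrap every non-empty line of prompts.txt into {"prompt": "..."} and write data.jsonl
jq -Rc 'select(length > 0) | {prompt: .}' prompts.txt > data.jsonl
```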

### VisionArena Benchmark for Vision Language Models

```bash
@@ -122,88 +170,114 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
```

```bash
python3 vllm/benchmarks/benchmark_serving.py \
    --backend openai-chat \
    --model Qwen/Qwen2-VL-7B-Instruct \
    --endpoint /v1/chat/completions \
    --dataset-name hf \
    --dataset-path lmarena-ai/VisionArena-Chat \
    --hf-split train \
    --num-prompts 1000
```

### InstructCoder Benchmark with Speculative Decoding

```bash
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
    --speculative-config $'{"method": "ngram",
    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
    "prompt_lookup_min": 2}'
```

```bash
python3 benchmarks/benchmark_serving.py \
    --model meta-llama/Meta-Llama-3-8B-Instruct \
    --dataset-name hf \
    --dataset-path likaixin/InstructCoder \
    --num-prompts 2048
```

### Other HuggingFaceDataset Examples

```bash
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
```

**`lmms-lab/LLaVA-OneVision-Data`**

```bash
python3 vllm/benchmarks/benchmark_serving.py \
    --backend openai-chat \
    --model Qwen/Qwen2-VL-7B-Instruct \
    --endpoint /v1/chat/completions \
    --dataset-name hf \
    --dataset-path lmms-lab/LLaVA-OneVision-Data \
    --hf-split train \
    --hf-subset "chart2text(cauldron)" \
    --num-prompts 10
```

**`Aeala/ShareGPT_Vicuna_unfiltered`**

```bash
python3 vllm/benchmarks/benchmark_serving.py \
    --backend openai-chat \
    --model Qwen/Qwen2-VL-7B-Instruct \
    --endpoint /v1/chat/completions \
    --dataset-name hf \
    --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
    --hf-split train \
    --num-prompts 10
```

**`AI-MO/aimo-validation-aime`**

```bash
python3 vllm/benchmarks/benchmark_serving.py \
    --model Qwen/QwQ-32B \
    --dataset-name hf \
    --dataset-path AI-MO/aimo-validation-aime \
    --num-prompts 10 \
    --seed 42
```

**`philschmid/mt-bench`**

```bash
python3 vllm/benchmarks/benchmark_serving.py \
    --model Qwen/QwQ-32B \
    --dataset-name hf \
    --dataset-path philschmid/mt-bench \
    --num-prompts 80
```

### Running With Sampling Parameters

When using OpenAI-compatible backends such as `vllm`, optional sampling parameters can be specified. Example client command:

```bash
python3 vllm/benchmarks/benchmark_serving.py \
    --backend vllm \
    --model NousResearch/Hermes-3-Llama-3.1-8B \
    --endpoint /v1/completions \
    --dataset-name sharegpt \
    --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
    --top-k 10 \
    --top-p 0.9 \
    --temperature 0.5 \
    --num-prompts 10
```

---
## Example - Offline Throughput Benchmark

```bash
python3 vllm/benchmarks/benchmark_throughput.py \
    --model NousResearch/Hermes-3-Llama-3.1-8B \
    --dataset-name sonnet \
    --dataset-path vllm/benchmarks/sonnet.txt \
    --num-prompts 10
```

If successful, you will see the following output
@@ -217,19 +291,13 @@ Total num output tokens: 1500
### VisionArena Benchmark for Vision Language Models

```bash
python3 vllm/benchmarks/benchmark_throughput.py \
    --model Qwen/Qwen2-VL-7B-Instruct \
    --backend vllm-chat \
    --dataset-name hf \
    --dataset-path lmarena-ai/VisionArena-Chat \
    --num-prompts 1000 \
    --hf-split train
```

The `num prompt tokens` now includes image token counts
@@ -240,29 +308,82 @@ Total num prompt tokens: 14527
Total num output tokens: 1280
```

### InstructCoder Benchmark with Speculative Decoding

```bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_USE_V1=1 \
python3 vllm/benchmarks/benchmark_throughput.py \
    --dataset-name=hf \
    --dataset-path=likaixin/InstructCoder \
    --model=meta-llama/Meta-Llama-3-8B-Instruct \
    --input-len=1000 \
    --output-len=100 \
    --num-prompts=2048 \
    --async-engine \
    --speculative-config $'{"method": "ngram",
    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
    "prompt_lookup_min": 2}'
```

```
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
Total num prompt tokens: 261136
Total num output tokens: 204800
```

### Other HuggingFaceDataset Examples

**`lmms-lab/LLaVA-OneVision-Data`**

```bash
python3 vllm/benchmarks/benchmark_throughput.py \
    --model Qwen/Qwen2-VL-7B-Instruct \
    --backend vllm-chat \
    --dataset-name hf \
    --dataset-path lmms-lab/LLaVA-OneVision-Data \
    --hf-split train \
    --hf-subset "chart2text(cauldron)" \
    --num-prompts 10
```

**`Aeala/ShareGPT_Vicuna_unfiltered`**

```bash
python3 vllm/benchmarks/benchmark_throughput.py \
    --model Qwen/Qwen2-VL-7B-Instruct \
    --backend vllm-chat \
    --dataset-name hf \
    --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
    --hf-split train \
    --num-prompts 10
```

**`AI-MO/aimo-validation-aime`**

```bash
python3 benchmarks/benchmark_throughput.py \
    --model Qwen/QwQ-32B \
    --backend vllm \
    --dataset-name hf \
    --dataset-path AI-MO/aimo-validation-aime \
    --hf-split train \
    --num-prompts 10
```

### Benchmark with LoRA Adapters

```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 vllm/benchmarks/benchmark_throughput.py \
    --model meta-llama/Llama-2-7b-hf \
    --backend vllm \
    --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
    --dataset_name sharegpt \
    --num-prompts 10 \
    --max-loras 2 \
    --max-lora-rank 8 \
    --enable-lora \
    --lora-path yard1/llama-2-7b-sql-lora-test
```
|
|||||||
212
benchmarks/auto_tune.sh
Normal file
@@ -0,0 +1,212 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
|
||||||
|
# The current server parameter combination is max_num_seqs and max_num_batched_tokens
|
||||||
|
# It also supports additional requirement: e2e latency and prefix cache.
|
||||||
|
|
||||||
|
# Pre-requisite:
|
||||||
|
# 1. Check out your branch and install/update the correct running environment. For TPU, activate the conda env and install the corresponding torch and XLA versions.
|
||||||
|
# 2. If the model is customized, replace the MODEL's config with the customized config.
|
||||||
|
# 3. Set variables (ALL REQUIRED)
|
||||||
|
# BASE: your directory for vllm repo
|
||||||
|
# MODEL: the model served by vllm
|
||||||
|
# DOWNLOAD_DIR: directory to download and load model weights.
|
||||||
|
# INPUT_LEN: request input len
|
||||||
|
# OUTPUT_LEN: request output len
|
||||||
|
# MIN_CACHE_HIT_PCT: prefix cache rate
|
||||||
|
# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
|
||||||
|
# 4. Run the script. It might take a long time; you can use tmux so the run keeps going if your connection drops.
|
||||||
|
# 5. The final result will be saved in RESULT file.
|
||||||
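# A successful parameter combination is appended to RESULT in the form written by run_benchmark below
# (the numeric values here are purely illustrative, not measured):
# max_num_seqs: 128, max_num_batched_tokens: 2048, request_rate: 11, e2el: 423.5, through put: 10.6, goodput: 10.1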
|
|
||||||
|
|
||||||
|
# Example use cases
|
||||||
|
# 1. Given input_len=1800, output_len=20, what's the best max_num_seqs and max_num_batched_tokens to get highest throughput?
|
||||||
|
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000
|
||||||
|
# 2. If we have latency requirement to be lower than 500ms, what's the best server parameter?
|
||||||
|
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500
|
||||||
|
# 3. If we want to reach 60% prefix cache, what's the best server parameter?
|
||||||
|
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500
|
||||||
|
|
||||||
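# Example configuration (hypothetical values; adjust them to your own setup):
# BASE="$HOME/workspace"                      # parent directory that contains your vllm checkout
# MODEL="meta-llama/Llama-3.1-8B-Instruct"
# DOWNLOAD_DIR="$HOME/models"                 # where model weights are downloaded / loaded from
# INPUT_LEN=1800
# OUTPUT_LEN=20
# MIN_CACHE_HIT_PCT=0
# MAX_LATENCY_ALLOWED_MS=500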
|
TAG=$(date +"%Y_%m_%d_%H_%M")
|
||||||
|
BASE=""
|
||||||
|
MODEL="meta-llama/Llama-3.1-8B-Instruct"
|
||||||
|
DOWNLOAD_DIR=""
|
||||||
|
INPUT_LEN=4000
|
||||||
|
OUTPUT_LEN=16
|
||||||
|
MIN_CACHE_HIT_PCT=0
|
||||||
|
MAX_LATENCY_ALLOWED_MS=100000000000
|
||||||
|
|
||||||
|
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
|
||||||
|
RESULT="$LOG_FOLDER/result.txt"
|
||||||
|
|
||||||
|
echo "result file: $RESULT"
|
||||||
|
echo "model: $MODEL"
|
||||||
|
echo
|
||||||
|
|
||||||
|
rm -rf $LOG_FOLDER
|
||||||
|
mkdir -p $LOG_FOLDER
|
||||||
|
|
||||||
|
cd "$BASE/vllm"
|
||||||
|
# create sonnet-4x.txt so that we can sample 2048 tokens for input
|
||||||
|
echo "" > benchmarks/sonnet_4x.txt
|
||||||
|
for _ in {1..4}
|
||||||
|
do
|
||||||
|
cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
|
||||||
|
done
|
||||||
|
|
||||||
|
pip install datasets
|
||||||
|
|
||||||
|
current_hash=$(git rev-parse HEAD)
|
||||||
|
echo "hash:$current_hash" >> "$RESULT"
|
||||||
|
echo "current_hash: $current_hash"
|
||||||
|
|
||||||
|
best_throughput=0
|
||||||
|
best_max_num_seqs=0
|
||||||
|
best_num_batched_tokens=0
|
||||||
|
best_goodput=0
|
||||||
|
run_benchmark() {
|
||||||
|
local max_num_seqs=$1
|
||||||
|
local max_num_batched_tokens=$2
|
||||||
|
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
|
||||||
|
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
|
||||||
|
echo "vllm_log: $vllm_log"
|
||||||
|
echo
|
||||||
|
rm -f $vllm_log
|
||||||
|
|
||||||
|
# start the server
|
||||||
|
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
|
||||||
|
--disable-log-requests \
|
||||||
|
--port 8004 \
|
||||||
|
--gpu-memory-utilization 0.98 \
|
||||||
|
--max-num-seqs $max_num_seqs \
|
||||||
|
--max-num-batched-tokens $max_num_batched_tokens \
|
||||||
|
--tensor-parallel-size 1 \
|
||||||
|
--enable-prefix-caching \
|
||||||
|
--load-format dummy \
|
||||||
|
--download-dir $DOWNLOAD_DIR \
|
||||||
|
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
|
||||||
|
echo "wait for 10 minutes.."
|
||||||
|
echo
|
||||||
|
# wait for 10 minutes...
|
||||||
|
server_started=0
|
||||||
|
for i in {1..60}; do
|
||||||
|
if grep -Fq "Application startup complete" "$vllm_log"; then
|
||||||
|
echo "Application started"
|
||||||
|
server_started=1
|
||||||
|
break
|
||||||
|
else
|
||||||
|
# echo "wait for 10 seconds..."
|
||||||
|
sleep 10
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if (( ! server_started )); then
|
||||||
|
echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
|
||||||
|
echo "pkill -f vllm"
|
||||||
|
echo
|
||||||
|
pkill vllm
|
||||||
|
sleep 10
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "run benchmark test..."
|
||||||
|
echo
|
||||||
|
meet_latency_requirement=0
|
||||||
|
# get a basic qps by using request-rate inf
|
||||||
|
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
|
||||||
|
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
|
||||||
|
python benchmarks/benchmark_serving.py \
|
||||||
|
--backend vllm \
|
||||||
|
--model $MODEL \
|
||||||
|
--dataset-name sonnet \
|
||||||
|
--dataset-path benchmarks/sonnet_4x.txt \
|
||||||
|
--sonnet-input-len $INPUT_LEN \
|
||||||
|
--sonnet-output-len $OUTPUT_LEN \
|
||||||
|
--ignore-eos \
|
||||||
|
--disable-tqdm \
|
||||||
|
--request-rate inf \
|
||||||
|
--percentile-metrics ttft,tpot,itl,e2el \
|
||||||
|
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||||
|
--num-prompts 100 \
|
||||||
|
--sonnet-prefix-len $prefix_len \
|
||||||
|
--port 8004 > "$bm_log"
|
||||||
|
through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||||
|
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
|
||||||
|
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||||
|
|
||||||
|
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
|
||||||
|
meet_latency_requirement=1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if (( ! meet_latency_requirement )); then
|
||||||
|
# start from request-rate as int(through_put) + 1
|
||||||
|
request_rate=$((${through_put%.*} + 1))
|
||||||
|
while ((request_rate > 0)); do
|
||||||
|
# clear prefix cache
|
||||||
|
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
|
||||||
|
sleep 5
|
||||||
|
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
|
||||||
|
python benchmarks/benchmark_serving.py \
|
||||||
|
--backend vllm \
|
||||||
|
--model $MODEL \
|
||||||
|
--dataset-name sonnet \
|
||||||
|
--dataset-path benchmarks/sonnet_4x.txt \
|
||||||
|
--sonnet-input-len $INPUT_LEN \
|
||||||
|
--sonnet-output-len $OUTPUT_LEN \
|
||||||
|
--ignore-eos \
|
||||||
|
--disable-tqdm \
|
||||||
|
--request-rate $request_rate \
|
||||||
|
--percentile-metrics ttft,tpot,itl,e2el \
|
||||||
|
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||||
|
--num-prompts 100 \
|
||||||
|
--sonnet-prefix-len $prefix_len \
|
||||||
|
--port 8004 > "$bm_log"
|
||||||
|
through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||||
|
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
|
||||||
|
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||||
|
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
|
||||||
|
meet_latency_requirement=1
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
request_rate=$((request_rate-1))
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
# write the results and update the best result.
|
||||||
|
if ((meet_latency_requirement)); then
|
||||||
|
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput"
|
||||||
|
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT"
|
||||||
|
if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
|
||||||
|
best_throughput=$through_put
|
||||||
|
best_max_num_seqs=$max_num_seqs
|
||||||
|
best_num_batched_tokens=$max_num_batched_tokens
|
||||||
|
best_goodput=$goodput
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
|
||||||
|
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
|
||||||
|
|
||||||
|
echo "pkill -f vllm"
|
||||||
|
echo
|
||||||
|
pkill vllm
|
||||||
|
sleep 10
|
||||||
|
rm -f $vllm_log
|
||||||
|
printf '=%.0s' $(seq 1 20)
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
num_seqs_list="128 256"
|
||||||
|
num_batched_tokens_list="512 1024 2048 4096"
|
||||||
|
for num_seqs in $num_seqs_list; do
|
||||||
|
for num_batched_tokens in $num_batched_tokens_list; do
|
||||||
|
run_benchmark $num_seqs $num_batched_tokens
|
||||||
|
|
||||||
|
done
|
||||||
|
done
|
||||||
|
echo "finish permutations"
|
||||||
|
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
|
||||||
|
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"
|
||||||
|
|
||||||
@ -1,5 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
import io
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
@ -11,8 +12,7 @@ from typing import Optional, Union
|
|||||||
import aiohttp
|
import aiohttp
|
||||||
import huggingface_hub.constants
|
import huggingface_hub.constants
|
||||||
from tqdm.asyncio import tqdm
|
from tqdm.asyncio import tqdm
|
||||||
from transformers import (AutoTokenizer, PreTrainedTokenizer,
|
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
|
||||||
PreTrainedTokenizerFast)
|
|
||||||
|
|
||||||
# NOTE(simon): do not import vLLM here so the benchmark script
|
# NOTE(simon): do not import vLLM here so the benchmark script
|
||||||
# can run without vLLM installed.
|
# can run without vLLM installed.
|
||||||
@ -32,6 +32,7 @@ class RequestFuncInput:
|
|||||||
extra_body: Optional[dict] = None
|
extra_body: Optional[dict] = None
|
||||||
multi_modal_content: Optional[dict] = None
|
multi_modal_content: Optional[dict] = None
|
||||||
ignore_eos: bool = False
|
ignore_eos: bool = False
|
||||||
|
language: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -41,8 +42,7 @@ class RequestFuncOutput:
|
|||||||
latency: float = 0.0
|
latency: float = 0.0
|
||||||
output_tokens: int = 0
|
output_tokens: int = 0
|
||||||
ttft: float = 0.0 # Time to first token
|
ttft: float = 0.0 # Time to first token
|
||||||
itl: list[float] = field(
|
itl: list[float] = field(default_factory=list) # list of inter-token latencies
|
||||||
default_factory=list) # list of inter-token latencies
|
|
||||||
tpot: float = 0.0 # avg next-token latencies
|
tpot: float = 0.0 # avg next-token latencies
|
||||||
prompt_len: int = 0
|
prompt_len: int = 0
|
||||||
error: str = ""
|
error: str = ""
|
||||||
@ -55,8 +55,9 @@ async def async_request_tgi(
|
|||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith("generate_stream")
|
assert api_url.endswith("generate_stream")
|
||||||
|
|
||||||
async with aiohttp.ClientSession(trust_env=True,
|
async with aiohttp.ClientSession(
|
||||||
timeout=AIOHTTP_TIMEOUT) as session:
|
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
||||||
|
) as session:
|
||||||
params = {
|
params = {
|
||||||
"max_new_tokens": request_func_input.output_len,
|
"max_new_tokens": request_func_input.output_len,
|
||||||
"do_sample": True,
|
"do_sample": True,
|
||||||
@ -103,8 +104,7 @@ async def async_request_tgi(
|
|||||||
|
|
||||||
# Decoding phase
|
# Decoding phase
|
||||||
else:
|
else:
|
||||||
output.itl.append(timestamp -
|
output.itl.append(timestamp - most_recent_timestamp)
|
||||||
most_recent_timestamp)
|
|
||||||
|
|
||||||
most_recent_timestamp = timestamp
|
most_recent_timestamp = timestamp
|
||||||
|
|
||||||
@ -131,8 +131,9 @@ async def async_request_trt_llm(
|
|||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith("generate_stream")
|
assert api_url.endswith("generate_stream")
|
||||||
|
|
||||||
async with aiohttp.ClientSession(trust_env=True,
|
async with aiohttp.ClientSession(
|
||||||
timeout=AIOHTTP_TIMEOUT) as session:
|
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
||||||
|
) as session:
|
||||||
payload = {
|
payload = {
|
||||||
"accumulate_tokens": True,
|
"accumulate_tokens": True,
|
||||||
"text_input": request_func_input.prompt,
|
"text_input": request_func_input.prompt,
|
||||||
@ -157,8 +158,7 @@ async def async_request_trt_llm(
|
|||||||
if not chunk_bytes:
|
if not chunk_bytes:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
chunk = chunk_bytes.decode("utf-8").removeprefix("data:")
|
||||||
"data:")
|
|
||||||
|
|
||||||
data = json.loads(chunk)
|
data = json.loads(chunk)
|
||||||
output.generated_text += data["text_output"]
|
output.generated_text += data["text_output"]
|
||||||
@ -170,8 +170,7 @@ async def async_request_trt_llm(
|
|||||||
|
|
||||||
# Decoding phase
|
# Decoding phase
|
||||||
else:
|
else:
|
||||||
output.itl.append(timestamp -
|
output.itl.append(timestamp - most_recent_timestamp)
|
||||||
most_recent_timestamp)
|
|
||||||
|
|
||||||
most_recent_timestamp = timestamp
|
most_recent_timestamp = timestamp
|
||||||
|
|
||||||
@ -195,15 +194,23 @@ async def async_request_deepspeed_mii(
|
|||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: Optional[tqdm] = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
async with aiohttp.ClientSession(trust_env=True,
|
api_url = request_func_input.api_url
|
||||||
timeout=AIOHTTP_TIMEOUT) as session:
|
assert api_url.endswith(("completions", "profile")), (
|
||||||
|
"OpenAI Completions API URL must end with 'completions' or 'profile'."
|
||||||
|
)
|
||||||
|
|
||||||
|
async with aiohttp.ClientSession(
|
||||||
|
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
||||||
|
) as session:
|
||||||
payload = {
|
payload = {
|
||||||
|
"model": request_func_input.model,
|
||||||
"prompt": request_func_input.prompt,
|
"prompt": request_func_input.prompt,
|
||||||
"max_tokens": request_func_input.output_len,
|
"max_tokens": request_func_input.output_len,
|
||||||
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
|
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
|
||||||
"top_p": 1.0,
|
"top_p": 1.0,
|
||||||
}
|
}
|
||||||
|
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
||||||
|
|
||||||
output = RequestFuncOutput()
|
output = RequestFuncOutput()
|
||||||
output.prompt_len = request_func_input.prompt_len
|
output.prompt_len = request_func_input.prompt_len
|
||||||
|
|
||||||
@ -214,12 +221,22 @@ async def async_request_deepspeed_mii(
|
|||||||
|
|
||||||
st = time.perf_counter()
|
st = time.perf_counter()
|
||||||
try:
|
try:
|
||||||
async with session.post(url=request_func_input.api_url,
|
async with session.post(
|
||||||
json=payload) as response:
|
url=api_url, json=payload, headers=headers
|
||||||
|
) as response:
|
||||||
if response.status == 200:
|
if response.status == 200:
|
||||||
parsed_resp = await response.json()
|
parsed_resp = await response.json()
|
||||||
output.latency = time.perf_counter() - st
|
output.latency = time.perf_counter() - st
|
||||||
output.generated_text = parsed_resp["text"][0]
|
if "choices" in parsed_resp:
|
||||||
|
output.generated_text = parsed_resp["choices"][0]["text"]
|
||||||
|
elif "text" in parsed_resp:
|
||||||
|
output.generated_text = parsed_resp["text"][0]
|
||||||
|
else:
|
||||||
|
output.error = (
|
||||||
|
"Unexpected response format: "
|
||||||
|
"neither 'choices' nor 'text' found"
|
||||||
|
)
|
||||||
|
output.success = False
|
||||||
output.success = True
|
output.success = True
|
||||||
else:
|
else:
|
||||||
output.error = response.reason or ""
|
output.error = response.reason or ""
|
||||||
@ -239,17 +256,20 @@ async def async_request_openai_completions(
|
|||||||
pbar: Optional[tqdm] = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith(
|
assert api_url.endswith(("completions", "profile")), (
|
||||||
("completions", "profile")
|
"OpenAI Completions API URL must end with 'completions' or 'profile'."
|
||||||
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
|
)
|
||||||
|
|
||||||
async with aiohttp.ClientSession(trust_env=True,
|
async with aiohttp.ClientSession(
|
||||||
timeout=AIOHTTP_TIMEOUT) as session:
|
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
||||||
|
) as session:
|
||||||
payload = {
|
payload = {
|
||||||
"model": request_func_input.model_name \
|
"model": request_func_input.model_name
|
||||||
if request_func_input.model_name else request_func_input.model,
|
if request_func_input.model_name
|
||||||
|
else request_func_input.model,
|
||||||
"prompt": request_func_input.prompt,
|
"prompt": request_func_input.prompt,
|
||||||
"temperature": 0.0,
|
"temperature": 0.0,
|
||||||
|
"repetition_penalty": 1.0,
|
||||||
"max_tokens": request_func_input.output_len,
|
"max_tokens": request_func_input.output_len,
|
||||||
"logprobs": request_func_input.logprobs,
|
"logprobs": request_func_input.logprobs,
|
||||||
"stream": True,
|
"stream": True,
|
||||||
@ -261,9 +281,7 @@ async def async_request_openai_completions(
|
|||||||
payload["ignore_eos"] = request_func_input.ignore_eos
|
payload["ignore_eos"] = request_func_input.ignore_eos
|
||||||
if request_func_input.extra_body:
|
if request_func_input.extra_body:
|
||||||
payload.update(request_func_input.extra_body)
|
payload.update(request_func_input.extra_body)
|
||||||
headers = {
|
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
||||||
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
|
|
||||||
}
|
|
||||||
|
|
||||||
output = RequestFuncOutput()
|
output = RequestFuncOutput()
|
||||||
output.prompt_len = request_func_input.prompt_len
|
output.prompt_len = request_func_input.prompt_len
|
||||||
@ -272,8 +290,9 @@ async def async_request_openai_completions(
|
|||||||
st = time.perf_counter()
|
st = time.perf_counter()
|
||||||
most_recent_timestamp = st
|
most_recent_timestamp = st
|
||||||
try:
|
try:
|
||||||
async with session.post(url=api_url, json=payload,
|
async with session.post(
|
||||||
headers=headers) as response:
|
url=api_url, json=payload, headers=headers
|
||||||
|
) as response:
|
||||||
if response.status == 200:
|
if response.status == 200:
|
||||||
first_chunk_received = False
|
first_chunk_received = False
|
||||||
async for chunk_bytes in response.content:
|
async for chunk_bytes in response.content:
|
||||||
@ -281,8 +300,7 @@ async def async_request_openai_completions(
|
|||||||
if not chunk_bytes:
|
if not chunk_bytes:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
|
||||||
"data: ")
|
|
||||||
if chunk != "[DONE]":
|
if chunk != "[DONE]":
|
||||||
data = json.loads(chunk)
|
data = json.loads(chunk)
|
||||||
|
|
||||||
@ -302,21 +320,20 @@ async def async_request_openai_completions(
|
|||||||
|
|
||||||
# Decoding phase
|
# Decoding phase
|
||||||
else:
|
else:
|
||||||
output.itl.append(timestamp -
|
output.itl.append(timestamp - most_recent_timestamp)
|
||||||
most_recent_timestamp)
|
|
||||||
|
|
||||||
most_recent_timestamp = timestamp
|
most_recent_timestamp = timestamp
|
||||||
generated_text += text or ""
|
generated_text += text or ""
|
||||||
elif usage := data.get("usage"):
|
if usage := data.get("usage"):
|
||||||
output.output_tokens = usage.get(
|
output.output_tokens = usage.get("completion_tokens")
|
||||||
"completion_tokens")
|
|
||||||
if first_chunk_received:
|
if first_chunk_received:
|
||||||
output.success = True
|
output.success = True
|
||||||
else:
|
else:
|
||||||
output.success = False
|
output.success = False
|
||||||
output.error = (
|
output.error = (
|
||||||
"Never received a valid chunk to calculate TTFT."
|
"Never received a valid chunk to calculate TTFT."
|
||||||
"This response will be marked as failed!")
|
"This response will be marked as failed!"
|
||||||
|
)
|
||||||
output.generated_text = generated_text
|
output.generated_text = generated_text
|
||||||
output.latency = most_recent_timestamp - st
|
output.latency = most_recent_timestamp - st
|
||||||
else:
|
else:
|
||||||
@ -337,23 +354,22 @@ async def async_request_openai_chat_completions(
|
|||||||
pbar: Optional[tqdm] = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith(
|
assert api_url.endswith(("chat/completions", "profile")), (
|
||||||
("chat/completions", "profile")
|
"OpenAI Chat Completions API URL must end with 'chat/completions'."
|
||||||
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
|
)
|
||||||
|
|
||||||
async with aiohttp.ClientSession(trust_env=True,
|
async with aiohttp.ClientSession(
|
||||||
timeout=AIOHTTP_TIMEOUT) as session:
|
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
||||||
|
) as session:
|
||||||
content = [{"type": "text", "text": request_func_input.prompt}]
|
content = [{"type": "text", "text": request_func_input.prompt}]
|
||||||
if request_func_input.multi_modal_content:
|
if request_func_input.multi_modal_content:
|
||||||
content.append(request_func_input.multi_modal_content)
|
content.append(request_func_input.multi_modal_content)
|
||||||
payload = {
|
payload = {
|
||||||
"model": request_func_input.model_name \
|
"model": request_func_input.model_name
|
||||||
if request_func_input.model_name else request_func_input.model,
|
if request_func_input.model_name
|
||||||
|
else request_func_input.model,
|
||||||
"messages": [
|
"messages": [
|
||||||
{
|
{"role": "user", "content": content},
|
||||||
"role": "user",
|
|
||||||
"content": content
|
|
||||||
},
|
|
||||||
],
|
],
|
||||||
"temperature": 0.0,
|
"temperature": 0.0,
|
||||||
"max_completion_tokens": request_func_input.output_len,
|
"max_completion_tokens": request_func_input.output_len,
|
||||||
@ -379,16 +395,16 @@ async def async_request_openai_chat_completions(
|
|||||||
st = time.perf_counter()
|
st = time.perf_counter()
|
||||||
most_recent_timestamp = st
|
most_recent_timestamp = st
|
||||||
try:
|
try:
|
||||||
async with session.post(url=api_url, json=payload,
|
async with session.post(
|
||||||
headers=headers) as response:
|
url=api_url, json=payload, headers=headers
|
||||||
|
) as response:
|
||||||
if response.status == 200:
|
if response.status == 200:
|
||||||
async for chunk_bytes in response.content:
|
async for chunk_bytes in response.content:
|
||||||
chunk_bytes = chunk_bytes.strip()
|
chunk_bytes = chunk_bytes.strip()
|
||||||
if not chunk_bytes:
|
if not chunk_bytes:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
|
||||||
"data: ")
|
|
||||||
if chunk != "[DONE]":
|
if chunk != "[DONE]":
|
||||||
timestamp = time.perf_counter()
|
timestamp = time.perf_counter()
|
||||||
data = json.loads(chunk)
|
data = json.loads(chunk)
|
||||||
@ -402,13 +418,11 @@ async def async_request_openai_chat_completions(
|
|||||||
|
|
||||||
# Decoding phase
|
# Decoding phase
|
||||||
else:
|
else:
|
||||||
output.itl.append(timestamp -
|
output.itl.append(timestamp - most_recent_timestamp)
|
||||||
most_recent_timestamp)
|
|
||||||
|
|
||||||
generated_text += content or ""
|
generated_text += content or ""
|
||||||
elif usage := data.get("usage"):
|
elif usage := data.get("usage"):
|
||||||
output.output_tokens = usage.get(
|
output.output_tokens = usage.get("completion_tokens")
|
||||||
"completion_tokens")
|
|
||||||
|
|
||||||
most_recent_timestamp = timestamp
|
most_recent_timestamp = timestamp
|
||||||
|
|
||||||
@ -428,8 +442,115 @@ async def async_request_openai_chat_completions(
|
|||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
async def async_request_openai_audio(
|
||||||
|
request_func_input: RequestFuncInput,
|
||||||
|
pbar: Optional[tqdm] = None,
|
||||||
|
) -> RequestFuncOutput:
|
||||||
|
# Lazy import without PlaceholderModule to avoid vllm dep.
|
||||||
|
import soundfile
|
||||||
|
|
||||||
|
api_url = request_func_input.api_url
|
||||||
|
    assert api_url.endswith(("transcriptions", "translations")), (
        "OpenAI Transcriptions/Translations API URL must end with 'transcriptions' "
        "or 'translations'."
    )
|
||||||
|
|
||||||
|
async with aiohttp.ClientSession(
|
||||||
|
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
||||||
|
) as session:
|
||||||
|
content = [{"type": "text", "text": request_func_input.prompt}]
|
||||||
|
payload = {
|
||||||
|
"model": request_func_input.model_name
|
||||||
|
if request_func_input.model_name
|
||||||
|
else request_func_input.model,
|
||||||
|
"temperature": 0.0,
|
||||||
|
"max_completion_tokens": request_func_input.output_len,
|
||||||
|
"stream": True,
|
||||||
|
"language": "en",
|
||||||
|
# Flattened due to multipart/form-data
|
||||||
|
"stream_include_usage": True,
|
||||||
|
"stream_continuous_usage_stats": True,
|
||||||
|
}
|
||||||
|
if request_func_input.extra_body:
|
||||||
|
payload.update(request_func_input.extra_body)
|
||||||
|
headers = {
|
||||||
|
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Send audio file
|
||||||
|
def to_bytes(y, sr):
|
||||||
|
buffer = io.BytesIO()
|
||||||
|
soundfile.write(buffer, y, sr, format="WAV")
|
||||||
|
buffer.seek(0)
|
||||||
|
return buffer
|
||||||
|
|
||||||
|
with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
|
||||||
|
form = aiohttp.FormData()
|
||||||
|
form.add_field("file", f, content_type="audio/wav")
|
||||||
|
for key, value in payload.items():
|
||||||
|
form.add_field(key, str(value))
|
||||||
|
|
||||||
|
output = RequestFuncOutput()
|
||||||
|
output.prompt_len = request_func_input.prompt_len
|
||||||
|
|
||||||
|
generated_text = ""
|
||||||
|
ttft = 0.0
|
||||||
|
st = time.perf_counter()
|
||||||
|
most_recent_timestamp = st
|
||||||
|
try:
|
||||||
|
async with session.post(
|
||||||
|
url=api_url, data=form, headers=headers
|
||||||
|
) as response:
|
||||||
|
if response.status == 200:
|
||||||
|
async for chunk_bytes in response.content:
|
||||||
|
chunk_bytes = chunk_bytes.strip()
|
||||||
|
if not chunk_bytes:
|
||||||
|
continue
|
||||||
|
|
||||||
|
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
|
||||||
|
if chunk != "[DONE]":
|
||||||
|
timestamp = time.perf_counter()
|
||||||
|
data = json.loads(chunk)
|
||||||
|
|
||||||
|
if choices := data.get("choices"):
|
||||||
|
content = choices[0]["delta"].get("content")
|
||||||
|
# First token
|
||||||
|
if ttft == 0.0:
|
||||||
|
ttft = timestamp - st
|
||||||
|
output.ttft = ttft
|
||||||
|
|
||||||
|
# Decoding phase
|
||||||
|
else:
|
||||||
|
output.itl.append(
|
||||||
|
timestamp - most_recent_timestamp
|
||||||
|
)
|
||||||
|
|
||||||
|
generated_text += content or ""
|
||||||
|
elif usage := data.get("usage"):
|
||||||
|
output.output_tokens = usage.get(
|
||||||
|
"completion_tokens"
|
||||||
|
)
|
||||||
|
|
||||||
|
most_recent_timestamp = timestamp
|
||||||
|
|
||||||
|
output.generated_text = generated_text
|
||||||
|
output.success = True
|
||||||
|
output.latency = most_recent_timestamp - st
|
||||||
|
else:
|
||||||
|
output.error = response.reason or ""
|
||||||
|
output.success = False
|
||||||
|
except Exception:
|
||||||
|
output.success = False
|
||||||
|
exc_info = sys.exc_info()
|
||||||
|
output.error = "".join(traceback.format_exception(*exc_info))
|
||||||
|
|
||||||
|
if pbar:
|
||||||
|
pbar.update(1)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
def get_model(pretrained_model_name_or_path: str) -> str:
|
def get_model(pretrained_model_name_or_path: str) -> str:
|
||||||
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
|
if os.getenv("VLLM_USE_MODELSCOPE", "False").lower() == "true":
|
||||||
from modelscope import snapshot_download
|
from modelscope import snapshot_download
|
||||||
|
|
||||||
from vllm.model_executor.model_loader.weight_utils import get_lock
|
from vllm.model_executor.model_loader.weight_utils import get_lock
|
||||||
@ -440,7 +561,8 @@ def get_model(pretrained_model_name_or_path: str) -> str:
|
|||||||
model_path = snapshot_download(
|
model_path = snapshot_download(
|
||||||
model_id=pretrained_model_name_or_path,
|
model_id=pretrained_model_name_or_path,
|
||||||
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
|
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
|
||||||
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
|
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
|
||||||
|
)
|
||||||
|
|
||||||
return model_path
|
return model_path
|
||||||
return pretrained_model_name_or_path
|
return pretrained_model_name_or_path
|
||||||
@ -453,23 +575,23 @@ def get_tokenizer(
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
|
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
|
||||||
if pretrained_model_name_or_path is not None and not os.path.exists(
|
if pretrained_model_name_or_path is not None and not os.path.exists(
|
||||||
pretrained_model_name_or_path):
|
pretrained_model_name_or_path
|
||||||
pretrained_model_name_or_path = get_model(
|
):
|
||||||
pretrained_model_name_or_path)
|
pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
|
||||||
if tokenizer_mode == "slow":
|
if tokenizer_mode == "slow":
|
||||||
if kwargs.get("use_fast", False):
|
if kwargs.get("use_fast", False):
|
||||||
raise ValueError(
|
raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
|
||||||
"Cannot use the fast tokenizer in slow tokenizer mode.")
|
|
||||||
kwargs["use_fast"] = False
|
kwargs["use_fast"] = False
|
||||||
if tokenizer_mode == "mistral":
|
if tokenizer_mode == "mistral":
|
||||||
try:
|
try:
|
||||||
from vllm.transformers_utils.tokenizer import MistralTokenizer
|
from vllm.transformers_utils.tokenizer import MistralTokenizer
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
raise ImportError("MistralTokenizer requires vllm package.\n"
|
raise ImportError(
|
||||||
"Please install it with `pip install vllm` "
|
"MistralTokenizer requires vllm package.\n"
|
||||||
"to use mistral tokenizer mode.") from e
|
"Please install it with `pip install vllm` "
|
||||||
return MistralTokenizer.from_pretrained(
|
"to use mistral tokenizer mode."
|
||||||
str(pretrained_model_name_or_path))
|
) from e
|
||||||
|
return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path))
|
||||||
else:
|
else:
|
||||||
return AutoTokenizer.from_pretrained(
|
return AutoTokenizer.from_pretrained(
|
||||||
pretrained_model_name_or_path,
|
pretrained_model_name_or_path,
|
||||||
@ -485,7 +607,15 @@ ASYNC_REQUEST_FUNCS = {
|
|||||||
"deepspeed-mii": async_request_deepspeed_mii,
|
"deepspeed-mii": async_request_deepspeed_mii,
|
||||||
"openai": async_request_openai_completions,
|
"openai": async_request_openai_completions,
|
||||||
"openai-chat": async_request_openai_chat_completions,
|
"openai-chat": async_request_openai_chat_completions,
|
||||||
|
"openai-audio": async_request_openai_audio,
|
||||||
"tensorrt-llm": async_request_trt_llm,
|
"tensorrt-llm": async_request_trt_llm,
|
||||||
"scalellm": async_request_openai_completions,
|
"scalellm": async_request_openai_completions,
|
||||||
"sglang": async_request_openai_completions,
|
"sglang": async_request_openai_completions,
|
||||||
|
"llama.cpp": async_request_openai_completions,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
OPENAI_COMPATIBLE_BACKENDS = [
|
||||||
|
k
|
||||||
|
for k, v in ASYNC_REQUEST_FUNCS.items()
|
||||||
|
if v in (async_request_openai_completions, async_request_openai_chat_completions)
|
||||||
|
]
|
||||||
|
|||||||
File diff suppressed because it is too large
@@ -6,14 +6,13 @@ import dataclasses
 import json
 import os
 import time
-from pathlib import Path
 from typing import Any, Optional
 
 import numpy as np
-import torch
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from tqdm import tqdm
 
+import vllm.envs as envs
+from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptType
@@ -21,13 +20,14 @@ from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser
 
 
-def save_to_pytorch_benchmark_format(args: argparse.Namespace,
-                                     results: dict[str, Any]) -> None:
+def save_to_pytorch_benchmark_format(
+    args: argparse.Namespace, results: dict[str, Any]
+) -> None:
     pt_records = convert_to_pytorch_benchmark_format(
         args=args,
         metrics={"latency": results["latencies"]},
-        extra_info={k: results[k]
-                    for k in ["avg_latency", "percentiles"]})
+        extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
+    )
     if pt_records:
         pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
         write_to_json(pt_file, pt_records)
@@ -42,9 +42,11 @@ def main(args: argparse.Namespace):
     # the engine will automatically process the request in multiple batches.
     llm = LLM(**dataclasses.asdict(engine_args))
     assert llm.llm_engine.model_config.max_model_len >= (
-        args.input_len +
-        args.output_len), ("Please ensure that max_model_len is greater than"
-                           " the sum of input_len and output_len.")
+        args.input_len + args.output_len
+    ), (
+        "Please ensure that max_model_len is greater than"
+        " the sum of input_len and output_len."
+    )
 
     sampling_params = SamplingParams(
         n=args.n,
@@ -55,18 +57,16 @@ def main(args: argparse.Namespace):
         detokenize=not args.disable_detokenize,
     )
     print(sampling_params)
-    dummy_prompt_token_ids = np.random.randint(10000,
-                                               size=(args.batch_size,
-                                                     args.input_len))
-    dummy_prompts: list[PromptType] = [{
-        "prompt_token_ids": batch
-    } for batch in dummy_prompt_token_ids.tolist()]
+    dummy_prompt_token_ids = np.random.randint(
+        10000, size=(args.batch_size, args.input_len)
+    )
+    dummy_prompts: list[PromptType] = [
+        {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
+    ]
 
     def llm_generate():
         if not args.use_beam_search:
-            llm.generate(dummy_prompts,
-                         sampling_params=sampling_params,
-                         use_tqdm=False)
+            llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
         else:
             llm.beam_search(
                 dummy_prompts,
@@ -79,16 +79,9 @@ def main(args: argparse.Namespace):
 
     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
-            with torch.profiler.profile(
-                    activities=[
-                        torch.profiler.ProfilerActivity.CPU,
-                        torch.profiler.ProfilerActivity.CUDA,
-                    ],
-                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                        str(profile_dir)),
-            ) as p:
-                llm_generate()
-            print(p.key_averages().table(sort_by="self_cuda_time_total"))
+            llm.start_profile()
+            llm_generate()
+            llm.stop_profile()
         else:
             start_time = time.perf_counter()
             llm_generate()
@@ -101,10 +94,7 @@ def main(args: argparse.Namespace):
         run_to_completion(profile_dir=None)
 
     if args.profile:
-        profile_dir = args.profile_result_dir
-        if not profile_dir:
-            profile_dir = (Path(".") / "vllm_benchmark_result" /
-                           f"latency_result_{time.time()}")
+        profile_dir = envs.VLLM_TORCH_PROFILER_DIR
         print(f"Profiling (results will be saved to '{profile_dir}')...")
         run_to_completion(profile_dir=profile_dir)
         return
@@ -135,7 +125,8 @@ def main(args: argparse.Namespace):
 if __name__ == "__main__":
     parser = FlexibleArgumentParser(
         description="Benchmark the latency of processing a single batch of "
-        "requests till completion.")
+        "requests till completion."
+    )
     parser.add_argument("--input-len", type=int, default=32)
     parser.add_argument("--output-len", type=int, default=128)
     parser.add_argument("--batch-size", type=int, default=8)
@@ -152,22 +143,14 @@ if __name__ == "__main__":
         default=10,
         help="Number of iterations to run for warmup.",
     )
-    parser.add_argument("--num-iters",
-                        type=int,
-                        default=30,
-                        help="Number of iterations to run.")
+    parser.add_argument(
+        "--num-iters", type=int, default=30, help="Number of iterations to run."
+    )
     parser.add_argument(
         "--profile",
         action="store_true",
         help="profile the generation process of a single batch",
     )
-    parser.add_argument(
-        "--profile-result-dir",
-        type=str,
-        default=None,
-        help=("path to save the pytorch profiler output. Can be visualized "
-              "with ui.perfetto.dev or Tensorboard."),
-    )
     parser.add_argument(
         "--output-json",
         type=str,
@@ -177,10 +160,20 @@ if __name__ == "__main__":
     parser.add_argument(
         "--disable-detokenize",
         action="store_true",
-        help=("Do not detokenize responses (i.e. do not include "
-              "detokenization time in the latency measurement)"),
+        help=(
+            "Do not detokenize responses (i.e. do not include "
+            "detokenization time in the latency measurement)"
+        ),
     )
 
     parser = EngineArgs.add_cli_args(parser)
+    # V1 enables prefix caching by default which skews the latency
+    # numbers. We need to disable prefix caching by default.
+    parser.set_defaults(enable_prefix_caching=False)
     args = parser.parse_args()
+    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
+        raise OSError(
+            "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
+            "Please set it to a valid path to use torch profiler."
+        )
     main(args)
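With this change the latency benchmark defers profiling to vLLM's built-in torch profiler, driven by the VLLM_TORCH_PROFILER_DIR environment variable instead of the removed --profile-result-dir flag. A rough invocation sketch, where the output path is an assumption:

    # hypothetical usage sketch, not part of the diff
    import os
    import subprocess

    env = dict(os.environ, VLLM_TORCH_PROFILER_DIR="/tmp/vllm_profile")  # assumed path
    subprocess.run(
        ["python", "benchmarks/benchmark_latency.py", "--profile"],
        env=env,
        check=True,
    )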
@@ -76,7 +76,7 @@ def repeat_prompts(prompts, repeat_count, mode: str):
             - 'random': Shuffle the prompts randomly after repetition.
             - 'tile': Repeat the entire prompt list in sequence.
               Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3].
             - 'interleave': Repeat each prompt consecutively before moving to
               the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3].
 
     Returns:
@@ -86,20 +86,21 @@ def repeat_prompts(prompts, repeat_count, mode: str):
         ValueError: If an invalid mode is provided.
     """
     print("Repeat mode: ", mode)
-    if mode == 'random':
+    if mode == "random":
         repeated_prompts = prompts * repeat_count
         random.shuffle(repeated_prompts)
         return repeated_prompts
-    elif mode == 'tile':
+    elif mode == "tile":
         return prompts * repeat_count
-    elif mode == 'interleave':
+    elif mode == "interleave":
         repeated_prompts = []
         for prompt in prompts:
             repeated_prompts.extend([prompt] * repeat_count)
         return repeated_prompts
     else:
-        raise ValueError(f"Invalid mode: {mode}, only support "
-                         "'random', 'tile', 'interleave'")
+        raise ValueError(
+            f"Invalid mode: {mode}, only support 'random', 'tile', 'interleave'"
+        )
 
 
 def main(args):
@@ -109,16 +110,16 @@ def main(args):
     # we append the document id at the beginning to avoid any of the document
     # being the prefix of other documents
     prompts = [
-        str(i) + ' '.join(['hi'] * args.document_length)
+        str(i) + " ".join(["hi"] * args.document_length)
         for i in range(args.num_documents)
     ]
 
     prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode)
 
     warmup_prompts = [
-        "This is warm up request " + str(i) + \
-            ' '.join(['hi'] * args.document_length)
-        for i in range(args.num_documents)]
+        "This is warm up request " + str(i) + " ".join(["hi"] * args.document_length)
+        for i in range(args.num_documents)
+    ]
 
     # Create the LLM engine
     engine_args = EngineArgs.from_cli_args(args)
@@ -142,42 +143,52 @@ def main(args):
 
 if __name__ == "__main__":
     parser = FlexibleArgumentParser(
-        description=
-        'Benchmark the performance with or without automatic prefix caching.')
+        description="Benchmark the performance with or "
+        "without automatic prefix caching."
+    )
 
     parser.add_argument(
-        '--document-length',
+        "--document-length",
         type=int,
         # Roughly the number of tokens for a system paper,
         # excluding images
         default=20000,
-        help='Range of input lengths for sampling prompts,'
-        'specified as "min:max" (e.g., "128:256").')
+        help="Range of input lengths for sampling prompts, "
+        'specified as "min:max" (e.g., "128:256").',
+    )
 
-    parser.add_argument('--num-documents',
-                        type=int,
-                        default=8,
-                        help='Range of input lengths for sampling prompts,'
-                        'specified as "min:max" (e.g., "128:256").')
+    parser.add_argument(
+        "--num-documents",
+        type=int,
+        default=8,
+        help="Range of input lengths for sampling prompts, "
+        'specified as "min:max" (e.g., "128:256").',
+    )
 
-    parser.add_argument('--output-len', type=int, default=10)
+    parser.add_argument("--output-len", type=int, default=10)
 
-    parser.add_argument('--repeat-count',
-                        type=int,
-                        default=2,
-                        help='Number of times to repeat each prompt')
+    parser.add_argument(
+        "--repeat-count",
+        type=int,
+        default=2,
+        help="Number of times to repeat each prompt",
+    )
 
-    parser.add_argument("--repeat-mode",
-                        type=str,
-                        default='random',
-                        help='The mode to repeat prompts. The supported '
-                        'modes are "random", "tile", and "interleave". '
-                        'See repeat_prompts() in the source code for details.')
+    parser.add_argument(
+        "--repeat-mode",
+        type=str,
+        default="random",
+        help="The mode to repeat prompts. The supported "
+        'modes are "random", "tile", and "interleave". '
+        "See repeat_prompts() in the source code for details.",
+    )
 
-    parser.add_argument("--shuffle-seed",
-                        type=int,
-                        default=0,
-                        help='Random seed when the repeat mode is "random"')
+    parser.add_argument(
+        "--shuffle-seed",
+        type=int,
+        default=0,
+        help='Random seed when the repeat mode is "random"',
+    )
 
     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()
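For reference, the three repeat modes documented in the docstring above behave roughly as in this standalone sketch, re-implemented here purely for illustration:

    # hypothetical illustration, not the benchmark's own helper
    import random

    prompts = ["p1", "p2", "p3"]
    repeat_count = 2

    tile = prompts * repeat_count  # [p1, p2, p3, p1, p2, p3]
    interleave = [p for p in prompts for _ in range(repeat_count)]  # [p1, p1, p2, p2, p3, p3]
    shuffled = prompts * repeat_count
    random.shuffle(shuffled)  # random order of the repeated prompts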
@@ -63,14 +63,15 @@ class Request:
     output_len: int
 
 
-def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> str:
+def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]:
     vocab = tokenizer.get_vocab()
+    all_special_ids = set(tokenizer.all_special_ids)
+
     # Remove the special tokens.
-    vocab = {
-        k: v
-        for k, v in vocab.items() if k not in tokenizer.all_special_ids
-    }
-    return random.choices(list(vocab.values()), k=length)
+    return random.choices(
+        [v for k, v in vocab.items() if k not in all_special_ids],
+        k=length,
+    )
 
 
 def sample_requests_from_dataset(
@@ -89,8 +90,10 @@ def sample_requests_from_dataset(
     # Filter out the conversations with less than 2 turns.
     dataset = [data for data in dataset if len(data["conversations"]) >= 2]
     # Only keep the first two turns of each conversation.
-    dataset = [(data["conversations"][0]["value"],
-                data["conversations"][1]["value"]) for data in dataset]
+    dataset = [
+        (data["conversations"][0]["value"], data["conversations"][1]["value"])
+        for data in dataset
+    ]
 
     # Shuffle the dataset.
     random.shuffle(dataset)
@@ -111,8 +114,9 @@ def sample_requests_from_dataset(
         completion = dataset[i][1]
         completion_token_ids = tokenizer(completion).input_ids
         prompt_len = len(prompt_token_ids)
-        output_len = (len(completion_token_ids)
-                      if fixed_output_len is None else fixed_output_len)
+        output_len = (
+            len(completion_token_ids) if fixed_output_len is None else fixed_output_len
+        )
         if min_len <= prompt_len <= max_len:
             filtered_requests.append(Request(prompt, prompt_len, output_len))
 
@@ -126,27 +130,27 @@ def sample_requests_from_random(
     fixed_output_len: Optional[int],
     prefix_len: int,
 ) -> list[Request]:
 
     requests = []
     prefix_token_ids = sample_tokens(tokenizer, prefix_len)
     min_len, max_len = input_length_range
 
     for i in range(num_requests):
         unique_part_token_ids = sample_tokens(
-            tokenizer,
-            random.randint(min_len - prefix_len, max_len - prefix_len))
+            tokenizer, random.randint(min_len - prefix_len, max_len - prefix_len)
+        )
         prompt_token_ids = prefix_token_ids + unique_part_token_ids
         prompt = tokenizer.decode(prompt_token_ids)
         prompt_len = len(prompt_token_ids)
-        assert (min_len <= prompt_len <= max_len
-                ), f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
+        assert min_len <= prompt_len <= max_len, (
+            f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
+        )
         requests.append(Request(prompt, prompt_len, fixed_output_len))
     return requests
 
 
-def repeat_and_sort_requests(requests: list[Request],
-                             repeat_count: int,
-                             sort: bool = False) -> list[str]:
+def repeat_and_sort_requests(
+    requests: list[Request], repeat_count: int, sort: bool = False
+) -> list[str]:
     repeated_requests = requests * repeat_count
     if sort:
         repeated_requests.sort(key=lambda x: x[1])
@@ -157,14 +161,14 @@ def repeat_and_sort_requests(requests: list[Request],
 
 def main(args):
     tokenizer = get_tokenizer(args.model, trust_remote_code=True)
-    input_length_range = tuple(map(int, args.input_length_range.split(':')))
+    input_length_range = tuple(map(int, args.input_length_range.split(":")))
     random.seed(args.seed)
     if args.dataset_path is not None:
         if args.prefix_len > 0:
-            raise ValueError("prefix-len is not supported when "
-                             "dataset-path is provided.")
-        print(f"Start to sample {args.num_prompts} prompts "
-              f"from {args.dataset_path}")
+            raise ValueError(
+                "prefix-len is not supported when dataset-path is provided."
+            )
+        print(f"Start to sample {args.num_prompts} prompts from {args.dataset_path}")
         filtered_requests = sample_requests_from_dataset(
             dataset_path=args.dataset_path,
             num_requests=args.num_prompts,
@@ -194,14 +198,16 @@ def main(args):
 
     llm = LLM(**dataclasses.asdict(engine_args))
 
-    sampling_params = SamplingParams(temperature=0,
-                                     max_tokens=args.output_len,
-                                     detokenize=not args.disable_detokenize)
+    sampling_params = SamplingParams(
+        temperature=0,
+        max_tokens=args.output_len,
+        detokenize=not args.disable_detokenize,
+    )
 
     print("Testing filtered requests")
-    prompts = repeat_and_sort_requests(filtered_requests,
-                                       repeat_count=args.repeat_count,
-                                       sort=args.sort)
+    prompts = repeat_and_sort_requests(
+        filtered_requests, repeat_count=args.repeat_count, sort=args.sort
+    )
 
     print("------start generating------")
     test_prefix(
@@ -213,29 +219,35 @@ def main(args):
 
 if __name__ == "__main__":
     parser = FlexibleArgumentParser(
-        description=
-        'Benchmark the performance with or without automatic prefix caching.')
-    parser.add_argument("--dataset-path",
-                        type=str,
-                        default=None,
-                        help="Path to the dataset.")
-    parser.add_argument('--output-len', type=int, default=10)
-    parser.add_argument('--num-prompts',
-                        type=int,
-                        required=True,
-                        help="Number of the prompts sampled from dataset")
-    parser.add_argument('--repeat-count',
-                        type=int,
-                        default=1,
-                        help='Number of times to repeat each prompt')
-    parser.add_argument('--sort',
-                        action='store_true',
-                        help='Sort prompts by input length')
-    parser.add_argument('--input-length-range',
-                        type=str,
-                        required=True,
-                        help='Range of input lengths for sampling prompts,'
-                        'specified as "min:max" (e.g., "128:256").')
+        description="Benchmark the performance with or without "
+        "automatic prefix caching."
+    )
+    parser.add_argument(
+        "--dataset-path", type=str, default=None, help="Path to the dataset."
+    )
+    parser.add_argument("--output-len", type=int, default=10)
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        required=True,
+        help="Number of the prompts sampled from dataset",
+    )
+    parser.add_argument(
+        "--repeat-count",
+        type=int,
+        default=1,
+        help="Number of times to repeat each prompt",
+    )
+    parser.add_argument(
+        "--sort", action="store_true", help="Sort prompts by input length"
+    )
+    parser.add_argument(
+        "--input-length-range",
+        type=str,
+        required=True,
+        help="Range of input lengths for sampling prompts,"
+        'specified as "min:max" (e.g., "128:256").',
+    )
     parser.add_argument(
         "--prefix-len",
         type=int,
@@ -246,10 +258,12 @@ if __name__ == "__main__":
         "when dataset-path is not provided.",
     )
     parser.add_argument(
-        '--disable-detokenize',
-        action='store_true',
-        help=("Do not detokenize responses (i.e. do not include "
-              "detokenization time in the latency measurement)"),
+        "--disable-detokenize",
+        action="store_true",
+        help=(
+            "Do not detokenize responses (i.e. do not include "
+            "detokenization time in the latency measurement)"
+        ),
     )
 
     parser = EngineArgs.add_cli_args(parser)
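The reworked sample_tokens above now returns token ids with special tokens filtered out. A small hedged sketch of the same idea outside the benchmark; the gpt2 tokenizer and the prefix length are examples only:

    # hypothetical illustration, not the benchmark's own helper
    import random

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")  # example tokenizer
    special_ids = set(tok.all_special_ids)
    candidate_ids = [v for v in tok.get_vocab().values() if v not in special_ids]
    prefix_token_ids = random.choices(candidate_ids, k=16)  # 16 random non-special token ids
    print(tok.decode(prefix_token_ids))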
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 """Benchmark offline prioritization."""
+
 import argparse
 import dataclasses
 import json
@@ -13,7 +14,7 @@ from vllm.engine.arg_utils import EngineArgs
 from vllm.utils import FlexibleArgumentParser
 
 
-#Select a equi-probable random priority
+# Select a equi-probable random priority
 def get_random_flag():
     return 0 if random.random() < 0.5 else 1
 
@@ -33,8 +34,10 @@ def sample_requests(
     # Filter out the conversations with less than 2 turns.
     dataset = [data for data in dataset if len(data["conversations"]) >= 2]
     # Only keep the first two turns of each conversation.
-    dataset = [(data["conversations"][0]["value"],
-                data["conversations"][1]["value"]) for data in dataset]
+    dataset = [
+        (data["conversations"][0]["value"], data["conversations"][1]["value"])
+        for data in dataset
+    ]
 
     # Shuffle the dataset.
     random.shuffle(dataset)
@@ -51,8 +54,9 @@ def sample_requests(
         completion = dataset[i][1]
         completion_token_ids = tokenizer(completion).input_ids
         prompt_len = len(prompt_token_ids)
-        output_len = len(completion_token_ids
-                         ) if fixed_output_len is None else fixed_output_len
+        output_len = (
+            len(completion_token_ids) if fixed_output_len is None else fixed_output_len
+        )
         if prompt_len < 4 or output_len < 4:
             # Prune too short sequences.
             continue
@@ -74,13 +78,16 @@ def run_vllm(
     disable_detokenize: bool = False,
 ) -> float:
     from vllm import LLM, SamplingParams
 
     llm = LLM(**dataclasses.asdict(engine_args))
 
     assert all(
         llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
-        for request in requests), (
-            "Please ensure that max_model_len is greater than the sum of"
-            " input_len and output_len for all requests.")
+        for request in requests
+    ), (
+        "Please ensure that max_model_len is greater than the sum of"
+        " input_len and output_len for all requests."
+    )
 
     # Add the requests to the engine.
     prompts = []
@@ -97,7 +104,8 @@ def run_vllm(
                 ignore_eos=True,
                 max_tokens=output_len,
                 detokenize=not disable_detokenize,
-            ))
+            )
+        )
 
     start = time.perf_counter()
     llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True)
@@ -111,26 +119,33 @@ def main(args: argparse.Namespace):
 
     # Sample the requests.
     tokenizer = AutoTokenizer.from_pretrained(
-        args.tokenizer, trust_remote_code=args.trust_remote_code)
+        args.tokenizer, trust_remote_code=args.trust_remote_code
+    )
     if args.dataset is None:
         # Synthesize a prompt with the given input length.
         prompt = "hi" * (args.input_len - 1)
-        requests = [(prompt, args.input_len, args.output_len,
-                     get_random_flag()) for _ in range(args.num_prompts)]
+        requests = [
+            (prompt, args.input_len, args.output_len, get_random_flag())
+            for _ in range(args.num_prompts)
+        ]
     else:
-        requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
-                                   args.output_len)
+        requests = sample_requests(
+            args.dataset, args.num_prompts, tokenizer, args.output_len
+        )
 
     if args.backend == "vllm":
-        elapsed_time = run_vllm(requests, args.n,
-                                EngineArgs.from_cli_args(args),
-                                args.disable_detokenize)
+        elapsed_time = run_vllm(
+            requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
+        )
     else:
         raise ValueError(f"Unknown backend: {args.backend}")
-    total_num_tokens = sum(prompt_len + output_len
-                           for _, prompt_len, output_len, priority in requests)
-    print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
-          f"{total_num_tokens / elapsed_time:.2f} tokens/s")
+    total_num_tokens = sum(
+        prompt_len + output_len for _, prompt_len, output_len, priority in requests
+    )
+    print(
+        f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
+        f"{total_num_tokens / elapsed_time:.2f} tokens/s"
+    )
 
     # Output JSON results if specified
     if args.output_json:
@@ -147,41 +162,44 @@ def main(args: argparse.Namespace):
 
 if __name__ == "__main__":
     parser = FlexibleArgumentParser(description="Benchmark the throughput.")
-    parser.add_argument("--backend",
-                        type=str,
-                        choices=["vllm", "hf", "mii"],
-                        default="vllm")
-    parser.add_argument("--dataset",
-                        type=str,
-                        default=None,
-                        help="Path to the dataset.")
-    parser.add_argument("--input-len",
-                        type=int,
-                        default=None,
-                        help="Input prompt length for each request")
-    parser.add_argument("--output-len",
-                        type=int,
-                        default=None,
-                        help="Output length for each request. Overrides the "
-                        "output length from the dataset.")
-    parser.add_argument("--n",
-                        type=int,
-                        default=1,
-                        help="Number of generated sequences per prompt.")
-    parser.add_argument("--num-prompts",
-                        type=int,
-                        default=200,
-                        help="Number of prompts to process.")
     parser.add_argument(
-        '--output-json',
+        "--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm"
+    )
+    parser.add_argument(
+        "--dataset", type=str, default=None, help="Path to the dataset."
+    )
+    parser.add_argument(
+        "--input-len",
+        type=int,
+        default=None,
+        help="Input prompt length for each request",
+    )
+    parser.add_argument(
+        "--output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the "
+        "output length from the dataset.",
+    )
+    parser.add_argument(
+        "--n", type=int, default=1, help="Number of generated sequences per prompt."
+    )
+    parser.add_argument(
+        "--num-prompts", type=int, default=200, help="Number of prompts to process."
+    )
+    parser.add_argument(
+        "--output-json",
         type=str,
         default=None,
-        help='Path to save the throughput results in JSON format.')
+        help="Path to save the throughput results in JSON format.",
+    )
     parser.add_argument(
-        '--disable-detokenize',
-        action='store_true',
-        help=("Do not detokenize responses (i.e. do not include "
-              "detokenization time in the latency measurement)"),
+        "--disable-detokenize",
+        action="store_true",
+        help=(
+            "Do not detokenize responses (i.e. do not include "
+            "detokenization time in the latency measurement)"
+        ),
     )
 
     parser = EngineArgs.add_cli_args(parser)
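The prioritization benchmark above assigns each request an equi-probable 0/1 priority and forwards it to llm.generate(..., priority=...). A minimal hedged sketch of that flow; the model name and the priority scheduling flag are assumptions, not part of this diff:

    # hypothetical usage sketch, not part of the diff
    import random

    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m", scheduling_policy="priority")  # assumed small model
    prompts = ["hello"] * 8
    priorities = [0 if random.random() < 0.5 else 1 for _ in range(len(prompts))]
    params = SamplingParams(temperature=0, max_tokens=8)
    outputs = llm.generate(prompts, params, priority=priorities)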
File diff suppressed because it is too large
@@ -5,16 +5,13 @@ On the server side, run one of the following commands:
     (vLLM OpenAI API server)
     vllm serve <your_model> --disable-log-requests
 
-    (TGI backend)
-    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
-
 On the client side, run:
     python benchmarks/benchmark_serving_structured_output.py \
         --backend <backend> \
         --model <your_model> \
         --dataset json \
         --structured-output-ratio 1.0 \
-        --structured-output-backend xgrammar \
+        --structured-output-backend auto \
         --request-rate 10 \
         --num-prompts 1000
 
@@ -22,6 +19,7 @@ On the client side, run:
         --endpoint /generate_stream
 to the end of the command above.
 """
+
 import argparse
 import asyncio
 import copy
@@ -39,11 +37,15 @@ from typing import Optional
 import datasets
 import numpy as np
 import pandas as pd
-from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
-                                  RequestFuncOutput)
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 
+from backend_request_func import (
+    ASYNC_REQUEST_FUNCS,
+    RequestFuncInput,
+    RequestFuncOutput,
+)
+
 try:
     from vllm.transformers_utils.tokenizer import get_tokenizer
 except ImportError:
@@ -54,8 +56,9 @@
 except ImportError:
     from argparse import ArgumentParser as FlexibleArgumentParser
 
-from vllm.v1.structured_output.utils import (
-    has_xgrammar_unsupported_json_features)
+from vllm.v1.structured_output.backend_xgrammar import (
+    has_xgrammar_unsupported_json_features,
+)
 
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
 
@@ -101,6 +104,7 @@ class SampleRequest:
         prompt_len: The length of the prompt in tokens.
         expected_output_len: The expected length of the output in tokens.
     """
+
     prompt: str
     prompt_len: int
     expected_output_len: int
@@ -109,60 +113,61 @@ class SampleRequest:
     completion: str = None
 
 
-def sample_requests(tokenizer: PreTrainedTokenizerBase,
-                    args: argparse.Namespace) -> list[SampleRequest]:
-    if args.dataset == 'json' or args.dataset == 'json-unique':
+def sample_requests(
+    tokenizer: PreTrainedTokenizerBase, args: argparse.Namespace
+) -> list[SampleRequest]:
+    if args.dataset == "json" or args.dataset == "json-unique":
         if args.json_schema_path is None:
             dir_path = os.path.dirname(os.path.realpath(__file__))
-            args.json_schema_path = os.path.join(dir_path,
-                                                 "structured_schemas",
-                                                 "structured_schema_1.json")
+            args.json_schema_path = os.path.join(
+                dir_path, "structured_schemas", "structured_schema_1.json"
+            )
         json_schemas = []
         with open(args.json_schema_path) as f:
             schema = json.load(f)
 
-        if args.dataset == 'json-unique':
-            json_schemas = [
-                copy.deepcopy(schema) for _ in range(args.num_prompts)
-            ]
+        if args.dataset == "json-unique":
+            json_schemas = [copy.deepcopy(schema) for _ in range(args.num_prompts)]
             for i in range(len(json_schemas)):
-                json_schemas[i]["properties"][
-                    f"__optional_field_{uuid.uuid4()}"] = {
-                        "type":
-                        "string",
-                        "description":
-                        "An unique optional field to avoid cached schemas"
-                    }
+                if "properties" not in json_schemas[i]:
+                    json_schemas[i]["properties"] = {}
+                json_schemas[i]["properties"][f"__optional_field_{uuid.uuid4()}"] = {
+                    "type": "string",
+                    "description": "An unique optional field to avoid cached schemas",
+                }
+        else:
+            json_schemas = [schema] * args.num_prompts
 
         def gen_prompt(index: int):
-            schema = json_schemas[index % len(json_schemas)]
-            return f"Generate an example of a user profile given the following schema: {json.dumps(schema)}"  # noqa: E501
+            return f"Generate an example of a brief user profile given the following schema: {json.dumps(get_schema(index))}"  # noqa: E501
 
         def get_schema(index: int):
             return json_schemas[index % len(json_schemas)]
 
         requests = [
-            SampleRequest(prompt=gen_prompt(i),
-                          prompt_len=len(tokenizer(gen_prompt(i)).input_ids),
-                          expected_output_len=args.output_len,
-                          schema=get_schema(i),
-                          structure_type=args.structure_type)
+            SampleRequest(
+                prompt=gen_prompt(i),
+                prompt_len=len(tokenizer(gen_prompt(i)).input_ids),
+                expected_output_len=args.output_len,
+                schema=get_schema(i),
+                structure_type=args.structure_type,
+            )
             for i in range(args.num_prompts)
         ]
 
     elif args.dataset == "grammar":
         schema = """
-            ?start: select_statement
+            root ::= select_statement
 
-            ?select_statement: "SELECT " column_list " FROM " table_name
+            select_statement ::= "SELECT " column " from " table " where " condition
 
-            ?column_list: column_name ("," column_name)*
+            column ::= "col_1 " | "col_2 "
 
-            ?table_name: identifier
+            table ::= "table_1 " | "table_2 "
 
-            ?column_name: identifier
+            condition ::= column "= " number
 
-            ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
+            number ::= "1 " | "2 "
         """
         prompt = "Generate an SQL query to show the 'username' \
            and 'email' from the 'users' table."
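The grammar dataset now ships a GBNF-style grammar instead of the earlier Lark-style one. A hedged sketch of how such a grammar can be exercised against a running vLLM OpenAI-compatible server; the endpoint, model placeholder, and the guided_grammar key mirror the benchmark's extra_body usage but are assumptions here:

    # hypothetical client-side sketch, not part of the diff
    from openai import OpenAI

    grammar = 'root ::= "SELECT " ("col_1 " | "col_2 ") "from table_1"'  # toy GBNF grammar
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    completion = client.completions.create(
        model="<your_model>",
        prompt="Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
        max_tokens=64,
        extra_body={"guided_grammar": grammar},
    )
    print(completion.choices[0].text)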
@ -170,11 +175,13 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
|
|||||||
input_len = len(tokenizer(prompt).input_ids)
|
input_len = len(tokenizer(prompt).input_ids)
|
||||||
print(f"Input length of the prompt: {input_len} tokens")
|
print(f"Input length of the prompt: {input_len} tokens")
|
||||||
requests = [
|
requests = [
|
||||||
SampleRequest(prompt=prompt,
|
SampleRequest(
|
||||||
prompt_len=input_len,
|
prompt=prompt,
|
||||||
expected_output_len=args.output_len,
|
prompt_len=input_len,
|
||||||
schema=schema,
|
expected_output_len=args.output_len,
|
||||||
structure_type=args.structure_type)
|
schema=schema,
|
||||||
|
structure_type=args.structure_type,
|
||||||
|
)
|
||||||
for _ in range(args.num_prompts)
|
for _ in range(args.num_prompts)
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -188,11 +195,13 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
|
|||||||
input_len = len(tokenizer(prompt).input_ids)
|
input_len = len(tokenizer(prompt).input_ids)
|
||||||
print(f"Input length of the prompt: {input_len} tokens")
|
print(f"Input length of the prompt: {input_len} tokens")
|
||||||
requests = [
|
requests = [
|
||||||
SampleRequest(prompt=prompt,
|
SampleRequest(
|
||||||
prompt_len=input_len,
|
prompt=prompt,
|
||||||
expected_output_len=args.output_len,
|
prompt_len=input_len,
|
||||||
schema=regex,
|
expected_output_len=args.output_len,
|
||||||
structure_type=args.structure_type)
|
schema=regex,
|
||||||
|
structure_type=args.structure_type,
|
||||||
|
)
|
||||||
for _ in range(args.num_prompts)
|
for _ in range(args.num_prompts)
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -203,47 +212,55 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
|
|||||||
input_len = len(tokenizer(prompt).input_ids)
|
input_len = len(tokenizer(prompt).input_ids)
|
||||||
print(f"Input length of the prompt: {input_len} tokens")
|
print(f"Input length of the prompt: {input_len} tokens")
|
||||||
requests = [
|
requests = [
|
||||||
SampleRequest(prompt=prompt,
|
SampleRequest(
|
||||||
prompt_len=input_len,
|
prompt=prompt,
|
||||||
expected_output_len=args.output_len,
|
prompt_len=input_len,
|
||||||
schema=choice,
|
expected_output_len=args.output_len,
|
||||||
structure_type=args.structure_type)
|
schema=choice,
|
||||||
|
structure_type=args.structure_type,
|
||||||
|
)
|
||||||
for _ in range(args.num_prompts)
|
for _ in range(args.num_prompts)
|
||||||
]
|
]
|
||||||
|
|
||||||
elif args.dataset == "xgrammar_bench":
|
elif args.dataset == "xgrammar_bench":
|
||||||
requests: list[SampleRequest] = []
|
requests: list[SampleRequest] = []
|
||||||
dataset = datasets.load_dataset("NousResearch/json-mode-eval",
|
dataset = datasets.load_dataset("NousResearch/json-mode-eval", split="train")
|
||||||
split="train")
|
|
||||||
full_dataset_len = len(dataset)
|
full_dataset_len = len(dataset)
|
||||||
|
|
||||||
def _filter_func(item):
|
def _filter_func(item):
|
||||||
import json
|
import json
|
||||||
|
|
||||||
schema = json.loads(item["schema"])
|
schema = json.loads(item["schema"])
|
||||||
return not has_xgrammar_unsupported_json_features(schema)
|
return not has_xgrammar_unsupported_json_features(schema)
|
||||||
|
|
||||||
dataset = dataset.filter(_filter_func)
|
dataset = dataset.filter(_filter_func)
|
||||||
num_filtered_out = full_dataset_len - len(dataset)
|
num_filtered_out = full_dataset_len - len(dataset)
|
||||||
print(f"dataset has {len(dataset)} entries after filtering "
|
print(
|
||||||
f"out {num_filtered_out} entries with unsupported features")
|
f"dataset has {len(dataset)} entries after filtering "
|
||||||
|
f"out {num_filtered_out} entries with unsupported features"
|
||||||
|
)
|
||||||
len_dataset = len(dataset)
|
len_dataset = len(dataset)
|
||||||
for data_point_idx in range(args.num_prompts):
|
for data_point_idx in range(args.num_prompts):
|
||||||
idx = data_point_idx
|
idx = data_point_idx
|
||||||
while idx >= len_dataset:
|
while idx >= len_dataset:
|
||||||
idx -= len_dataset
|
idx -= len_dataset
|
||||||
schema = dataset["schema"][idx]
|
schema = dataset["schema"][idx]
|
||||||
prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
|
prompt = tokenizer.apply_chat_template(
|
||||||
tokenize=False)
|
dataset["prompt"][idx], tokenize=False, add_generation_prompt=True
|
||||||
|
)
|
||||||
input_len = len(tokenizer(prompt).input_ids)
|
input_len = len(tokenizer(prompt).input_ids)
|
||||||
completion = dataset["completion"][idx]
|
completion = dataset["completion"][idx]
|
||||||
|
|
||||||
requests.append(
|
requests.append(
|
||||||
SampleRequest(prompt=prompt,
|
SampleRequest(
|
||||||
prompt_len=input_len,
|
prompt=prompt,
|
||||||
expected_output_len=args.output_len,
|
prompt_len=input_len,
|
||||||
schema=schema,
|
expected_output_len=args.output_len,
|
||||||
structure_type=args.structure_type,
|
schema=schema,
|
||||||
completion=completion))
|
structure_type=args.structure_type,
|
||||||
|
completion=completion,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
return requests
|
return requests
|
||||||
|
|
||||||
@ -275,7 +292,8 @@ async def get_request(
|
|||||||
|
|
||||||
# Calculate scale parameter theta to maintain the desired request_rate.
|
# Calculate scale parameter theta to maintain the desired request_rate.
|
||||||
assert burstiness > 0, (
|
assert burstiness > 0, (
|
||||||
f"A positive burstiness factor is expected, but given {burstiness}.")
|
f"A positive burstiness factor is expected, but given {burstiness}."
|
||||||
|
)
|
||||||
theta = 1.0 / (request_rate * burstiness)
|
theta = 1.0 / (request_rate * burstiness)
|
||||||
|
|
||||||
for i, request in enumerate(input_requests):
|
for i, request in enumerate(input_requests):
|
||||||
@ -317,8 +335,8 @@ def calculate_metrics(
|
|||||||
# multiple output tokens may be bundled together
|
# multiple output tokens may be bundled together
|
||||||
# Note : this may inflate the output token count slightly
|
# Note : this may inflate the output token count slightly
|
||||||
output_len = len(
|
output_len = len(
|
||||||
tokenizer(outputs[i].generated_text,
|
tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids
|
||||||
add_special_tokens=False).input_ids)
|
)
|
||||||
actual_output_lens.append(output_len)
|
actual_output_lens.append(output_len)
|
||||||
total_input += input_requests[i].prompt_len
|
total_input += input_requests[i].prompt_len
|
||||||
tpot = 0
|
tpot = 0
|
||||||
@ -342,16 +360,19 @@ def calculate_metrics(
|
|||||||
|
|
||||||
if "ttft" in goodput_config_dict:
|
if "ttft" in goodput_config_dict:
|
||||||
valid_metrics.append(ttfts)
|
valid_metrics.append(ttfts)
|
||||||
slo_values.append(goodput_config_dict["ttft"] /
|
slo_values.append(
|
||||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION
|
||||||
|
)
|
||||||
if "tpot" in goodput_config_dict:
|
if "tpot" in goodput_config_dict:
|
||||||
valid_metrics.append(all_tpots)
|
valid_metrics.append(all_tpots)
|
||||||
slo_values.append(goodput_config_dict["tpot"] /
|
slo_values.append(
|
||||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION
|
||||||
|
)
|
||||||
if "e2el" in goodput_config_dict:
|
if "e2el" in goodput_config_dict:
|
||||||
valid_metrics.append(e2els)
|
valid_metrics.append(e2els)
|
||||||
slo_values.append(goodput_config_dict["e2el"] /
|
slo_values.append(
|
||||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION
|
||||||
|
)
|
||||||
|
|
||||||
for req_metric in zip(*valid_metrics):
|
for req_metric in zip(*valid_metrics):
|
||||||
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
|
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
|
||||||
@ -362,7 +383,8 @@ def calculate_metrics(
|
|||||||
warnings.warn(
|
warnings.warn(
|
||||||
"All requests failed. This is likely due to a misconfiguration "
|
"All requests failed. This is likely due to a misconfiguration "
|
||||||
"on the benchmark arguments.",
|
"on the benchmark arguments.",
|
||||||
stacklevel=2)
|
stacklevel=2,
|
||||||
|
)
|
||||||
metrics = BenchmarkMetrics(
|
metrics = BenchmarkMetrics(
|
||||||
completed=completed,
|
completed=completed,
|
||||||
total_input=total_input,
|
total_input=total_input,
|
||||||
@ -371,27 +393,31 @@ def calculate_metrics(
|
|||||||
request_goodput=good_completed / dur_s,
|
request_goodput=good_completed / dur_s,
|
||||||
output_throughput=sum(actual_output_lens) / dur_s,
|
output_throughput=sum(actual_output_lens) / dur_s,
|
||||||
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
|
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
|
||||||
mean_ttft_ms=np.mean(ttfts or 0) *
|
mean_ttft_ms=np.mean(ttfts or 0)
|
||||||
1000, # ttfts is empty if streaming is not supported by backend
|
* 1000, # ttfts is empty if streaming is not supported by backend
|
||||||
std_ttft_ms=np.std(ttfts or 0) * 1000,
|
std_ttft_ms=np.std(ttfts or 0) * 1000,
|
||||||
median_ttft_ms=np.median(ttfts or 0) * 1000,
|
median_ttft_ms=np.median(ttfts or 0) * 1000,
|
||||||
percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
|
percentiles_ttft_ms=[
|
||||||
for p in selected_percentiles],
|
(p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles
|
||||||
|
],
|
||||||
mean_tpot_ms=np.mean(tpots or 0) * 1000,
|
mean_tpot_ms=np.mean(tpots or 0) * 1000,
|
||||||
std_tpot_ms=np.std(tpots or 0) * 1000,
|
std_tpot_ms=np.std(tpots or 0) * 1000,
|
||||||
median_tpot_ms=np.median(tpots or 0) * 1000,
|
median_tpot_ms=np.median(tpots or 0) * 1000,
|
||||||
percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
|
percentiles_tpot_ms=[
|
||||||
for p in selected_percentiles],
|
(p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles
|
||||||
|
],
|
||||||
mean_itl_ms=np.mean(itls or 0) * 1000,
|
mean_itl_ms=np.mean(itls or 0) * 1000,
|
||||||
std_itl_ms=np.std(itls or 0) * 1000,
|
std_itl_ms=np.std(itls or 0) * 1000,
|
||||||
median_itl_ms=np.median(itls or 0) * 1000,
|
median_itl_ms=np.median(itls or 0) * 1000,
|
||||||
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
|
percentiles_itl_ms=[
|
||||||
for p in selected_percentiles],
|
(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles
|
||||||
|
],
|
||||||
mean_e2el_ms=np.mean(e2els or 0) * 1000,
|
mean_e2el_ms=np.mean(e2els or 0) * 1000,
|
||||||
std_e2el_ms=np.std(e2els or 0) * 1000,
|
std_e2el_ms=np.std(e2els or 0) * 1000,
|
||||||
median_e2el_ms=np.median(e2els or 0) * 1000,
|
median_e2el_ms=np.median(e2els or 0) * 1000,
|
||||||
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
|
percentiles_e2el_ms=[
|
||||||
for p in selected_percentiles],
|
(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
|
||||||
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
return metrics, actual_output_lens
|
return metrics, actual_output_lens
|
||||||
@ -413,7 +439,6 @@ async def benchmark(
|
|||||||
ignore_eos: bool,
|
ignore_eos: bool,
|
||||||
max_concurrency: Optional[int],
|
max_concurrency: Optional[int],
|
||||||
structured_output_ratio: float,
|
structured_output_ratio: float,
|
||||||
structured_output_backend: str,
|
|
||||||
goodput_config_dict: Optional[dict[str, float]] = None,
|
goodput_config_dict: Optional[dict[str, float]] = None,
|
||||||
):
|
):
|
||||||
if backend in ASYNC_REQUEST_FUNCS:
|
if backend in ASYNC_REQUEST_FUNCS:
|
||||||
@ -425,18 +450,17 @@ async def benchmark(
|
|||||||
extra_body = {}
|
extra_body = {}
|
||||||
# Add the schema to the extra_body
|
# Add the schema to the extra_body
|
||||||
extra_body[request.structure_type] = request.schema
|
extra_body[request.structure_type] = request.schema
|
||||||
# Add the specific structured_output_backend
|
|
||||||
extra_body["guided_decoding_backend"] = structured_output_backend
|
|
||||||
return extra_body
|
return extra_body
|
||||||
|
|
||||||
print("Starting initial single prompt test run...")
|
print("Starting initial single prompt test run...")
|
||||||
structured_output_req_idx = random.sample(
|
structured_output_req_idx = random.sample(
|
||||||
range(len(input_requests)),
|
range(len(input_requests)), int(len(input_requests) * structured_output_ratio)
|
||||||
int(len(input_requests) * structured_output_ratio))
|
)
|
||||||
|
|
||||||
test_request = input_requests[0]
|
test_request = input_requests[0]
|
||||||
test_req_extra_body = (prepare_extra_body(test_request)
|
test_req_extra_body = (
|
||||||
if 0 in structured_output_req_idx else None)
|
prepare_extra_body(test_request) if 0 in structured_output_req_idx else None
|
||||||
|
)
|
||||||
test_input = RequestFuncInput(
|
test_input = RequestFuncInput(
|
||||||
model=model_id,
|
model=model_id,
|
||||||
prompt=test_request.prompt,
|
prompt=test_request.prompt,
|
||||||
@ -450,7 +474,8 @@ async def benchmark(
|
|||||||
if not test_output.success:
|
if not test_output.success:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Initial test run failed - Please make sure benchmark arguments "
|
"Initial test run failed - Please make sure benchmark arguments "
|
||||||
f"are correctly specified. Error: {test_output.error}")
|
f"are correctly specified. Error: {test_output.error}"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
print("Initial test run completed. Starting main benchmark run...")
|
print("Initial test run completed. Starting main benchmark run...")
|
||||||
|
|
||||||
@ -469,10 +494,7 @@ async def benchmark(
|
|||||||
if profile_output.success:
|
if profile_output.success:
|
||||||
print("Profiler started")
|
print("Profiler started")
|
||||||
|
|
||||||
if burstiness == 1.0:
|
distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
|
||||||
distribution = "Poisson process"
|
|
||||||
else:
|
|
||||||
distribution = "Gamma distribution"
|
|
||||||
|
|
||||||
print(f"Traffic request rate: {request_rate}")
|
print(f"Traffic request rate: {request_rate}")
|
||||||
print(f"Burstiness factor: {burstiness} ({distribution})")
|
print(f"Burstiness factor: {burstiness} ({distribution})")
|
||||||
@ -484,24 +506,21 @@ async def benchmark(
|
|||||||
# and it will simplify the code in limited_request_func.
|
# and it will simplify the code in limited_request_func.
|
||||||
# semaphore = (asyncio.Semaphore(max_concurrency)
|
# semaphore = (asyncio.Semaphore(max_concurrency)
|
||||||
# if max_concurrency else contextlib.nullcontext())
|
# if max_concurrency else contextlib.nullcontext())
|
||||||
semaphore = (asyncio.Semaphore(max_concurrency)
|
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
|
||||||
if max_concurrency else None)
|
|
||||||
|
|
||||||
async def limited_request_func(request_func_input, pbar):
|
async def limited_request_func(request_func_input, pbar):
|
||||||
if semaphore is None:
|
if semaphore is None:
|
||||||
return await request_func(request_func_input=request_func_input,
|
return await request_func(request_func_input=request_func_input, pbar=pbar)
|
||||||
pbar=pbar)
|
|
||||||
async with semaphore:
|
async with semaphore:
|
||||||
return await request_func(request_func_input=request_func_input,
|
return await request_func(request_func_input=request_func_input, pbar=pbar)
|
||||||
pbar=pbar)
|
|
||||||
|
|
||||||
benchmark_start_time = time.perf_counter()
|
benchmark_start_time = time.perf_counter()
|
||||||
tasks: list[asyncio.Task] = []
|
tasks: list[asyncio.Task] = []
|
||||||
expected: list[str] = []
|
expected: list[str] = []
|
||||||
async for i, request in get_request(input_requests, request_rate,
|
async for i, request in get_request(input_requests, request_rate, burstiness):
|
||||||
burstiness):
|
extra_body = (
|
||||||
extra_body = prepare_extra_body(
|
prepare_extra_body(request) if i in structured_output_req_idx else None
|
||||||
request) if i in structured_output_req_idx else None
|
)
|
||||||
request_func_input = RequestFuncInput(
|
request_func_input = RequestFuncInput(
|
||||||
model=model_id,
|
model=model_id,
|
||||||
prompt=request.prompt,
|
prompt=request.prompt,
|
||||||
@ -514,8 +533,9 @@ async def benchmark(
|
|||||||
expected.append(request.completion)
|
expected.append(request.completion)
|
||||||
tasks.append(
|
tasks.append(
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
limited_request_func(request_func_input=request_func_input,
|
limited_request_func(request_func_input=request_func_input, pbar=pbar)
|
||||||
pbar=pbar)))
|
)
|
||||||
|
)
|
||||||
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
|
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
|
||||||
|
|
||||||
if profile:
|
if profile:
|
||||||
@@ -547,54 +567,58 @@ async def benchmark(
        goodput_config_dict=goodput_config_dict,
    )

    print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
    print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
    print(
        "{:<40} {:<10.2f}".format(
            "Request throughput (req/s):", metrics.request_throughput
        )
    )
    if goodput_config_dict:
        print(
            "{:<40} {:<10.2f}".format(
                "Request goodput (req/s):", metrics.request_goodput
            )
        )
    print(
        "{:<40} {:<10.2f}".format(
            "Output token throughput (tok/s):", metrics.output_throughput
        )
    )
    print(
        "{:<40} {:<10.2f}".format(
            "Total Token throughput (tok/s):", metrics.total_token_throughput
        )
    )

    result = {
        "duration": benchmark_duration,
        "completed": metrics.completed,
        "total_input_tokens": metrics.total_input,
        "total_output_tokens": metrics.total_output,
        "request_throughput": metrics.request_throughput,
        "output_throughput": metrics.output_throughput,
        "total_token_throughput": metrics.total_token_throughput,
        "ttft_description": pd.Series([output.ttft for output in outputs])
        .describe()
        .to_dict(),
        "tpot_description": pd.Series([output.tpot for output in outputs])
        .describe()
        .to_dict(),
        "input_lens": [output.prompt_len for output in outputs],
        "output_lens": actual_output_lens,
        "ttfts": [output.ttft for output in outputs],
        "itls": [output.itl for output in outputs],
        "errors": [output.error for output in outputs],
    }

    ret = [
        {"generated": output.generated_text, "expected": gt}
        for output, gt in zip(outputs, expected)
    ]

    def process_one_metric(
        # E.g., "ttft"
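The `ttft_description` and `tpot_description` fields hold pandas summary statistics. A quick illustration of what `.describe().to_dict()` produces; the latency values below are made up and only the shape of the output matters.

```python
import pandas as pd

ttfts = [0.12, 0.15, 0.11, 0.30, 0.18]  # hypothetical TTFT samples in seconds
summary = pd.Series(ttfts).describe().to_dict()
# Keys: 'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'
print(summary["count"], summary["mean"], summary["50%"])
```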
@@ -608,29 +632,35 @@ async def benchmark(
        # metric.
        if metric_attribute_name not in selected_percentile_metrics:
            return
        print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
        print(
            "{:<40} {:<10.2f}".format(
                f"Mean {metric_name} (ms):",
                getattr(metrics, f"mean_{metric_attribute_name}_ms"),
            )
        )
        print(
            "{:<40} {:<10.2f}".format(
                f"Median {metric_name} (ms):",
                getattr(metrics, f"median_{metric_attribute_name}_ms"),
            )
        )
        result[f"mean_{metric_attribute_name}_ms"] = getattr(
            metrics, f"mean_{metric_attribute_name}_ms"
        )
        result[f"median_{metric_attribute_name}_ms"] = getattr(
            metrics, f"median_{metric_attribute_name}_ms"
        )
        result[f"std_{metric_attribute_name}_ms"] = getattr(
            metrics, f"std_{metric_attribute_name}_ms"
        )
        for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
            p_word = str(int(p)) if int(p) == p else str(p)
            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
            result[f"p{p_word}_{metric_attribute_name}_ms"] = value

    process_one_metric("ttft", "TTFT", "Time to First Token")
    process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
    process_one_metric("itl", "ITL", "Inter-token Latency")
    process_one_metric("e2el", "E2EL", "End-to-end Latency")
@@ -640,13 +670,13 @@ async def benchmark(


def evaluate(ret, args):
    def _eval_correctness_json(expected, actual):
        # extract json string from string using regex
        import regex as re

        actual = actual.replace("\n", "").replace(" ", "").strip()
        try:
            actual = re.search(r"\{.*\}", actual).group()
            actual = json.loads(actual)
        except Exception:
            return False
@@ -657,29 +687,33 @@ def evaluate(ret, args):
        return actual in args.choice

    def _eval_correctness_regex(expected, actual):
        import regex as re

        return re.match(args.regex, actual) is not None

    def _eval_correctness(expected, actual):
        if args.structure_type == "guided_json":
            return _eval_correctness_json(expected, actual)
        elif args.structure_type == "guided_regex":
            return _eval_correctness_regex(expected, actual)
        elif args.structure_type == "guided_choice":
            return _eval_correctness_choice(expected, actual)
        else:
            return None

    scores = []
    for res in ret:
        score = _eval_correctness(res["expected"], res["generated"])
        res["correctness"] = score
        scores.append(score)

    not_none_scores = [score for score in scores if score is not None]

    return (
        (sum(not_none_scores) / len(not_none_scores) * 100)
        if len(not_none_scores) > 0
        else None
    )


def parse_goodput(slo_pairs):
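`_eval_correctness_json` above strips whitespace, pulls the outermost `{...}` span out with a regex, and tries to parse it. A rough sketch of that behavior, using the stdlib `re` module instead of the `regex` package the script imports, and a hypothetical `extract_json` helper name:

```python
import json
import re

def extract_json(actual: str):
    # Mirror of the scorer: drop whitespace, grab the {...} span, parse it.
    actual = actual.replace("\n", "").replace(" ", "").strip()
    try:
        return json.loads(re.search(r"\{.*\}", actual).group())
    except Exception:
        return None

print(extract_json('Sure, here it is:\n{"name": "vllm", "stars": 1}'))
# -> {'name': 'vllm', 'stars': 1}
print(extract_json("no json here"))  # -> None
```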
@@ -691,9 +725,10 @@ def parse_goodput(slo_pairs):
    except ValueError as err:
        raise argparse.ArgumentTypeError(
            "Invalid format found for service level objectives. "
            'Specify service level objectives for goodput as "KEY:VALUE" '
            "pairs, where the key is a metric name, and the value is a "
            "number in milliseconds."
        ) from err
    return goodput_config_dict
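For reference, `--goodput` takes space-separated `KEY:VALUE` pairs in milliseconds. The sketch below shows the parsing implied by the error path above; it is illustrative rather than copied from the script.

```python
slo_pairs = ["ttft:200", "tpot:50", "e2el:3000"]  # e.g. --goodput ttft:200 tpot:50 e2el:3000
goodput_config_dict = {}
for slo_pair in slo_pairs:
    slo_name, slo_val = slo_pair.split(":")
    goodput_config_dict[slo_name] = float(slo_val)
print(goodput_config_dict)  # {'ttft': 200.0, 'tpot': 50.0, 'e2el': 3000.0}
```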
@@ -707,12 +742,14 @@ def check_goodput_args(args):
            raise ValueError(
                f"Invalid metric name found, {slo_name}: {slo_val}. "
                "The service level objective name should be one of "
                f"{str(VALID_NAMES)}. "
            )
        if slo_val < 0:
            raise ValueError(
                f"Invalid value found, {slo_name}: {slo_val}. "
                "The service level objective value should be "
                "non-negative."
            )
    return goodput_config_dict
@@ -738,19 +775,19 @@ def main(args: argparse.Namespace):
        tokenizer_mode=args.tokenizer_mode,
    )

    if args.dataset == "grammar":
        args.structure_type = "guided_grammar"
    elif args.dataset == "regex":
        args.structure_type = "guided_regex"
    elif args.dataset == "choice":
        args.structure_type = "guided_choice"
    else:
        args.structure_type = "guided_json"

    if args.no_structured_output:
        args.structured_output_ratio = 0
    if args.save_results:
        result_file_name = f"{args.structured_output_ratio}guided"
        result_file_name += f"_{backend}"
        result_file_name += f"_{args.request_rate}qps"
        result_file_name += f"_{args.model.split('/')[-1]}"
@@ -778,37 +815,29 @@ def main(args: argparse.Namespace):
            disable_tqdm=args.disable_tqdm,
            profile=args.profile,
            selected_percentile_metrics=args.percentile_metrics.split(","),
            selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
            ignore_eos=args.ignore_eos,
            max_concurrency=args.max_concurrency,
            structured_output_ratio=args.structured_output_ratio,
            goodput_config_dict=goodput_config_dict,
        )
    )

    # Save config and results to json
    score = evaluate(ret, args)
    print("correct_rate(%)", score, "\n")
    if args.save_results:
        results = {
            "backend": backend,
            "model_id": model_id,
            "tokenizer_id": tokenizer_id,
            "num_prompts": args.num_prompts,
            "request_rate": args.request_rate
            if args.request_rate < float("inf")
            else "inf",
            "burstiness": args.burstiness,
            "max_concurrency": args.max_concurrency,
            "correct_rate(%)": score,
        }
        results = {"outputs": ret, **results, **benchmark_result}
@@ -817,13 +846,14 @@ def main(args: argparse.Namespace):
            result_file_name = args.result_filename
        if args.result_dir:
            result_file_name = os.path.join(args.result_dir, result_file_name)
        with open(result_file_name, "w", encoding="utf-8") as outfile:
            json.dump(results, outfile, indent=4)


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description="Benchmark the online serving throughput."
    )
    parser.add_argument(
        "--backend",
        type=str,
@@ -845,16 +875,14 @@ if __name__ == "__main__":
        default="/v1/completions",
        help="API endpoint.",
    )
    parser.add_argument(
        "--dataset",
        default="json",
        choices=["json", "json-unique", "grammar", "regex", "choice", "xgrammar_bench"],
    )
    parser.add_argument(
        "--json-schema-path", type=str, default=None, help="Path to json schema."
    )
    parser.add_argument(
        "--max-concurrency",
        type=int,
@@ -866,7 +894,8 @@ if __name__ == "__main__":
        "initiated, this argument will control how many are actually allowed "
        "to execute at a time. This means that when used in combination, the "
        "actual request rate may be lower than specified with --request-rate, "
        "if the server is not processing requests fast enough to keep up.",
    )
    parser.add_argument(
        "--model",
        type=str,
@@ -876,15 +905,13 @@ if __name__ == "__main__":
    parser.add_argument(
        "--tokenizer",
        type=str,
        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
    parser.add_argument(
        "--tokenizer-mode",
        type=str,
        default="auto",
        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
    parser.add_argument(
        "--num-prompts",
@@ -961,50 +988,51 @@ if __name__ == "__main__":
        "--ignore-eos",
        action="store_true",
        help="Set ignore_eos flag when sending the benchmark request."
        "Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
    )
    parser.add_argument(
        "--percentile-metrics",
        type=str,
        default="ttft,tpot,itl",
        help="Comma-separated list of selected metrics to report percentils. "
        "This argument specifies the metrics to report percentiles. "
        'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
        'Default value is "ttft,tpot,itl".',
    )
    parser.add_argument(
        "--metric-percentiles",
        type=str,
        default="99",
        help="Comma-separated list of percentiles for selected metrics. "
        'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
        'Default value is "99". '
        'Use "--percentile-metrics" to select metrics.',
    )
    parser.add_argument(
        "--goodput",
        nargs="+",
        required=False,
        help='Specify service level objectives for goodput as "KEY:VALUE" '
        "pairs, where the key is a metric name, and the value is in "
        'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
        "separated by spaces. Allowed request level metric names are "
        '"ttft", "tpot", "e2el". For more context on the definition of '
        "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
        "and the blog: https://hao-ai-lab.github.io/blogs/distserve",
    )

    parser.add_argument(
        "--no-structured-output",
        action="store_true",
        default=False,
        help="Whether to disable JSON decoding or not.",
    )
    parser.add_argument(
        "--structured-output-ratio",
        type=float,
        default=1.0,
        help="Ratio of Structured Outputs requests",
    )

    args = parser.parse_args()
    main(args)

@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
"""Benchmark offline inference throughput."""

import argparse
import dataclasses
import json
@@ -11,17 +12,25 @@ from typing import Any, Optional, Union

import torch
import uvloop
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase

from benchmark_dataset import (
    AIMODataset,
    BurstGPTDataset,
    ConversationDataset,
    InstructCoderDataset,
    RandomDataset,
    SampleRequest,
    ShareGPTDataset,
    SonnetDataset,
    VisionArenaDataset,
)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args,
)
from vllm.inputs import TextPrompt, TokensPrompt
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
@@ -36,23 +45,30 @@ def run_vllm(
    disable_detokenize: bool = False,
) -> tuple[float, Optional[list[RequestOutput]]]:
    from vllm import LLM, SamplingParams

    llm = LLM(**dataclasses.asdict(engine_args))
    assert all(
        llm.llm_engine.model_config.max_model_len
        >= (request.prompt_len + request.expected_output_len)
        for request in requests
    ), (
        "Please ensure that max_model_len is greater than the sum of"
        " prompt_len and expected_output_len for all requests."
    )
    # Add the requests to the engine.
    prompts: list[Union[TextPrompt, TokensPrompt]] = []
    sampling_params: list[SamplingParams] = []
    for request in requests:
        prompts.append(
            TokensPrompt(
                prompt_token_ids=request.prompt["prompt_token_ids"],
                multi_modal_data=request.multi_modal_data,
            )
            if "prompt_token_ids" in request.prompt
            else TextPrompt(
                prompt=request.prompt, multi_modal_data=request.multi_modal_data
            )
        )
        sampling_params.append(
            SamplingParams(
                n=n,
@@ -61,7 +77,8 @@ def run_vllm(
                ignore_eos=True,
                max_tokens=request.expected_output_len,
                detokenize=not disable_detokenize,
            )
        )
    lora_requests: Optional[list[LoRARequest]] = None
    if engine_args.enable_lora:
        lora_requests = [request.lora_request for request in requests]
@@ -71,10 +88,9 @@ def run_vllm(
    outputs = None
    if not use_beam_search:
        start = time.perf_counter()
        outputs = llm.generate(
            prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
        )
        end = time.perf_counter()
    else:
        assert lora_requests is None, "BeamSearch API does not support LoRA"
@@ -90,30 +106,35 @@ def run_vllm(
                beam_width=n,
                max_tokens=output_len,
                ignore_eos=True,
            ),
        )
        end = time.perf_counter()
    return end - start, outputs


def run_vllm_chat(
    requests: list[SampleRequest],
    n: int,
    engine_args: EngineArgs,
    disable_detokenize: bool = False,
) -> tuple[float, list[RequestOutput]]:
    """
    Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
    multimodal models as it properly handles multimodal inputs and chat
    formatting. For non-multimodal models, use run_vllm() instead.
    """
    from vllm import LLM, SamplingParams

    llm = LLM(**dataclasses.asdict(engine_args))

    assert all(
        llm.llm_engine.model_config.max_model_len
        >= (request.prompt_len + request.expected_output_len)
        for request in requests
    ), (
        "Please ensure that max_model_len is greater than the sum of "
        "prompt_len and expected_output_len for all requests."
    )

    prompts = []
    sampling_params: list[SamplingParams] = []
@@ -127,7 +148,8 @@ def run_vllm_chat(
                ignore_eos=True,
                max_tokens=request.expected_output_len,
                detokenize=not disable_detokenize,
            )
        )
    start = time.perf_counter()
    outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
    end = time.perf_counter()
@@ -144,13 +166,17 @@ async def run_vllm_async(
    from vllm import SamplingParams

    async with build_async_engine_client_from_engine_args(
        engine_args, disable_frontend_multiprocessing
    ) as llm:
        model_config = await llm.get_model_config()
        assert all(
            model_config.max_model_len
            >= (request.prompt_len + request.expected_output_len)
            for request in requests
        ), (
            "Please ensure that max_model_len is greater than the sum of"
            " prompt_len and expected_output_len for all requests."
        )

        # Add the requests to the engine.
        prompts: list[Union[TextPrompt, TokensPrompt]] = []
@@ -158,11 +184,15 @@ async def run_vllm_async(
        lora_requests: list[Optional[LoRARequest]] = []
        for request in requests:
            prompts.append(
                TokensPrompt(
                    prompt_token_ids=request.prompt["prompt_token_ids"],
                    multi_modal_data=request.multi_modal_data,
                )
                if "prompt_token_ids" in request.prompt
                else TextPrompt(
                    prompt=request.prompt, multi_modal_data=request.multi_modal_data
                )
            )
            sampling_params.append(
                SamplingParams(
                    n=n,
@@ -171,17 +201,16 @@ async def run_vllm_async(
                    ignore_eos=True,
                    max_tokens=request.expected_output_len,
                    detokenize=not disable_detokenize,
                )
            )
            lora_requests.append(request.lora_request)

        generators = []
        start = time.perf_counter()
        for i, (prompt, sp, lr) in enumerate(
            zip(prompts, sampling_params, lora_requests)
        ):
            generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}")
            generators.append(generator)
        all_gens = merge_async_iterators(*generators)
        async for i, res in all_gens:
@@ -200,7 +229,8 @@ def run_hf(
    disable_detokenize: bool = False,
) -> float:
    llm = AutoModelForCausalLM.from_pretrained(
        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
    )
    if llm.config.model_type == "llama":
        # To enable padding in the HF backend.
        tokenizer.pad_token = tokenizer.eos_token
@@ -212,22 +242,26 @@ def run_hf(
    max_prompt_len = 0
    max_output_len = 0
    for i in range(len(requests)):
        prompt = requests[i].prompt
        prompt_len = requests[i].prompt_len
        output_len = requests[i].expected_output_len
        # Add the prompt to the batch.
        batch.append(prompt)
        max_prompt_len = max(max_prompt_len, prompt_len)
        max_output_len = max(max_output_len, output_len)
        if len(batch) < max_batch_size and i != len(requests) - 1:
            # Check if we can add more requests to the batch.
            next_prompt_len = requests[i + 1].prompt_len
            next_output_len = requests[i + 1].expected_output_len
            if (
                max(max_prompt_len, next_prompt_len)
                + max(max_output_len, next_output_len)
            ) <= 2048:
                # We can add more requests to the batch.
                continue

        # Generate the sequences.
        input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
        llm_outputs = llm.generate(
            input_ids=input_ids.cuda(),
            do_sample=True,
@@ -257,6 +291,7 @@ def run_mii(
    output_len: int,
) -> float:
    from mii import client, serve

    llm = serve(model, tensor_parallel=tensor_parallel_size)
    prompts = [request.prompt for request in requests]
@@ -268,8 +303,9 @@ def run_mii(
    return end - start


def save_to_pytorch_benchmark_format(
    args: argparse.Namespace, results: dict[str, Any]
) -> None:
    pt_records = convert_to_pytorch_benchmark_format(
        args=args,
        metrics={
@@ -277,9 +313,9 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
            "tokens_per_second": [results["tokens_per_second"]],
        },
        extra_info={
            k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"]
        },
    )
    if pt_records:
        # Don't use json suffix here as we don't want CI to pick it up
        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
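The `.pytorch.json` filename is derived from `--output-json` by swapping the extension; a small illustration with a hypothetical path:

```python
import os

output_json = "results/throughput_run1.json"  # hypothetical --output-json value
pt_file = f"{os.path.splitext(output_json)[0]}.pytorch.json"
print(pt_file)  # results/throughput_run1.pytorch.json
```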
@@ -300,6 +336,7 @@ def get_requests(args, tokenizer):
        "input_len": args.input_len,
        "output_len": args.output_len,
    }

    if args.dataset_path is None or args.dataset_name == "random":
        sample_kwargs["range_ratio"] = args.random_range_ratio
        sample_kwargs["prefix_len"] = args.prefix_len
@@ -310,25 +347,31 @@ def get_requests(args, tokenizer):
        sample_kwargs["enable_multimodal_chat"] = True
    elif args.dataset_name == "sonnet":
        assert tokenizer.chat_template or tokenizer.default_chat_template, (
            "Tokenizer/model must have chat template for sonnet dataset."
        )
        dataset_cls = SonnetDataset
        sample_kwargs["prefix_len"] = args.prefix_len
        sample_kwargs["return_prompt_formatted"] = True
    elif args.dataset_name == "burstgpt":
        dataset_cls = BurstGPTDataset
    elif args.dataset_name == "hf":
        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = VisionArenaDataset
            common_kwargs["dataset_subset"] = None
            common_kwargs["dataset_split"] = "train"
            sample_kwargs["enable_multimodal_chat"] = True
        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = InstructCoderDataset
            common_kwargs["dataset_split"] = "train"
        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = ConversationDataset
            common_kwargs["dataset_subset"] = args.hf_subset
            common_kwargs["dataset_split"] = args.hf_split
            sample_kwargs["enable_multimodal_chat"] = True
        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = AIMODataset
            common_kwargs["dataset_subset"] = None
            common_kwargs["dataset_split"] = "train"
    else:
        raise ValueError(f"Unknown dataset name: {args.dataset_name}")
    # Remove None values
@@ -343,10 +386,10 @@ def main(args: argparse.Namespace):
    random.seed(args.seed)
    # Sample the requests.
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code
    )
    requests = get_requests(args, tokenizer)
    is_multi_modal = any(request.multi_modal_data is not None for request in requests)
    request_outputs: Optional[list[RequestOutput]] = None
    if args.backend == "vllm":
        if args.async_engine:
@@ -357,23 +400,34 @@ def main(args: argparse.Namespace):
                    AsyncEngineArgs.from_cli_args(args),
                    args.disable_frontend_multiprocessing,
                    args.disable_detokenize,
                )
            )
        else:
            elapsed_time, request_outputs = run_vllm(
                requests,
                args.n,
                EngineArgs.from_cli_args(args),
                args.disable_detokenize,
            )
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(
            requests,
            args.model,
            tokenizer,
            args.n,
            args.hf_max_batch_size,
            args.trust_remote_code,
            args.disable_detokenize,
        )
    elif args.backend == "mii":
        elapsed_time = run_mii(
            requests, args.model, args.tensor_parallel_size, args.output_len
        )
    elif args.backend == "vllm-chat":
        elapsed_time, request_outputs = run_vllm_chat(
            requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
        )
    else:
        raise ValueError(f"Unknown backend: {args.backend}")
@@ -385,28 +439,31 @@ def main(args: argparse.Namespace):
        for ro in request_outputs:
            if not isinstance(ro, RequestOutput):
                continue
            total_prompt_tokens += (
                len(ro.prompt_token_ids) if ro.prompt_token_ids else 0
            )
            total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o)
        total_num_tokens = total_prompt_tokens + total_output_tokens
    else:
        total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests)
        total_output_tokens = sum(r.expected_output_len for r in requests)
        total_prompt_tokens = total_num_tokens - total_output_tokens

    if is_multi_modal and args.backend != "vllm-chat":
        print(
            "\033[91mWARNING\033[0m: Multi-modal request with "
            f"{args.backend} backend detected. The "
            "following metrics are not accurate because image tokens are not"
            " counted. See vllm-project/vllm/issues/9778 for details."
        )
        # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
        # vllm-chat backend counts the image tokens now

    print(
        f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
        f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
        f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
    )
    print(f"Total num prompt tokens: {total_prompt_tokens}")
    print(f"Total num output tokens: {total_output_tokens}")
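The throughput lines printed above are simple ratios over wall-clock time; a worked example with made-up numbers:

```python
num_requests = 1000
elapsed_time = 125.0            # seconds
total_num_tokens = 2_400_000    # prompt + generated tokens
total_output_tokens = 400_000   # generated tokens only

print(f"Throughput: {num_requests / elapsed_time:.2f} requests/s")   # 8.00
print(f"{total_num_tokens / elapsed_time:.2f} total tokens/s")       # 19200.00
print(f"{total_output_tokens / elapsed_time:.2f} output tokens/s")   # 3200.00
```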
@@ -434,7 +491,8 @@ def validate_args(args):
        warnings.warn(
            "The '--dataset' argument will be deprecated in the next release. "
            "Please use '--dataset-name' and '--dataset-path' instead.",
            stacklevel=2,
        )
        args.dataset_path = args.dataset

    if not getattr(args, "tokenizer", None):
@@ -447,9 +505,8 @@ def validate_args(args):

    # === Dataset Configuration ===
    if not args.dataset and not args.dataset_path:
        print("When dataset path is not set, it will default to random dataset")
        args.dataset_name = "random"
        if args.input_len is None:
            raise ValueError("input_len must be provided for a random dataset")
@@ -457,33 +514,55 @@ def validate_args(args):
    # --hf-subset and --hf-split: only used
    # when dataset_name is 'hf'
    if args.dataset_name != "hf" and (
        getattr(args, "hf_subset", None) is not None
        or getattr(args, "hf_split", None) is not None
    ):
        warnings.warn(
            "--hf-subset and --hf-split will be ignored \
                since --dataset-name is not 'hf'.",
            stacklevel=2,
        )
    elif args.dataset_name == "hf":
        if args.dataset_path in (
            VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
            | ConversationDataset.SUPPORTED_DATASET_PATHS
        ):
            assert args.backend == "vllm-chat", (
                f"{args.dataset_path} needs to use vllm-chat as the backend."
            )  # noqa: E501
        elif args.dataset_path in (
            InstructCoderDataset.SUPPORTED_DATASET_PATHS
            | AIMODataset.SUPPORTED_DATASET_PATHS
        ):
            assert args.backend == "vllm", (
                f"{args.dataset_path} needs to use vllm as the backend."
            )  # noqa: E501
        else:
            raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")

    # --random-range-ratio: only used when dataset_name is 'random'
    if args.dataset_name != "random" and args.random_range_ratio is not None:
        warnings.warn(
            "--random-range-ratio will be ignored since \
                --dataset-name is not 'random'.",
            stacklevel=2,
        )

    # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
    # set.
    if (
        args.dataset_name not in {"random", "sonnet", None}
        and args.prefix_len is not None
    ):
        warnings.warn(
            "--prefix-len will be ignored since --dataset-name\
                is not 'random', 'sonnet', or not set.",
            stacklevel=2,
        )

    # === LoRA Settings ===
    if getattr(args, "enable_lora", False) and args.backend != "vllm":
        raise ValueError("LoRA benchmarking is only supported for vLLM backend")
    if getattr(args, "enable_lora", False) and args.lora_path is None:
        raise ValueError("LoRA path must be provided when enable_lora is True")
@@ -493,8 +572,10 @@ def validate_args(args):
    if args.backend != "hf" and args.hf_max_batch_size is not None:
        raise ValueError("HF max batch size is only for HF backend.")

    if (
        args.backend in {"hf", "mii"}
        and getattr(args, "quantization", None) is not None
    ):
        raise ValueError("Quantization is only for vLLM backend.")

    if args.backend == "mii" and args.dtype != "auto":
@@ -502,22 +583,32 @@ def validate_args(args):
    if args.backend == "mii" and args.n != 1:
        raise ValueError("n must be 1 for MII backend.")
    if args.backend == "mii" and args.tokenizer != args.model:
        raise ValueError("Tokenizer must be the same as the model for MII backend.")

    # --data-parallel is not supported currently.
    # https://github.com/vllm-project/vllm/issues/16222
    if args.data_parallel_size > 1:
        raise ValueError(
            "Data parallel is not supported in offline benchmark, \
                please use benchmark serving instead"
        )


if __name__ == "__main__":
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
    parser.add_argument(
        "--backend",
        type=str,
        choices=["vllm", "hf", "mii", "vllm-chat"],
        default="vllm",
    )
    parser.add_argument(
        "--dataset-name",
        type=str,
        choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
        help="Name of the dataset to benchmark on.",
        default="sharegpt",
    )
    parser.add_argument(
        "--dataset",
        type=str,
@@ -525,80 +616,104 @@ if __name__ == "__main__":
        help="Path to the ShareGPT dataset, will be deprecated in\
            the next release. The dataset is expected to "
        "be a json in form of list[dict[..., conversations: "
        "list[dict[..., value: <prompt_or_response>]]]]",
    )
    parser.add_argument(
        "--dataset-path", type=str, default=None, help="Path to the dataset"
    )
    parser.add_argument(
        "--input-len",
        type=int,
        default=None,
        help="Input prompt length for each request",
    )
    parser.add_argument(
        "--output-len",
        type=int,
        default=None,
        help="Output length for each request. Overrides the "
        "output length from the dataset.",
    )
    parser.add_argument(
        "--n", type=int, default=1, help="Number of generated sequences per prompt."
    )
    parser.add_argument(
        "--num-prompts", type=int, default=1000, help="Number of prompts to process."
    )
    parser.add_argument(
        "--hf-max-batch-size",
        type=int,
        default=None,
        help="Maximum batch size for HF backend.",
    )
    parser.add_argument(
        "--output-json",
        type=str,
        default=None,
        help="Path to save the throughput results in JSON format.",
    )
    parser.add_argument(
        "--async-engine",
        action="store_true",
        default=False,
        help="Use vLLM async engine rather than LLM class.",
    )
    parser.add_argument(
        "--disable-frontend-multiprocessing",
        action="store_true",
        default=False,
        help="Disable decoupled async engine frontend.",
    )
    parser.add_argument(
        "--disable-detokenize",
        action="store_true",
        help=(
            "Do not detokenize the response (i.e. do not include "
            "detokenization time in the measurement)"
        ),
    )
    # LoRA
    parser.add_argument(
        "--lora-path",
        type=str,
        default=None,
        help="Path to the LoRA adapters to use. This can be an absolute path, "
        "a relative path, or a Hugging Face model identifier.",
    )
    parser.add_argument(
        "--prefix-len",
        type=int,
        default=None,
        help=f"Number of prefix tokens to be used in RandomDataset "
        "and SonnetDataset. For RandomDataset, the total input "
        "length is the sum of prefix-len (default: "
        f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
        "sampled from [input_len * (1 - range_ratio), "
        "input_len * (1 + range_ratio)]. For SonnetDataset, "
        f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
        "controls how much of the input is fixed lines versus "
        "random lines, but the total input length remains approximately "
        "input_len tokens.",
    )
    # random dataset
    parser.add_argument(
        "--random-range-ratio",
        type=float,
        default=None,
        help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) "
        "for sampling input/output length, "
        "used only for RandomDataset. Must be in the range [0, 1) to "
        "define a symmetric sampling range "
        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
    )

    # hf dtaset
    parser.add_argument(
        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
    )
    parser.add_argument(
        "--hf-split", type=str, default=None, help="Split of the HF dataset."
    )
|
|
||||||
|
|
||||||
parser = AsyncEngineArgs.add_cli_args(parser)
|
parser = AsyncEngineArgs.add_cli_args(parser)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
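The help text above describes a symmetric sampling window for input/output lengths. As a quick illustration (not code from this repository; the function name and values are placeholders), the mapping from a base length and range_ratio to a sampled length looks roughly like this:

# Illustrative sketch only: how a symmetric range_ratio maps to sampled lengths,
# per the --random-range-ratio help text above.
import random

def sample_length(base_len: int, range_ratio: float) -> int:
    # range_ratio must be in [0, 1): sample uniformly from
    # [base_len * (1 - range_ratio), base_len * (1 + range_ratio)]
    low = int(base_len * (1 - range_ratio))
    high = int(base_len * (1 + range_ratio))
    return random.randint(low, high)

# e.g. base_len=1024, range_ratio=0.25 -> lengths in [768, 1280]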
@ -7,9 +7,9 @@ import os
from typing import Any


def convert_to_pytorch_benchmark_format(
    args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any]
) -> list:
    """
    Save the benchmark results in the format used by PyTorch OSS benchmark with
    one metric per record
@ -37,12 +37,12 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
            },
        }

        tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
        # Save tensor_parallel_size parameter if it's part of the metadata
        if not tp and "tensor_parallel_size" in extra_info:
            record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = (
                extra_info["tensor_parallel_size"]
            )

        records.append(record)

@ -50,7 +50,6 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,


class InfEncoder(json.JSONEncoder):
    def clear_inf(self, o: Any):
        if isinstance(o, dict):
            return {k: self.clear_inf(v) for k, v in o.items()}
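A hedged usage sketch of the two helpers above; the metric name, file name, and the exact call sites in the benchmark scripts are assumptions, not taken from this diff.

# Hedged usage sketch: emit PyTorch-OSS-format records and write them as JSON,
# letting InfEncoder sanitize any non-finite values. Names here are illustrative.
import json

records = convert_to_pytorch_benchmark_format(
    args=args,                                # argparse.Namespace from the benchmark
    metrics={"requests_per_second": [12.3]},  # one metric per record
    extra_info={"tensor_parallel_size": 1},
)
with open("benchmark_results.pytorch.json", "w") as f:
    json.dump(records, f, cls=InfEncoder)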
@ -23,8 +23,9 @@ DEFAULT_TP_SIZES = [1]


# bench
def bench_fn(
    label: str, sub_label: str, description: str, fn: Callable, *args, **kwargs
) -> TMeasurement:
    min_run_time = 1

    globals = {
@ -41,16 +42,18 @@ def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
    ).blocked_autorange(min_run_time=min_run_time)


def bench_int8(
    dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str
) -> Iterable[TMeasurement]:
    assert dtype == torch.int8
    b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16)

    out = ops.cutlass_scaled_sparse_mm(
        a, b_compressed, e, scale_a, scale_b, torch.bfloat16
    )
    out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)

    if not torch.allclose(out, out_ref):
@ -63,54 +66,107 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
    timers = []
    # pytorch impl - bfloat16
    timers.append(
        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
                 torch.mm, a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16))
    )

    # pytorch impl - float16
    timers.append(
        bench_fn(label, sub_label, "pytorch_fp16_fp16_fp16_matmul-no-scales",
                 torch.mm, a.to(dtype=torch.float16), b.to(dtype=torch.float16))
    )

    # cutlass impl
    timers.append(
        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16)
    )

    # cutlass with bias
    timers.append(
        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16, bias)
    )

    # cutlass sparse impl
    timers.append(
        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm",
                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, scale_b,
                 torch.bfloat16)
    )

    # cutlass sparse with bias
    timers.append(
        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm_bias",
                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, scale_b,
                 torch.bfloat16, bias)
    )

    return timers


def bench_fp8(
    dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str
) -> Iterable[TMeasurement]:
    assert dtype == torch.float8_e4m3fn
    b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16)

    out = ops.cutlass_scaled_sparse_mm(
        a, b_compressed, e, scale_a, scale_b, torch.bfloat16
    )
    out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)

    if not torch.allclose(out, out_ref):
@ -124,97 +180,165 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,

    # pytorch impl w. bf16
    timers.append(
        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
                 torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
                 b.to(dtype=torch.bfloat16, device="cuda"))
    )

    # pytorch impl: bf16 output, without fp8 fast accum
    timers.append(
        bench_fn(label, sub_label, "pytorch_fp8_fp8_bf16_scaled_mm",
                 torch._scaled_mm, a, b, scale_a=scale_a, scale_b=scale_b,
                 out_dtype=torch.bfloat16)
    )

    # pytorch impl: bf16 output, with fp8 fast accum
    timers.append(
        bench_fn(label, sub_label, "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
                 torch._scaled_mm, a, b, scale_a=scale_a, scale_b=scale_b,
                 out_dtype=torch.bfloat16, use_fast_accum=True)
    )

    # pytorch impl: fp16 output, without fp8 fast accum
    timers.append(
        bench_fn(label, sub_label, "pytorch_fp8_fp8_fp16_scaled_mm",
                 torch._scaled_mm, a, b, scale_a=scale_a, scale_b=scale_b,
                 out_dtype=torch.float16)
    )

    # pytorch impl: fp16 output, with fp8 fast accum
    timers.append(
        bench_fn(label, sub_label, "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
                 torch._scaled_mm, a, b, scale_a=scale_a, scale_b=scale_b,
                 out_dtype=torch.float16, use_fast_accum=True)
    )

    # cutlass impl: bf16 output
    timers.append(
        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16)
    )

    # cutlass sparse impl: bf16 output
    timers.append(
        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_sparse_mm",
                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, scale_b,
                 torch.bfloat16)
    )

    # cutlass sparse impl: fp16 output
    timers.append(
        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_sparse_mm",
                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, scale_b,
                 torch.float16)
    )

    # cutlass sparse impl: bf16 output, with bias
    timers.append(
        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias",
                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, scale_b,
                 torch.bfloat16, bias)
    )

    # cutlass sparse impl: fp16 output, with bias
    timers.append(
        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias",
                 ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, scale_b,
                 torch.float16, bias.to(dtype=torch.float16))
    )

    return timers


def bench(
    dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str
) -> Iterable[TMeasurement]:
    if dtype == torch.int8:
        return bench_int8(dtype, m, k, n, label, sub_label)
    if dtype == torch.float8_e4m3fn:
@ -228,12 +352,12 @@ def print_timers(timers: Iterable[TMeasurement]):
    compare.print()


def run(
    dtype: torch.dtype, MKNs: Iterable[tuple[int, int, int]]
) -> Iterable[TMeasurement]:
    results = []
    for m, k, n in MKNs:
        timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", f"MKN=({m}x{k}x{n})")
        print_timers(timers)
        results.extend(timers)

@ -241,10 +365,12 @@ def run(dtype: torch.dtype,


# output makers
def make_output(
    data: Iterable[TMeasurement],
    MKNs: Iterable[tuple[int, int, int]],
    base_description: str,
    timestamp=None,
):
    print(f"== All Results {base_description} ====")
    print_timers(data)

@ -258,8 +384,7 @@ def make_output(data: Iterable[TMeasurement],


def run_square_bench(args):
    dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment))
    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
    data = run(args.dtype, MKNs)

@ -319,7 +444,7 @@ def run_model_bench(args):
        pkl.dump(all_data, f)


if __name__ == "__main__":

    def to_torch_dtype(dt):
        if dt == "int8":
@ -344,12 +469,15 @@ Benchmark Cutlass GEMM.
Output:
- a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
        """,  # noqa: E501
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        "--dtype",
        type=to_torch_dtype,
        required=True,
        help="Available options are ['int8', 'fp8']",
    )
    subparsers = parser.add_subparsers(dest="cmd")

    square_parser = subparsers.add_parser("square_bench")
@ -368,19 +496,19 @@ Benchmark Cutlass GEMM.
    range_parser.set_defaults(func=run_range_bench)

    model_parser = subparsers.add_parser("model_bench")
    model_parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=DEFAULT_MODELS,
        choices=WEIGHT_SHAPES.keys(),
    )
    model_parser.add_argument(
        "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES
    )
    model_parser.add_argument(
        "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
    )
    model_parser.set_defaults(func=run_model_bench)

    args = parser.parse_args()
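The bench_fn helper above wraps torch.utils.benchmark. As a minimal self-contained sketch of the same timing pattern (kernel, shapes, and labels here are placeholders, not values from the benchmark):

# Minimal sketch of the torch.utils.benchmark pattern that bench_fn wraps.
import torch
import torch.utils.benchmark as TBenchmark

def time_fn(label, sub_label, description, fn, *args, **kwargs):
    timer = TBenchmark.Timer(
        stmt="fn(*args, **kwargs)",
        globals={"fn": fn, "args": args, "kwargs": kwargs},
        label=label,
        sub_label=sub_label,
        description=description,
    )
    # run repeatedly until at least 1 second of measurement has accumulated
    return timer.blocked_autorange(min_run_time=1)

m = time_fn("demo", "MKN=(16x16x16)", "torch_mm",
            torch.mm, torch.randn(16, 16), torch.randn(16, 16))
print(m)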
@ -10,8 +10,9 @@ import vllm._custom_ops as ops

def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
    finfo = torch.finfo(torch.float8_e4m3fn)
    return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to(
        dtype=torch.float8_e4m3fn
    )


def to_int8(tensor: torch.Tensor) -> torch.Tensor:
@ -26,10 +27,11 @@ def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
    return tensor.to(dtype=torch.float16)


def make_rand_tensors(
    dtype: torch.dtype, m: int, n: int, k: int
) -> tuple[torch.Tensor, torch.Tensor]:
    a = torch.randn((m, k), device="cuda") * 5
    b = torch.randn((n, k), device="cuda").t() * 5

    if dtype == torch.int8:
        return to_int8(a), to_int8(b)
@ -49,9 +51,7 @@ def prune_to_2_4(tensor):

    # Create binary mask
    mask = torch.zeros_like(reshaped)
    mask.scatter_(dim=1, index=indices, src=torch.ones_like(indices, dtype=mask.dtype))

    # Apply mask and reshape back
    pruned = reshaped * mask
@ -62,10 +62,11 @@ def prune_to_2_4(tensor):
    return pruned.reshape(original_shape)


def make_rand_sparse_tensors(
    dtype: torch.dtype, m: int, n: int, k: int
) -> tuple[torch.Tensor, torch.Tensor]:
    a = torch.randn((m, k), device="cuda") * 5
    b = torch.randn((n, k), device="cuda").t() * 5

    b = prune_to_2_4(b.t()).t()

@ -86,9 +87,9 @@ def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int,
    return b_compressed, e, a, b


def make_n_rand_sparse_tensors(
    num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int
) -> tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
    ABs = []
    for _ in range(num_tensors):
        b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
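prune_to_2_4 above enforces 2:4 structured sparsity: in every group of four consecutive values, the two with the largest magnitude are kept and the other two are zeroed. A small standalone sketch of that idea (not the file's exact helper; the function and variable names are ours):

# Standalone 2:4 pruning sketch: keep the top-2 magnitudes in each group of 4.
import torch

def prune_2_4_demo(t: torch.Tensor) -> torch.Tensor:
    groups = t.reshape(-1, 4)
    # indices of the two largest-magnitude entries in each group of four
    _, idx = torch.topk(groups.abs(), k=2, dim=1)
    mask = torch.zeros_like(groups)
    mask.scatter_(dim=1, index=idx, src=torch.ones_like(idx, dtype=mask.dtype))
    return (groups * mask).reshape(t.shape)

x = torch.tensor([[0.1, -2.0, 0.3, 4.0, 1.0, 0.0, -0.5, 0.2]])
print(prune_2_4_demo(x))  # -> [[0.0, -2.0, 0.0, 4.0, 1.0, 0.0, -0.5, 0.0]]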
@ -16,7 +16,8 @@ from weight_shapes import WEIGHT_SHAPES

from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    w8a8_block_fp8_matmul,
)
from vllm.utils import FlexibleArgumentParser

DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
@ -25,8 +26,9 @@ DEFAULT_TP_SIZES = [1]


# bench
def bench_fn(
    label: str, sub_label: str, description: str, fn: Callable, *args, **kwargs
) -> TMeasurement:
    min_run_time = 1

    globals = {
@ -44,45 +46,48 @@ def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,


def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str,
               bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
    """Benchmark INT8-based kernels."""
    assert dtype == torch.int8
    a, b = make_rand_tensors(torch.int8, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16)
    azp = torch.zeros((m,), device="cuda", dtype=torch.int32)
    azp_adj = torch.zeros((n,), device="cuda", dtype=torch.int32)

    bench_fns = {
        "pytorch_bf16_bf16_bf16_matmul-no-scales": lambda: torch.mm(
            a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)),
        "pytorch_fp16_fp16_fp16_matmul-no-scales": lambda: torch.mm(
            a.to(dtype=torch.float16), b.to(dtype=torch.float16)),
        "cutlass_i8_i8_bf16_scaled_mm": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.bfloat16),
        "cutlass_i8_i8_bf16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.bfloat16, bias),
        "cutlass_i8_i8_bf16_scaled_mm_azp": lambda: ops.cutlass_scaled_mm_azp(
            a, b, scale_a, scale_b, torch.bfloat16, azp_adj),
        "cutlass_i8_i8_bf16_scaled_mm_azp_bias": lambda: ops.cutlass_scaled_mm_azp(
            a, b, scale_a, scale_b, torch.bfloat16, azp_adj, None, bias),
        "cutlass_i8_i8_bf16_scaled_mm_azp_pt": lambda: ops.cutlass_scaled_mm_azp(
            a, b, scale_a, scale_b, torch.bfloat16, azp_adj, azp),
        "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias": lambda: ops.cutlass_scaled_mm_azp(
            a, b, scale_a, scale_b, torch.bfloat16, azp_adj, azp, bias),
    }

    timers = []
@ -96,73 +101,73 @@ def bench_int8(


def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str,
              bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
    """Benchmark FP8-based kernels."""
    assert dtype == torch.float8_e4m3fn
    a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
    a_cont = a.contiguous()
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)

    def ceil_div(x: int, y: int) -> int:
        return (x + y - 1) // y

    block_scale_a = torch.rand(
        (m, ceil_div(k, 128)), device="cuda", dtype=torch.float32
    )
    block_scale_b = torch.rand(
        ceil_div(k, 128), ceil_div(n, 128), device="cuda", dtype=torch.float32
    )
    block_scale_a_M_major = block_scale_a.t().contiguous().t()
    block_scale_b_K_major = block_scale_b.t().contiguous().t()
    bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16)

    print(m, k, n)

    bench_fns = {
        "pytorch_bf16_bf16_bf16_matmul-no-scales": lambda: torch.mm(
            a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)),
        "pytorch_fp16_fp16_fp16_matmul-no-scales": lambda: torch.mm(
            a.to(dtype=torch.float16), b.to(dtype=torch.float16)),
        "pytorch_fp8_fp8_fp16_scaled_mm": lambda: torch._scaled_mm(
            a, b, scale_a, scale_b, out_dtype=torch.float16),
        "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum": lambda: torch._scaled_mm(
            a, b, scale_a, scale_b, out_dtype=torch.float16, use_fast_accum=True),
        "pytorch_fp8_fp8_bf16_scaled_mm": lambda: torch._scaled_mm(
            a, b, scale_a, scale_b, out_dtype=torch.bfloat16),
        "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum": lambda: torch._scaled_mm(
            a, b, scale_a, scale_b, out_dtype=torch.bfloat16, use_fast_accum=True),
        "cutlass_fp8_fp8_bf16_scaled_mm": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.bfloat16),
        "cutlass_fp8_fp8_fp16_scaled_mm": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.float16),
        "cutlass_fp8_fp8_bf16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.bfloat16, bias),
        "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)),
        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
            a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)),
        "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
            a, b, block_scale_a_M_major, block_scale_b_K_major, torch.float16),
    }

    timers = []
@ -175,13 +180,15 @@ def bench_fp8(
    return timers


def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str,
          bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
    if dtype == torch.int8:
        return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
    if dtype == torch.float8_e4m3fn:
@ -195,27 +202,33 @@ def print_timers(timers: Iterable[TMeasurement]):
    compare.print()


def run(dtype: torch.dtype, MKNs: Iterable[tuple[int, int, int]],
        bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
    results = []
    for m, k, n in MKNs:
        timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", f"MKN=({m}x{k}x{n})",
                       bench_kernels=bench_kernels)
        print_timers(timers)
        results.extend(timers)
    return results


def make_output(
    data: Iterable[TMeasurement],
    MKNs: Iterable[tuple[int, int, int]],
    base_description: str,
    timestamp=None,
):
    print(f"== All Results {base_description} ====")
    print_timers(data)

@ -226,8 +239,7 @@ def make_output(data: Iterable[TMeasurement],


def run_square_bench(args):
    dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment))
    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
    data = run(args.dtype, MKNs, bench_kernels=args.kernels)
    make_output(data, MKNs, f"square_bench-{args.dtype}")
@ -285,7 +297,7 @@ def run_model_bench(args):
        pkl.dump(all_data, f)


if __name__ == "__main__":

    def to_torch_dtype(dt):
        if dt == "int8":
@ -310,19 +322,21 @@ Benchmark Cutlass GEMM.
Output:
- a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
        """,  # noqa: E501
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        "--dtype",
        type=to_torch_dtype,
        required=True,
        help="Available options are ['int8', 'fp8']",
    )
    parser.add_argument(
        "--kernels",
        nargs="+",
        type=str,
        default=None,
        help="Exact names of the kernels to benchmark. If not set, runs all kernels.",
    )

    subparsers = parser.add_subparsers(dest="cmd")
@ -343,19 +357,19 @@ Benchmark Cutlass GEMM.
    range_parser.set_defaults(func=run_range_bench)

    model_parser = subparsers.add_parser("model_bench")
    model_parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=DEFAULT_MODELS,
        choices=WEIGHT_SHAPES.keys(),
    )
    model_parser.add_argument(
        "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES
    )
    model_parser.add_argument(
        "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
    )
    model_parser.set_defaults(func=run_model_bench)

    args = parser.parse_args()
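The fp8 path above sizes its 128x128 block scales with ceil_div so that dimensions that are not multiples of 128 still get one scale per block. A small worked example of that arithmetic (shapes are illustrative):

# Why the block-scale shapes use ceil_div: with K=300 and 128-wide blocks,
# K spans 3 blocks, not 300 // 128 == 2.
def ceil_div(x: int, y: int) -> int:
    return (x + y - 1) // y

m, k, n = 16, 300, 500
block = 128
scale_a_shape = (m, ceil_div(k, block))                    # one scale per (row, K-block)
scale_b_shape = (ceil_div(k, block), ceil_div(n, block))   # one per (K-block, N-block)
print(scale_a_shape, scale_b_shape)  # (16, 3) (3, 4)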
@ -42,4 +42,4 @@ WEIGHT_SHAPES = {
        ([8192, 57344], 1),
        ([28672, 8192], 0),
    ],
}
@ -12,39 +12,37 @@ app = Quart(__name__)

async def forward_request(url, data):
    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
        async with session.post(url=url, json=data, headers=headers) as response:
            if response.status == 200:
                # if response.headers.get('Transfer-Encoding') == 'chunked':
                if True:
                    async for chunk_bytes in response.content.iter_chunked(1024):
                        yield chunk_bytes
                else:
                    content = await response.read()
                    yield content


@app.route("/v1/completions", methods=["POST"])
async def handle_request():
    try:
        original_request_data = await request.get_json()

        prefill_request = original_request_data.copy()
        # change max_tokens = 1 to let it only do prefill
        prefill_request["max_tokens"] = 1

        # finish prefill
        async for _ in forward_request(
            "http://localhost:8100/v1/completions", prefill_request
        ):
            continue

        # return decode
        generator = forward_request(
            "http://localhost:8200/v1/completions", original_request_data
        )
        response = await make_response(generator)
        response.timeout = None

@ -53,11 +51,12 @@ async def handle_request():
    except Exception as e:
        import sys
        import traceback

        exc_info = sys.exc_info()
        print("Error occurred in disagg prefill proxy server")
        print(e)
        print("".join(traceback.format_exception(*exc_info)))


if __name__ == "__main__":
    app.run(port=8000)
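The proxy above listens on port 8000, first forwards the request to the prefill server on 8100 with max_tokens set to 1, then streams the decode output from 8200 back to the caller. A hedged client sketch for exercising it; the model name and prompt are placeholders, not values from this repository:

# Hedged client sketch: POST an OpenAI-style completion request to the proxy
# on port 8000 and stream the response bytes back.
import asyncio
import aiohttp

async def main():
    payload = {"model": "my-model", "prompt": "Hello", "max_tokens": 16}
    async with aiohttp.ClientSession() as session:
        async with session.post(
            "http://localhost:8000/v1/completions", json=payload
        ) as resp:
            async for chunk in resp.content.iter_chunked(1024):
                print(chunk.decode(errors="replace"), end="")

asyncio.run(main())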
@ -8,7 +8,6 @@ from aiohttp import web


class RoundRobinProxy:
    def __init__(self, target_ports):
        self.target_ports = target_ports
        self.port_cycle = itertools.cycle(self.target_ports)
@ -21,14 +20,15 @@ class RoundRobinProxy:
            try:
                # Forward the request
                async with session.request(
                    method=request.method,
                    url=target_url,
                    headers=request.headers,
                    data=request.content,
                ) as response:
                    # Start sending the response
                    resp = web.StreamResponse(
                        status=response.status, headers=response.headers
                    )
                    await resp.prepare(request)

                    # Stream the response content
@ -45,11 +45,11 @@ class RoundRobinProxy:
async def main():
    proxy = RoundRobinProxy([8100, 8200])
    app = web.Application()
    app.router.add_route("*", "/{path:.*}", proxy.handle_request)

    runner = web.AppRunner(app)
    await runner.setup()
    site = web.TCPSite(runner, "localhost", 8000)
    await site.start()

    print("Proxy server started on http://localhost:8000")
@ -58,5 +58,5 @@ async def main():
    await asyncio.Event().wait()


if __name__ == "__main__":
    asyncio.run(main())
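The proxy above alternates between its backend ports using itertools.cycle, so successive requests are spread round-robin across the targets. A tiny illustration of that selection logic:

# Round-robin port selection as used by the proxy above.
import itertools

port_cycle = itertools.cycle([8100, 8200])
print([next(port_cycle) for _ in range(5)])  # [8100, 8200, 8100, 8200, 8100]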
@ -6,43 +6,41 @@ import matplotlib.pyplot as plt
import pandas as pd

if __name__ == "__main__":
    data = []
    for name in ["disagg_prefill", "chunked_prefill"]:
        for qps in [2, 4, 6, 8]:
            with open(f"results/{name}-qps-{qps}.json") as f:
                x = json.load(f)
                x["name"] = name
                x["qps"] = qps
                data.append(x)

    df = pd.DataFrame.from_dict(data)
    dis_df = df[df["name"] == "disagg_prefill"]
    chu_df = df[df["name"] == "chunked_prefill"]

    plt.style.use("bmh")
    plt.rcParams["font.size"] = 20

    for key in [
        "mean_ttft_ms",
        "median_ttft_ms",
        "p99_ttft_ms",
        "mean_itl_ms",
        "median_itl_ms",
        "p99_itl_ms",
    ]:
        fig, ax = plt.subplots(figsize=(11, 7))
        plt.plot(
            dis_df["qps"], dis_df[key], label="disagg_prefill", marker="o", linewidth=4
        )
        plt.plot(
            chu_df["qps"], chu_df[key], label="chunked_prefill", marker="o", linewidth=4
        )
        ax.legend()

        ax.set_xlabel("QPS")
        ax.set_ylabel(key)
        ax.set_ylim(bottom=0)
        fig.savefig(f"results/{key}.png")
        plt.close(fig)
|||||||
@ -24,10 +24,12 @@ class bench_params_t:
|
|||||||
dtype: torch.dtype
|
dtype: torch.dtype
|
||||||
|
|
||||||
def description(self):
|
def description(self):
|
||||||
return (f'N {self.num_tokens} '
|
return (
|
||||||
f'x D {self.hidden_size} '
|
f"N {self.num_tokens} "
|
||||||
f'x R {self.add_residual} '
|
f"x D {self.hidden_size} "
|
||||||
f'x DT {self.dtype}')
|
f"x R {self.add_residual} "
|
||||||
|
f"x DT {self.dtype}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_bench_params() -> list[bench_params_t]:
|
def get_bench_params() -> list[bench_params_t]:
|
||||||
@ -38,15 +40,19 @@ def get_bench_params() -> list[bench_params_t]:
|
|||||||
DTYPES = [torch.bfloat16, torch.float]
|
DTYPES = [torch.bfloat16, torch.float]
|
||||||
|
|
||||||
combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
|
combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
|
||||||
bench_params = list(map(lambda x: \
|
bench_params = list(
|
||||||
bench_params_t(x[0], x[1], x[2], x[3]), combinations))
|
map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations)
|
||||||
|
)
|
||||||
return bench_params
|
return bench_params
|
||||||
|
|
||||||
|
|
||||||
# Reference impls
|
# Reference impls
|
||||||
def unfused_int8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor,
|
def unfused_int8_impl(
|
||||||
residual: Optional[torch.Tensor],
|
rms_norm_layer: RMSNorm,
|
||||||
quant_dtype: torch.dtype):
|
x: torch.Tensor,
|
||||||
|
residual: Optional[torch.Tensor],
|
||||||
|
quant_dtype: torch.dtype,
|
||||||
|
):
|
||||||
# Norm
|
# Norm
|
||||||
torch_out = None
|
torch_out = None
|
||||||
if residual is None:
|
if residual is None:
|
||||||
@ -58,9 +64,12 @@ def unfused_int8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor,
|
|||||||
torch_out, _, _ = ops.scaled_int8_quant(torch_out)
|
torch_out, _, _ = ops.scaled_int8_quant(torch_out)
|
||||||
|
|
||||||
|
|
||||||
def unfused_fp8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor,
|
def unfused_fp8_impl(
|
||||||
residual: Optional[torch.Tensor],
|
rms_norm_layer: RMSNorm,
|
||||||
quant_dtype: torch.dtype):
|
x: torch.Tensor,
|
||||||
|
residual: Optional[torch.Tensor],
|
||||||
|
quant_dtype: torch.dtype,
|
||||||
|
):
|
||||||
# Norm
|
# Norm
|
||||||
torch_out = None
|
torch_out = None
|
||||||
if residual is None:
|
if residual is None:
|
||||||
@ -73,22 +82,27 @@ def unfused_fp8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor,
|
|||||||
|
|
||||||
|
|
||||||
def fused_impl(
|
def fused_impl(
|
||||||
rms_norm_layer: RMSNorm, # this stores the weights
|
rms_norm_layer: RMSNorm, # this stores the weights
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
residual: Optional[torch.Tensor],
|
residual: Optional[torch.Tensor],
|
||||||
quant_dtype: torch.dtype):
|
quant_dtype: torch.dtype,
|
||||||
out, _ = ops.rms_norm_dynamic_per_token_quant(x,
|
):
|
||||||
rms_norm_layer.weight,
|
out, _ = ops.rms_norm_dynamic_per_token_quant(
|
||||||
1e-6,
|
x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual
|
||||||
quant_dtype,
|
)
|
||||||
residual=residual)
|
|
||||||
|
|
||||||
|
|
||||||
# Bench functions
|
# Bench functions
|
||||||
def bench_fn(rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor,
|
def bench_fn(
|
||||||
quant_dtype: torch.dtype, label: str, sub_label: str,
|
rms_norm_layer: RMSNorm,
|
||||||
fn: Callable, description: str) -> TMeasurement:
|
x: torch.Tensor,
|
||||||
|
residual: torch.Tensor,
|
||||||
|
quant_dtype: torch.dtype,
|
||||||
|
label: str,
|
||||||
|
sub_label: str,
|
||||||
|
fn: Callable,
|
||||||
|
description: str,
|
||||||
|
) -> TMeasurement:
|
||||||
min_run_time = 1
|
min_run_time = 1
|
||||||
|
|
||||||
globals = {
|
globals = {
|
||||||
@@ -106,43 +120,81 @@ def bench_fn(rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor,
        description=description,
    ).blocked_autorange(min_run_time=min_run_time)


-def bench(params: bench_params_t, label: str, sub_label: str) \
-        -> Iterable[TMeasurement]:
+def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasurement]:
    # Make inputs
    layer = RMSNorm(params.hidden_size, 1e-6).to(dtype=params.dtype)
    # Make weights
    layer.weight.data.normal_(mean=1.0, std=0.1)
    # Make inputs
    scale = 1 / params.hidden_size
-    x = torch.randn(params.num_tokens,
-                    params.hidden_size,
-                    dtype=params.dtype,
-                    device='cuda') * scale
-    residual = (torch.randn_like(x) * scale).to(device='cuda') \
-        if params.add_residual else None
+    x = (
+        torch.randn(
+            params.num_tokens, params.hidden_size, dtype=params.dtype, device="cuda"
+        )
+        * scale
+    )
+    residual = (
+        (torch.randn_like(x) * scale).to(device="cuda") if params.add_residual else None
+    )

    timers = []

    # unfused int8 impl.
    timers.append(
-        bench_fn(layer, x, residual, torch.int8, label, sub_label,
-                 unfused_int8_impl, "unfused_int8_impl"))
+        bench_fn(
+            layer,
+            x,
+            residual,
+            torch.int8,
+            label,
+            sub_label,
+            unfused_int8_impl,
+            "unfused_int8_impl",
+        )
+    )

    # unfused fp8 impl.
    timers.append(
-        bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label,
-                 unfused_fp8_impl, "unfused_fp8_impl"))
+        bench_fn(
+            layer,
+            x,
+            residual,
+            torch.float8_e4m3fn,
+            label,
+            sub_label,
+            unfused_fp8_impl,
+            "unfused_fp8_impl",
+        )
+    )

    # fused int8 impl.
    timers.append(
-        bench_fn(layer, x, residual, torch.int8, label, sub_label, fused_impl,
-                 "fused_int8_impl"))
+        bench_fn(
+            layer,
+            x,
+            residual,
+            torch.int8,
+            label,
+            sub_label,
+            fused_impl,
+            "fused_int8_impl",
+        )
+    )

    # fused fp8 impl.
    timers.append(
-        bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label,
-                 fused_impl, "fused_fp8_impl"))
+        bench_fn(
+            layer,
+            x,
+            residual,
+            torch.float8_e4m3fn,
+            label,
+            sub_label,
+            fused_impl,
+            "fused_fp8_impl",
+        )
+    )

    print_timers(timers)

@@ -157,13 +209,12 @@ def print_timers(timers: Iterable[TMeasurement]):


def main():
-    torch.set_default_device('cuda')
+    torch.set_default_device("cuda")
    bench_params = get_bench_params()

    timers = []
    for bp in tqdm(bench_params):
-        timers.extend(
-            bench(bp, "rms-norm-dynamic-per-token-quant", bp.description()))
+        timers.extend(bench(bp, "rms-norm-dynamic-per-token-quant", bp.description()))
    print_timers(timers)

    # pickle all the results
@@ -172,5 +223,5 @@ def main():
        pkl.dump(timers, f)


-if __name__ == '__main__':
+if __name__ == "__main__":
    main()
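The bench_fn helper above is a thin wrapper around PyTorch's benchmarking utilities: the label/sub_label/description strings and a globals dict are handed to a timer whose blocked_autorange(min_run_time=1) call produces the TMeasurement objects collected by bench(). A minimal sketch of that pattern, assuming torch.utils.benchmark is the backing module (the name time_callable and its arguments are illustrative, not from the diff):

import torch.utils.benchmark as TBenchmark

def time_callable(fn, fn_args, label, sub_label, description):
    # Repeatedly runs fn(*fn_args) for at least min_run_time seconds and
    # returns a torch.utils.benchmark.Measurement.
    timer = TBenchmark.Timer(
        stmt="fn(*fn_args)",
        globals={"fn": fn, "fn_args": fn_args},
        label=label,
        sub_label=sub_label,
        description=description,
    )
    return timer.blocked_autorange(min_run_time=1)

Measurements gathered this way can be tabulated side by side with TBenchmark.Compare(timers).print(), which is presumably what print_timers does.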
222  benchmarks/kernels/bench_fp8_gemm.py  Normal file
@@ -0,0 +1,222 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import copy
import itertools

import torch
import triton
from weight_shapes import WEIGHT_SHAPES

from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
        x_log=False,
        line_arg="provider",
        line_vals=[
            "torch-bf16",
            # "fp8-tensor-w-token-a",
            "fp8-tensor-w-tensor-a",
            "fp8-channel-w-token-a",
            # "fp8-channel-w-tensor-a",
            # "fp8-tensor-w-token-a-noquant",
            "fp8-tensor-w-tensor-a-noquant",
            "fp8-channel-w-token-a-noquant",
            # "fp8-channel-w-tensor-a-noquant",
        ],
        line_names=[
            "torch-bf16",
            # "fp8-tensor-w-token-a",
            "fp8-tensor-w-tensor-a",
            "fp8-channel-w-token-a",
            # "fp8-channel-w-tensor-a",
            # "fp8-tensor-w-token-a-noquant",
            "fp8-tensor-w-tensor-a-noquant",
            "fp8-channel-w-token-a-noquant",
            # "fp8-channel-w-tensor-a-noquant",
        ],
        ylabel="TFLOP/s (larger is better)",
        plot_name="BF16 vs FP8 GEMMs",
        args={},
    )
)
def benchmark(batch_size, provider, N, K):
    M = batch_size
    device = "cuda"
    dtype = torch.bfloat16

    # Create input tensors
    a = torch.randn((M, K), device=device, dtype=dtype)
    b = torch.randn((N, K), device=device, dtype=dtype)

    quantiles = [0.5, 0.2, 0.8]

    if "torch-bf16" in provider:
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
        )

    elif "fp8" in provider:
        # Weights are always quantized ahead of time
        if "noquant" in provider:
            # For no quantization, we just measure the GEMM
            if "tensor-w-token-a" in provider:
                # Dynamic per-token quant for A, per-tensor quant for B
                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
                assert scale_b_fp8.numel() == 1
                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
                    a, use_per_token_if_dynamic=True
                )

                def run_quant():
                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)

            elif "tensor-w-tensor-a" in provider:
                # Static per-tensor quantization with fixed scales
                # for both A and B
                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
                scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
                assert scale_b_fp8.numel() == 1
                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)

                def run_quant():
                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)

            elif "channel-w-token-a" in provider:
                # Static per-channel quantization for weights, per-token
                # quant for A
                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
                assert scale_b_fp8.numel() == N
                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
                    a, use_per_token_if_dynamic=True
                )

                def run_quant():
                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)

            elif "channel-w-tensor-a" in provider:
                # Static per-channel quantization for weights, per-tensor
                # quant for A
                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
                assert scale_b_fp8.numel() == N
                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)

                def run_quant():
                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)

        else:
            # In these cases, we quantize the activations during the GEMM call
            if "tensor-w-token-a" in provider:
                # Dynamic per-token quant for A, per-tensor quant for B
                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
                assert scale_b_fp8.numel() == 1

                def run_quant():
                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
                        a, use_per_token_if_dynamic=True
                    )
                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)

            elif "tensor-w-tensor-a" in provider:
                # Static per-tensor quantization with fixed scales
                # for both A and B
                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
                scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
                assert scale_b_fp8.numel() == 1

                def run_quant():
                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)

            elif "channel-w-token-a" in provider:
                # Static per-channel quantization for weights, per-token
                # quant for A
                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
                assert scale_b_fp8.numel() == N

                def run_quant():
                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
                        a, use_per_token_if_dynamic=True
                    )
                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)

            elif "channel-w-tensor-a" in provider:
                # Static per-channel quantization for weights, per-tensor
                # quant for A
                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
                assert scale_b_fp8.numel() == N

                def run_quant():
                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)

        b_fp8 = b_fp8.t()

        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: run_quant(), quantiles=quantiles
        )

    # Calculate TFLOP/s, two flops per multiply-add
    tflops = lambda ms: (2 * M * N * K) * 1e-12 / (ms * 1e-3)
    return tflops(ms), tflops(max_ms), tflops(min_ms)


def prepare_shapes(args):
    KN_model_names = []
    models_tps = list(itertools.product(args.models, args.tp_sizes))
    for model, tp_size in models_tps:
        assert model in WEIGHT_SHAPES
        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
            KN.append(model)
            KN_model_names.append(KN)
    return KN_model_names


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=["meta-llama/Llama-3.1-8B-Instruct"],
        choices=[*WEIGHT_SHAPES.keys()],
        help="List of models to benchmark",
    )
    parser.add_argument(
        "--tp-sizes",
        nargs="+",
        type=int,
        default=[1],
        help="List of tensor parallel sizes",
    )
    args = parser.parse_args()

    KN_model_names = prepare_shapes(args)
    for K, N, model_name in KN_model_names:
        print(f"{model_name}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
        benchmark.run(
            print_data=True,
            show_plots=True,
            save_path=f"bench_fp8_res_n{N}_k{K}",
            N=N,
            K=K,
        )

    print("Benchmark finished!")
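The new script is driven by its two argparse flags, --models (which must be keys of WEIGHT_SHAPES) and --tp-sizes; prepare_shapes divides each weight shape along its tensor-parallel split dimension before benchmarking. The reported metric converts measured latency into throughput via tflops = 2*M*N*K * 1e-12 / (ms * 1e-3), i.e. two floating-point operations per multiply-accumulate. A quick sanity check with hypothetical numbers (not taken from any run):

# Hypothetical example: M=1024, N=K=4096, 0.25 ms measured per GEMM.
M, N, K, ms = 1024, 4096, 4096, 0.25
flops = 2 * M * N * K                     # two flops per multiply-add
tflops = flops * 1e-12 / (ms * 1e-3)      # convert to TFLOP/s
print(round(tflops, 1))                   # ~137.4 TFLOP/s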