Compare commits
783 Commits
v0.6.1.pos
...
v0.6.4.pos
| Author | SHA1 | Date | |
|---|---|---|---|
| a6221a144a | |||
| 79ee45b428 | |||
| 691a3ec047 | |||
| 3a763ba0c3 | |||
| f2056f726d | |||
| 1d65ec7eeb | |||
| 26908554b2 | |||
| b311efd0bd | |||
| 3d158cdc8d | |||
| 02dbf30e9a | |||
| 2ac6d0e75b | |||
| 2ec8827288 | |||
| b40cf6402e | |||
| 2885ba0e24 | |||
| bf2ddc6610 | |||
| 972112d82f | |||
| 11cd1ae6ad | |||
| 554af9228d | |||
| b2e0ad3b59 | |||
| 4a18fd14ba | |||
| 1dbae0329c | |||
| 675d603400 | |||
| 03025c023f | |||
| 29f3ef26a3 | |||
| 294bf467ba | |||
| 52b48c1ead | |||
| f67ce05d0b | |||
| e0853b6508 | |||
| 504ac53d18 | |||
| 15bb8330aa | |||
| ac49b59d8b | |||
| 0b8bb86bf1 | |||
| bb7991aa29 | |||
| d909acf9fe | |||
| b6dde33019 | |||
| 1b886aa104 | |||
| 3945c82346 | |||
| 032fcf16ae | |||
| 56a955e774 | |||
| bbd3e86926 | |||
| 0d4ea3fb5c | |||
| 112fa0bbe5 | |||
| 377b74fe87 | |||
| 18081451f9 | |||
| 96ae0eaeb2 | |||
| 1f55e05713 | |||
| 8a06428c70 | |||
| b41fb9d3b1 | |||
| 7c65527918 | |||
| 47db6ec831 | |||
| 176fcb1c71 | |||
| a838ba7254 | |||
| 36c513a076 | |||
| d201d41973 | |||
| 3a28f18b0b | |||
| 812c981fa0 | |||
| 7f5edb5900 | |||
| eea55cca5b | |||
| 9cdba9669c | |||
| d1c6799b88 | |||
| 6ace6fba2c | |||
| 08f93e7439 | |||
| 9d5b4e4dea | |||
| 8a7fe47d32 | |||
| 4800339c62 | |||
| fe15729a2b | |||
| 330e82d34a | |||
| d7a4f2207b | |||
| f9dadfbee3 | |||
| 25144ceed0 | |||
| e6de9784d2 | |||
| 36fc439de0 | |||
| 874f551b36 | |||
| 2cebda42bb | |||
| 5fb1f935b0 | |||
| 36e4acd02a | |||
| 58170d6503 | |||
| 9804ac7c7c | |||
| f89d18ff74 | |||
| f0f2e5638e | |||
| ad9a78bf64 | |||
| 73b9083e99 | |||
| 20cf2f553c | |||
| bfb7d61a7c | |||
| 19682023b6 | |||
| 9fa4bdde9d | |||
| 51c2e1fcef | |||
| b09895a618 | |||
| d88bff1b96 | |||
| 9e37266420 | |||
| 8a4358ecb5 | |||
| bd46357ad9 | |||
| f192aeba74 | |||
| 8e1529dc57 | |||
| 1a95f10ee7 | |||
| 49d2a41a86 | |||
| 47672f38b5 | |||
| f83feccd7f | |||
| e0191a95d8 | |||
| d7edca1dee | |||
| 127c07480e | |||
| 10b67d865d | |||
| 4f93dfe952 | |||
| e1b5a82179 | |||
| 87713c6053 | |||
| b5815c8413 | |||
| 6b30471586 | |||
| f6778620a9 | |||
| 0535e5fe6c | |||
| b489fc3c91 | |||
| 208ce622c7 | |||
| 1ff4aed5bd | |||
| f10797c0ce | |||
| f4c2187e29 | |||
| aea6ad629f | |||
| da07a9ead7 | |||
| 3a7f15a398 | |||
| 7371749d54 | |||
| ad39bd640c | |||
| 40d0e7411d | |||
| 6bb52b0f97 | |||
| 201fc07730 | |||
| 42b4f46b71 | |||
| 073a472728 | |||
| 93bff421bc | |||
| 28b2877d30 | |||
| 97b8475beb | |||
| a2f1f3b089 | |||
| 3be5b26a76 | |||
| de0e61a323 | |||
| 9d43afcc53 | |||
| ae62fd17c0 | |||
| a62bc0109c | |||
| 999df95b4e | |||
| a6f332d0d9 | |||
| 0dfba97b42 | |||
| aa9078fa03 | |||
| e036e527a0 | |||
| 6192e9b8fe | |||
| d7263a1bb8 | |||
| 104d729656 | |||
| db7db4aab9 | |||
| 1fa020c539 | |||
| e7b84c394d | |||
| a4b3e0c1e9 | |||
| 29862b884b | |||
| d3859f1891 | |||
| 4ab3256644 | |||
| 719c1ca468 | |||
| 74f2f8a0f1 | |||
| d58268c56a | |||
| 87bd7e0515 | |||
| 098f94de42 | |||
| 399c798608 | |||
| 406d4cc480 | |||
| a5bba7d234 | |||
| 2003cc3513 | |||
| 6a585a23d2 | |||
| a02a50e6e5 | |||
| a5fda50a10 | |||
| 21063c11c7 | |||
| 4be3a45158 | |||
| 4089985552 | |||
| 9d59b75593 | |||
| ea928f608c | |||
| 2bcbae704c | |||
| ffc0f2b47a | |||
| 82bfc38d07 | |||
| c4cacbaa7f | |||
| 0c63c34f72 | |||
| 966e31697b | |||
| 43300bd98a | |||
| ca9844b340 | |||
| 235366fe2e | |||
| 02462465ea | |||
| b9c64c0ca7 | |||
| d2e80332a7 | |||
| a53046b16f | |||
| 731aec5be7 | |||
| 09d3550372 | |||
| cd34029e91 | |||
| 5952d81139 | |||
| 93dee88f6b | |||
| 7a83b1aec0 | |||
| ad23318928 | |||
| bbc3619dc8 | |||
| 04bbf38e05 | |||
| 8f0a9ca890 | |||
| 2094062b4e | |||
| d93478b399 | |||
| ac04a97a9f | |||
| 9a5664d4a4 | |||
| 04cef2c6ab | |||
| 6e056bcf04 | |||
| 5208dc7a20 | |||
| 1c45f4c385 | |||
| 603a661ae8 | |||
| fb2716d641 | |||
| 8d72bb20fa | |||
| ac6b8f19b9 | |||
| ccb5376a9a | |||
| ea4adeddc1 | |||
| 4dbcbbeb09 | |||
| b67feb1274 | |||
| c49f0407ba | |||
| 91c9ebbb1b | |||
| 54597724f4 | |||
| 1f1b6d6eda | |||
| 3bb4befea7 | |||
| ae5279a163 | |||
| 1b73ab2a1f | |||
| cea808f325 | |||
| 74b529ceee | |||
| d6459b4516 | |||
| e893795443 | |||
| 1d4cfe2be1 | |||
| eed92f12fc | |||
| af7380d83b | |||
| a78dd3303e | |||
| d522034c85 | |||
| 6c0b7f548d | |||
| d151fde834 | |||
| 27cd36e6e2 | |||
| 18bd7587b7 | |||
| 598b6d7b07 | |||
| aff1fd8188 | |||
| 4581d2cc02 | |||
| 1dd4cb2935 | |||
| ba0d892074 | |||
| 30a2e80742 | |||
| 06386a64dd | |||
| d3aa2a8b2f | |||
| 2b5bf20988 | |||
| 93a76dd21d | |||
| 566cd27797 | |||
| 37a4947dcd | |||
| 96e0c9cbbd | |||
| 031a7995f3 | |||
| b63c64d95b | |||
| 9fb12f7848 | |||
| 55650c83a0 | |||
| 77f7ef2908 | |||
| 16b8f7a86f | |||
| 5608e611c2 | |||
| 3ea2dc2ec4 | |||
| d087bf863e | |||
| 890ca36072 | |||
| abbfb6134d | |||
| 64384bbcdf | |||
| 00d91c8a2c | |||
| c2cd1a2142 | |||
| c787f2d81d | |||
| 33d257735f | |||
| 3b3f1e7436 | |||
| 9ff4511e43 | |||
| 81f09cfd80 | |||
| cc98f1e079 | |||
| 211fe91aa8 | |||
| 6aa6020f9b | |||
| ff5ed6e1bc | |||
| 7b0365efef | |||
| 04a3ae0aca | |||
| 62fac4b9aa | |||
| 226688bd61 | |||
| 64cb1cdc3f | |||
| 1ab6f6b4ad | |||
| bc73e9821c | |||
| 8d7724104a | |||
| 882a1ad0de | |||
| 67bdf8e523 | |||
| 0ad216f575 | |||
| 7585ec996f | |||
| ab6f981671 | |||
| ac3d748dba | |||
| 0ce7798f44 | |||
| 0f43387157 | |||
| 08600ddc68 | |||
| 74fc2d77ae | |||
| 622b7ab955 | |||
| 09500f7dde | |||
| ef7865b4f9 | |||
| eae3d48181 | |||
| e74f2d448c | |||
| 7a4df5f200 | |||
| c5d7fb9ddc | |||
| 76ed5340f0 | |||
| 97b61bfae6 | |||
| aa0addb397 | |||
| 5f8d8075f9 | |||
| 8b0e4f2ad7 | |||
| 2adb4409e0 | |||
| feb92fbe4a | |||
| 32176fee73 | |||
| 4e2d95e372 | |||
| 34a9941620 | |||
| e130c40e4e | |||
| 3cb07a36a2 | |||
| 8549c82660 | |||
| 67a6882da4 | |||
| 6650e6a930 | |||
| 07e981fdf4 | |||
| 55137e8ee3 | |||
| 5cbdccd151 | |||
| 067e77f9a8 | |||
| 6567e13724 | |||
| 228cfbd03f | |||
| ca0d92227e | |||
| 9645b9f646 | |||
| a6f3721861 | |||
| 9f7b4ba865 | |||
| c91ed47c43 | |||
| 59449095ab | |||
| e26d37a185 | |||
| 722d46edb9 | |||
| c866e0079d | |||
| d27cfbf791 | |||
| de662d32b5 | |||
| f58454968f | |||
| b979143d5b | |||
| ad6f78053e | |||
| 295a061fb3 | |||
| 8a02cd045a | |||
| 4fdc581f9e | |||
| 3770071eb4 | |||
| 836e8ef6ee | |||
| 056a68c7db | |||
| 33bab41060 | |||
| b7df53cd42 | |||
| bb01f2915e | |||
| b548d7a5f4 | |||
| fc6c274626 | |||
| 150b779081 | |||
| 9013e24f7b | |||
| fd0e2cfdb2 | |||
| e5ac6a4199 | |||
| dbdd3b5e5a | |||
| e7116c017c | |||
| 31a08f5bd2 | |||
| c18e1a3418 | |||
| 3ff57ebfca | |||
| 2394962d70 | |||
| 51c24c9736 | |||
| 831540cf04 | |||
| 29061ed9df | |||
| 65050a40e6 | |||
| 208cb34c81 | |||
| b17046e298 | |||
| d1e8240875 | |||
| cb6fdaa0a0 | |||
| 23b899a8e6 | |||
| 17c79f3c36 | |||
| cd5601ac37 | |||
| 434984e665 | |||
| 32a1ee74a0 | |||
| 08075c3448 | |||
| bb392ea2d2 | |||
| 9dbcce84a7 | |||
| a48e3ec052 | |||
| 6c5af09b39 | |||
| 3ddbe25502 | |||
| 0d02747f2e | |||
| f7db5f0fa9 | |||
| ca30c3c84b | |||
| c0292211ce | |||
| 74692421f7 | |||
| 29acd2c34c | |||
| f085995a7b | |||
| b729901139 | |||
| 76a5e13270 | |||
| ef7faad1b8 | |||
| 575dcebe9a | |||
| 711f3a7806 | |||
| 15713e3b75 | |||
| d621c43df7 | |||
| 9d9186be97 | |||
| 5241aa1494 | |||
| ec6bd6c4c6 | |||
| 8ca8954841 | |||
| f6b97293aa | |||
| 496e991da8 | |||
| 696b01af8f | |||
| 855e0e6f97 | |||
| 4fa3e33349 | |||
| 962d2c6349 | |||
| 5b59fe0f08 | |||
| 8e3e7f2713 | |||
| 263d8ee150 | |||
| c5eea3c8ba | |||
| 85dc92fc98 | |||
| dfd951ed9b | |||
| 82c25151ec | |||
| 1325872ec8 | |||
| 380e18639f | |||
| 337ed76671 | |||
| 0c9a5258f9 | |||
| d11bf435a0 | |||
| 9bb10a7d27 | |||
| 3921a2f29e | |||
| 67a7e5ef38 | |||
| 051eaf6db3 | |||
| 7dbe738d65 | |||
| ae8b633ba3 | |||
| 1bbbcc0b1d | |||
| 25aeb7d4c9 | |||
| d2b1bf55ec | |||
| 1ffc8a7362 | |||
| 944dd8edaf | |||
| 154a8ae880 | |||
| de4008e2ab | |||
| 48138a8415 | |||
| 343f8e0905 | |||
| bb76538bbd | |||
| d615b5c9f8 | |||
| d65049daab | |||
| eca2c5f7c0 | |||
| 0f41fbe5a3 | |||
| 7871659abb | |||
| a2c71c5405 | |||
| 81ede99ca4 | |||
| 5eda21e773 | |||
| 8e1cddcd44 | |||
| 5e443b594f | |||
| 9d30a056e7 | |||
| 390be74649 | |||
| e312e52b44 | |||
| dbfa8d31d5 | |||
| 92d86da217 | |||
| c3fab5f769 | |||
| 776dbd74f1 | |||
| 8345045833 | |||
| 5b8a1fde84 | |||
| fb60ae9b91 | |||
| 415f76a9cb | |||
| cf1d62a644 | |||
| 59230ef32b | |||
| cee711fdbb | |||
| 1de76a0e55 | |||
| 7abba39ee6 | |||
| 7e7eae338d | |||
| ed920135c8 | |||
| 717a5f82cd | |||
| ba30942240 | |||
| 22f8a69549 | |||
| 5d264f4ab8 | |||
| e9d517f276 | |||
| 55e081fbad | |||
| 8e836d982a | |||
| 44eaa5a5d9 | |||
| 169b530607 | |||
| f0fe4fe86d | |||
| 4d31cd424b | |||
| 473e7b3606 | |||
| fd47e57f4b | |||
| 203ab8f80f | |||
| 4141608c6a | |||
| dfe43a2071 | |||
| 16b24e7dcd | |||
| f519902c52 | |||
| 250e26a63e | |||
| 2b184ddd4f | |||
| 00298e092c | |||
| 89feb4c84d | |||
| ec10cb8511 | |||
| d11b46f3a5 | |||
| c6cf9295e1 | |||
| de9fb4bef8 | |||
| 8baf85e4e9 | |||
| 1a1823871d | |||
| 6cf1167c1a | |||
| f710090d8e | |||
| 7342a7d7f8 | |||
| df3dcdf49d | |||
| 36ea79079b | |||
| e808156f30 | |||
| cbc2ef5529 | |||
| 94bf9ae4e9 | |||
| f990bab2a4 | |||
| e00c094f15 | |||
| a78c6ba7c8 | |||
| fb870fd491 | |||
| 270953bafb | |||
| 9cc811c4ff | |||
| e4d652ea3e | |||
| 78c0b4166c | |||
| 21efb603f5 | |||
| 055f3270d4 | |||
| 18511aeda6 | |||
| 83ea5c72b9 | |||
| 04de9057ab | |||
| 07c11cf4d4 | |||
| f3a507f1d3 | |||
| a64e7b9407 | |||
| ce00231a8b | |||
| de895f1697 | |||
| cf25b93bdd | |||
| d5fbb8706d | |||
| cdca8994bd | |||
| ca77dd7a44 | |||
| 7dea289066 | |||
| cfaa6008e6 | |||
| 21906a6f50 | |||
| dc4aea677a | |||
| c8627cd41b | |||
| 8bfaa4e31e | |||
| 0b5b5d767e | |||
| cdc72e3c80 | |||
| 7627172bf4 | |||
| 480b7f40cf | |||
| acce7630c1 | |||
| ffc4b27ea8 | |||
| 2f4117c38e | |||
| 9ba0bd6aa6 | |||
| 2a131965a8 | |||
| bd37b9fbe2 | |||
| de24046fcd | |||
| 1874c6a1b0 | |||
| 9a94ca4a5d | |||
| cfba685bd4 | |||
| 069d3bd8d0 | |||
| a3691b6b5e | |||
| 8c746226c9 | |||
| e1faa2a598 | |||
| 80b57f00d5 | |||
| 04c12f8157 | |||
| 8eeb857084 | |||
| fa45513a51 | |||
| c0d9a98d0c | |||
| e0dbdb013d | |||
| 93cf74a8a7 | |||
| 151ef4efd2 | |||
| f19da64871 | |||
| 4f95ffee6f | |||
| 8c6de96ea1 | |||
| 18b296fdb2 | |||
| c8f26bb636 | |||
| 487678d046 | |||
| cb3b2b9ba4 | |||
| fdf59d30ea | |||
| b22b798471 | |||
| f22619fe96 | |||
| 168cab6bbf | |||
| 23fea8714a | |||
| f4dd830e09 | |||
| 5df1834895 | |||
| cfadb9c687 | |||
| 15986f598c | |||
| 53b3a33027 | |||
| dac914b0d6 | |||
| a95354a36e | |||
| 663874e048 | |||
| cc90419e89 | |||
| 27302dd584 | |||
| 0cc566ca8f | |||
| 05c531be47 | |||
| fbb74420e7 | |||
| 05d686432f | |||
| 0dcc8cbe5a | |||
| 26aa325f4f | |||
| e5dc713c23 | |||
| 36eecfbddb | |||
| 9ade8bbc8d | |||
| 22482e495e | |||
| 3d826d2c52 | |||
| 0e36fd4909 | |||
| 0f6d7a9a34 | |||
| 303d44790a | |||
| aeb37c2a72 | |||
| 3dbb215b38 | |||
| 2838d6b38e | |||
| 91add85ec4 | |||
| 9aaf14c62e | |||
| 63e39937f9 | |||
| f5d72b2fc6 | |||
| 83caf35e08 | |||
| 01843c89b8 | |||
| 19a4dd0990 | |||
| 18c2e30c57 | |||
| 19f0d25796 | |||
| f58d4fccc9 | |||
| afb050b29d | |||
| 7f60520deb | |||
| 563649aafe | |||
| 1570203864 | |||
| 22f5851b80 | |||
| 4f341bd4bf | |||
| 35bd215168 | |||
| 1fe0a4264a | |||
| bc4eb65b54 | |||
| 82f3937e59 | |||
| 7da2487591 | |||
| aaccca2b4d | |||
| 062c89e7c9 | |||
| bce324487a | |||
| 1425a1bcf9 | |||
| 1cabfcefb6 | |||
| be76e5aabf | |||
| 2ae25f79cf | |||
| 8e60afa15e | |||
| b6d7392579 | |||
| e01ab595d8 | |||
| f13a07b1f8 | |||
| 6c9ba48fde | |||
| 1fb9c1b0bf | |||
| 31f46a0d35 | |||
| 3d49776bbb | |||
| bc2ef1f77c | |||
| 2e7fe7e79f | |||
| 26a68d5d7e | |||
| d081da0064 | |||
| 5bf8789b2a | |||
| d1537039ce | |||
| cc276443b5 | |||
| e585b583a9 | |||
| 090e945e36 | |||
| e1a3f5e831 | |||
| 19d02ff938 | |||
| 39d3f8d94f | |||
| b0298aa8cc | |||
| 260024a374 | |||
| d86f6b2afb | |||
| bd429f2b75 | |||
| 18e60d7d13 | |||
| c2ec430ab5 | |||
| c5d55356f9 | |||
| 172d1cd276 | |||
| a9b15c606f | |||
| 8df2dc3c88 | |||
| 6d792d2f31 | |||
| 0e088750af | |||
| dc4e3df5c2 | |||
| 3b00b9c26c | |||
| 344cd2b6f4 | |||
| 1b49148e47 | |||
| 4b377d6feb | |||
| 71d21c73ab | |||
| ee2da3e9ef | |||
| e2f6f26e86 | |||
| b28d2104de | |||
| 93d364da34 | |||
| d9cfbc891e | |||
| 70de39f6b4 | |||
| 68988d4e0d | |||
| 520db4dbc1 | |||
| f70bccac75 | |||
| 4bb98f2190 | |||
| 7193774b1f | |||
| e2c6e0a829 | |||
| 770ec6024f | |||
| 4f1ba0844b | |||
| 873edda6cf | |||
| 64840dfae4 | |||
| 28e1299e60 | |||
| 0c4d2ad5e6 | |||
| c6f2485c82 | |||
| 300da09177 | |||
| 1c046447a6 | |||
| 8fae5ed7f6 | |||
| 3368c3ab36 | |||
| 1ac3de09cd | |||
| 3e073e66f1 | |||
| c23953675f | |||
| e3dd0692fa | |||
| fc3afc20df | |||
| b4522474a3 | |||
| ee777d9c30 | |||
| 6e0c9d6bd0 | |||
| 6da1ab6b41 | |||
| 01b6f9e1f0 | |||
| 13f9f7a3d0 | |||
| 1e7d5c01f5 | |||
| 2467b642dd | |||
| 72fc97a0f1 | |||
| 2529d09b5a | |||
| a928ded995 | |||
| cc4325b66a | |||
| 8ff7ced996 | |||
| 3f06bae907 | |||
| b8747e8a7c | |||
| 3185fb0cca | |||
| 0250dd68c5 | |||
| 88577ac928 | |||
| 530821d00c | |||
| 1a2aef3e59 | |||
| 5f7bb58427 | |||
| b05f5c9238 | |||
| 9b0e3ec970 | |||
| 86e9c8df29 | |||
| ee5f34b1c2 | |||
| f2bd246c17 | |||
| a79e522984 | |||
| 3e83c12b5c | |||
| e551ca1555 | |||
| 9b8c8ba119 | |||
| d23679eb99 | |||
| 57a0702e63 | |||
| 3dda7c2250 | |||
| 92ba7e7477 | |||
| d4a2ac8302 | |||
| c6bd70d772 | |||
| 5b59532760 | |||
| ca2b628b3c | |||
| 8ca5051b9a | |||
| 06ed2815e2 | |||
| 0e40ac9b7b | |||
| 13d88d4137 | |||
| d66ac62854 | |||
| 9dc7c6c7f3 | |||
| ec4aaad812 | |||
| 4dfdf43196 | |||
| 5e85f4f82a | |||
| 71c60491f2 | |||
| 0faab90eb0 | |||
| 0455c46ed4 | |||
| d4bf085ad0 | |||
| 0057894ef7 | |||
| 0f961b3ce9 | |||
| 7f9c8902e3 | |||
| 7c8566aa4f | |||
| b4e4eda92e | |||
| 2874bac618 | |||
| 035fa895ec | |||
| b28298f2f4 | |||
| 2940afa04e | |||
| 3b63de9353 | |||
| 260d40b5ea | |||
| 9e5ec35b1f | |||
| 18ae428a0d | |||
| de6f90a13d | |||
| 6cb748e190 | |||
| 9e99407e3c | |||
| ea4647b7d7 | |||
| e42c634acb | |||
| 9cc373f390 | |||
| 76515f303b | |||
| 855c8ae2c9 | |||
| c52ec5f034 | |||
| 02c9afa2d0 | |||
| 3118f63385 | |||
| 4c34ce8916 | |||
| 0d47bf3bf4 | |||
| d9cd78eb71 | |||
| db9120cded | |||
| b3195bc9e4 | |||
| e18749ff09 | |||
| d65798f78c | |||
| a8c1d161a7 | |||
| 7c7714d856 | |||
| 9d104b5beb | |||
| 6ffa3f314c | |||
| e351572900 | |||
| 95965d31b6 | |||
| 8110e44529 | |||
| 09deb4721f | |||
| fa0c114fad | |||
| 98f9713399 | |||
| 56c3de018c | |||
| a54ed80249 | |||
| 9855b99502 | |||
| 1009e93c5d | |||
| 1b6de8352b | |||
| cbdb252259 | |||
| 99aa4eddaf | |||
| ee2bceaaa6 | |||
| 1c1bb388e0 | |||
| 546034b466 | |||
| cca61642e0 | |||
| 5ce45eb54d | |||
| 5478c4b41f | |||
| 47f5e03b5b | |||
| 2759a43a26 | |||
| 5d73ae49d6 | |||
| 781e3b9a42 | |||
| acd5511b6d | |||
| 837c1968f9 | |||
| a091e2da3e | |||
| fc990f9795 | |||
| 3724d5f6b5 | |||
| 50e9ec41fc | |||
| 47790f3e32 | |||
| a36e070dad | |||
| 8a0cf1ddc3 | |||
| 1ef0d2efd0 | |||
| 851725202a |
@ -0,0 +1,11 @@
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
|
||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.764
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.764
|
||||
limit: 250
|
||||
num_fewshot: 5
|
||||
@ -0,0 +1,11 @@
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
|
||||
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.356
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.358
|
||||
limit: 1000
|
||||
num_fewshot: 5
|
||||
@ -1,6 +1,7 @@
|
||||
Meta-Llama-3-8B-Instruct.yaml
|
||||
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
|
||||
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
|
||||
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
|
||||
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
|
||||
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
|
||||
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
|
||||
Minitron-4B-Base-FP8.yaml
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# We can use this script to compute baseline accuracy on GSM for transformers.
|
||||
#
|
||||
# Make sure you have lm-eval-harness installed:
|
||||
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
|
||||
# pip install lm-eval==0.4.4
|
||||
|
||||
usage() {
|
||||
echo``
|
||||
@ -41,6 +41,6 @@ while getopts "m:b:l:f:" OPT; do
|
||||
done
|
||||
|
||||
lm_eval --model hf \
|
||||
--model_args pretrained=$MODEL,parallelize=True \
|
||||
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
|
||||
--batch_size $BATCH_SIZE
|
||||
--model_args "pretrained=$MODEL,parallelize=True" \
|
||||
--tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
|
||||
--batch_size "$BATCH_SIZE"
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
# We use this for fp8, which HF does not support.
|
||||
#
|
||||
# Make sure you have lm-eval-harness installed:
|
||||
# pip install lm-eval==0.4.3
|
||||
# pip install lm-eval==0.4.4
|
||||
|
||||
usage() {
|
||||
echo``
|
||||
@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
|
||||
done
|
||||
|
||||
lm_eval --model vllm \
|
||||
--model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \
|
||||
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
|
||||
--batch_size $BATCH_SIZE
|
||||
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
|
||||
--tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
|
||||
--batch_size "$BATCH_SIZE"
|
||||
|
||||
@ -30,7 +30,7 @@ while getopts "c:t:" OPT; do
|
||||
done
|
||||
|
||||
# Parse list of configs.
|
||||
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG
|
||||
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
|
||||
|
||||
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
|
||||
do
|
||||
|
||||
@ -49,10 +49,15 @@ def test_lm_eval_correctness():
|
||||
results = launch_lm_eval(eval_config)
|
||||
|
||||
# Confirm scores match ground truth.
|
||||
success = True
|
||||
for task in eval_config["tasks"]:
|
||||
for metric in task["metrics"]:
|
||||
ground_truth = metric["value"]
|
||||
measured_value = results["results"][task["name"]][metric["name"]]
|
||||
print(f'{task["name"]} | {metric["name"]}: '
|
||||
f'ground_truth={ground_truth} | measured={measured_value}')
|
||||
assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
|
||||
success = success and numpy.isclose(
|
||||
ground_truth, measured_value, rtol=RTOL)
|
||||
|
||||
# Assert at the end, print all scores even on failure for debugging.
|
||||
assert success
|
||||
|
||||
@ -8,8 +8,7 @@ steps:
|
||||
containers:
|
||||
- image: badouralix/curl-jq
|
||||
command:
|
||||
- sh
|
||||
- .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
|
||||
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
|
||||
- wait
|
||||
- label: "A100"
|
||||
agents:
|
||||
|
||||
28
.buildkite/nightly-benchmarks/nightly-annotation.md
Normal file
28
.buildkite/nightly-benchmarks/nightly-annotation.md
Normal file
@ -0,0 +1,28 @@
|
||||
|
||||
## Description
|
||||
|
||||
This file contains the downloading link for benchmarking results.
|
||||
|
||||
- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
|
||||
- [benchmarking results](artifact://results.zip)
|
||||
- [benchmarking code](artifact://nightly-benchmarks.zip)
|
||||
|
||||
Please download the visualization scripts in the post
|
||||
|
||||
|
||||
## Results reproduction
|
||||
|
||||
- Find the docker we use in `benchmarking pipeline`
|
||||
- Deploy the docker, and inside the docker:
|
||||
- Download `nightly-benchmarks.zip`.
|
||||
- In the same folder, run the following code
|
||||
```
|
||||
export HF_TOKEN=<your HF token>
|
||||
apt update
|
||||
apt install -y git
|
||||
unzip nightly-benchmarks.zip
|
||||
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
|
||||
```
|
||||
|
||||
And the results will be inside `./benchmarks/results`.
|
||||
|
||||
@ -1,45 +1,39 @@
|
||||
|
||||
# Nightly benchmark
|
||||
|
||||
The main goal of this benchmarking is two-fold:
|
||||
- Performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance in what workload.
|
||||
- Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md]().
|
||||
This benchmark aims to:
|
||||
- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
|
||||
- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
|
||||
|
||||
Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
|
||||
|
||||
Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
|
||||
|
||||
|
||||
## Docker images
|
||||
## Setup
|
||||
|
||||
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
|
||||
- vllm/vllm-openai:v0.5.0.post1
|
||||
- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
|
||||
- openmmlab/lmdeploy:v0.5.0
|
||||
- ghcr.io/huggingface/text-generation-inference:2.1
|
||||
- Docker images:
|
||||
- vLLM: `vllm/vllm-openai:v0.6.2`
|
||||
- SGLang: `lmsysorg/sglang:v0.3.2-cu121`
|
||||
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
|
||||
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
|
||||
- *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
|
||||
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
|
||||
- Hardware
|
||||
- 8x Nvidia A100 GPUs
|
||||
- Workload:
|
||||
- Dataset
|
||||
- ShareGPT dataset
|
||||
- Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
|
||||
- Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
|
||||
- Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
|
||||
- Models: llama-3 8B, llama-3 70B.
|
||||
- We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
|
||||
- Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
|
||||
- Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
|
||||
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
|
||||
|
||||
<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. -->
|
||||
# Known issues
|
||||
|
||||
|
||||
## Hardware
|
||||
|
||||
One AWS node with 8x NVIDIA A100 GPUs.
|
||||
|
||||
|
||||
## Workload description
|
||||
|
||||
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
|
||||
|
||||
- Input length: randomly sample 500 prompts from ShareGPT dataset (with fixed random seed).
|
||||
- Output length: the corresponding output length of these 500 prompts.
|
||||
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
|
||||
- Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
|
||||
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
|
||||
|
||||
<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
|
||||
|
||||
## Plots
|
||||
|
||||
In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed.
|
||||
|
||||
<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
|
||||
|
||||
## Results
|
||||
|
||||
{nightly_results_benchmarking_table}
|
||||
- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
|
||||
- TGI does not support `ignore-eos` flag.
|
||||
@ -13,7 +13,7 @@ common_pod_spec: &common_pod_spec
|
||||
|
||||
common_container_settings: &common_container_settings
|
||||
command:
|
||||
- bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
|
||||
- bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: 8
|
||||
@ -37,7 +37,10 @@ common_container_settings: &common_container_settings
|
||||
|
||||
steps:
|
||||
- block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
|
||||
- label: "A100 trt benchmark"
|
||||
|
||||
|
||||
|
||||
- label: "A100 vllm step 10"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
@ -46,7 +49,21 @@ steps:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
|
||||
- image: vllm/vllm-openai:v0.6.2
|
||||
<<: *common_container_settings
|
||||
|
||||
|
||||
|
||||
- label: "A100 sglang benchmark"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: lmsysorg/sglang:v0.3.2-cu121
|
||||
<<: *common_container_settings
|
||||
|
||||
- label: "A100 lmdeploy benchmark"
|
||||
@ -58,11 +75,13 @@ steps:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: openmmlab/lmdeploy:v0.5.0
|
||||
- image: openmmlab/lmdeploy:v0.6.1-cu12
|
||||
<<: *common_container_settings
|
||||
|
||||
|
||||
- label: "A100 vllm benchmark"
|
||||
|
||||
|
||||
|
||||
- label: "A100 trt llama-8B"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
@ -71,10 +90,25 @@ steps:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: vllm/vllm-openai:latest
|
||||
- image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
|
||||
<<: *common_container_settings
|
||||
env:
|
||||
- name: VLLM_USAGE_SOURCE
|
||||
value: ci-test
|
||||
- name: HF_HOME
|
||||
value: /root/.cache/huggingface
|
||||
- name: VLLM_SOURCE_CODE_LOC
|
||||
value: /workspace/build/buildkite/vllm/performance-benchmark
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
- name: TEST_SELECTOR
|
||||
value: "llama8B"
|
||||
|
||||
- label: "A100 tgi benchmark"
|
||||
|
||||
- label: "A100 trt llama-70B"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
@ -83,12 +117,54 @@ steps:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: ghcr.io/huggingface/text-generation-inference:2.1
|
||||
- image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
|
||||
<<: *common_container_settings
|
||||
env:
|
||||
- name: VLLM_USAGE_SOURCE
|
||||
value: ci-test
|
||||
- name: HF_HOME
|
||||
value: /root/.cache/huggingface
|
||||
- name: VLLM_SOURCE_CODE_LOC
|
||||
value: /workspace/build/buildkite/vllm/performance-benchmark
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
- name: TEST_SELECTOR
|
||||
value: "llama70B"
|
||||
|
||||
|
||||
# FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image
|
||||
# - label: "A100 trt benchmark"
|
||||
# priority: 100
|
||||
# agents:
|
||||
# queue: A100
|
||||
# plugins:
|
||||
# - kubernetes:
|
||||
# podSpec:
|
||||
# <<: *common_pod_spec
|
||||
# containers:
|
||||
# - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
|
||||
# <<: *common_container_settings
|
||||
|
||||
|
||||
# FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
|
||||
# - label: "A100 tgi benchmark"
|
||||
# priority: 100
|
||||
# agents:
|
||||
# queue: A100
|
||||
# plugins:
|
||||
# - kubernetes:
|
||||
# podSpec:
|
||||
# <<: *common_pod_spec
|
||||
# containers:
|
||||
# - image: ghcr.io/huggingface/text-generation-inference:2.2.0
|
||||
# <<: *common_container_settings
|
||||
|
||||
- wait
|
||||
|
||||
- label: "Plot"
|
||||
- label: "Collect the results"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
@ -117,4 +193,4 @@ steps:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
|
||||
- wait
|
||||
- block: ":rocket: check the results!"
|
||||
@ -1,76 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -o pipefail
|
||||
set -x
|
||||
|
||||
check_gpus() {
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
check_hf_token() {
|
||||
# check if HF_TOKEN is available and valid
|
||||
if [[ -z "$HF_TOKEN" ]]; then
|
||||
echo "Error: HF_TOKEN is not set."
|
||||
exit 1
|
||||
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
|
||||
echo "Error: HF_TOKEN does not start with 'hf_'."
|
||||
exit 1
|
||||
else
|
||||
echo "HF_TOKEN is set and valid."
|
||||
fi
|
||||
}
|
||||
|
||||
main() {
|
||||
|
||||
check_gpus
|
||||
check_hf_token
|
||||
|
||||
df -h
|
||||
|
||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||
(which jq) || (apt-get update && apt-get -y install jq)
|
||||
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
|
||||
|
||||
# run lmdeploy
|
||||
if which lmdeploy >/dev/null; then
|
||||
echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
|
||||
bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# run tgi
|
||||
if [ -e /tgi-entrypoint.sh ]; then
|
||||
echo "tgi is available, redirect to run-tgi-nightly.sh"
|
||||
bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# run trt
|
||||
if which trtllm-build >/dev/null; then
|
||||
echo "trtllm is available, redirect to run-trt-nightly.sh"
|
||||
bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# run vllm
|
||||
if [ -e /vllm-workspace ]; then
|
||||
echo "vllm is available, redirect to run-vllm-nightly.sh"
|
||||
bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
|
||||
exit 0
|
||||
fi
|
||||
|
||||
}
|
||||
|
||||
main "$@"
|
||||
@ -56,7 +56,7 @@ serving_column_mapping = {
|
||||
|
||||
def read_markdown(file):
|
||||
if os.path.exists(file):
|
||||
with open(file, "r") as f:
|
||||
with open(file) as f:
|
||||
return f.read() + "\n"
|
||||
else:
|
||||
return f"{file} not found.\n"
|
||||
@ -75,14 +75,14 @@ if __name__ == "__main__":
|
||||
# collect results
|
||||
for test_file in results_folder.glob("*.json"):
|
||||
|
||||
with open(test_file, "r") as f:
|
||||
with open(test_file) as f:
|
||||
raw_result = json.loads(f.read())
|
||||
|
||||
if "serving" in str(test_file):
|
||||
# this result is generated via `benchmark_serving.py`
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
with open(test_file.with_suffix(".commands"), "r") as f:
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
raw_result.update(command)
|
||||
|
||||
@ -97,7 +97,7 @@ if __name__ == "__main__":
|
||||
# this result is generated via `benchmark_latency.py`
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
with open(test_file.with_suffix(".commands"), "r") as f:
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
raw_result.update(command)
|
||||
|
||||
@ -119,7 +119,7 @@ if __name__ == "__main__":
|
||||
# this result is generated via `benchmark_throughput.py`
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
with open(test_file.with_suffix(".commands"), "r") as f:
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
raw_result.update(command)
|
||||
|
||||
|
||||
@ -0,0 +1,95 @@
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from tabulate import tabulate
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(
|
||||
description=
|
||||
'Parse command line arguments for summary-nightly-results script.')
|
||||
parser.add_argument('--results-folder',
|
||||
type=str,
|
||||
required=True,
|
||||
help='The folder where the results are stored.')
|
||||
parser.add_argument('--description',
|
||||
type=str,
|
||||
required=True,
|
||||
help='Description of the results.')
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def get_perf(df, method, model, metric):
|
||||
|
||||
means = []
|
||||
|
||||
for qps in [2, 4, 8, 16, "inf"]:
|
||||
target = df['Test name'].str.contains(model)
|
||||
target = target & df['Engine'].str.contains(method)
|
||||
target = target & df['Test name'].str.contains("qps_" + str(qps))
|
||||
filtered_df = df[target]
|
||||
|
||||
if filtered_df.empty:
|
||||
means.append(0.)
|
||||
else:
|
||||
means.append(filtered_df[metric].values[0])
|
||||
|
||||
return np.array(means)
|
||||
|
||||
|
||||
def get_perf_w_std(df, method, model, metric):
|
||||
|
||||
if metric in ["TTFT", "ITL"]:
|
||||
mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
|
||||
mean = mean.tolist()
|
||||
std = get_perf(df, method, model, "Std " + metric + " (ms)")
|
||||
if std.mean() == 0:
|
||||
std = None
|
||||
success = get_perf(df, method, model, "Successful req.")
|
||||
if std is not None:
|
||||
std = std / np.sqrt(success)
|
||||
std = std.tolist()
|
||||
|
||||
else:
|
||||
assert metric == "Tput"
|
||||
mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
|
||||
df, method, model, "Output Tput (tok/s)")
|
||||
mean = mean.tolist()
|
||||
std = None
|
||||
|
||||
return mean, std
|
||||
|
||||
|
||||
def main(args):
|
||||
results_folder = Path(args.results_folder)
|
||||
|
||||
results = []
|
||||
|
||||
# collect results
|
||||
for test_file in results_folder.glob("*_nightly_results.json"):
|
||||
with open(test_file) as f:
|
||||
results = results + json.loads(f.read())
|
||||
|
||||
# generate markdown table
|
||||
df = pd.DataFrame.from_dict(results)
|
||||
|
||||
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
|
||||
|
||||
with open(args.description) as f:
|
||||
description = f.read()
|
||||
|
||||
description = description.format(
|
||||
nightly_results_benchmarking_table=md_table)
|
||||
|
||||
with open("nightly_results.md", "w") as f:
|
||||
f.write(description)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_arguments()
|
||||
main(args)
|
||||
228
.buildkite/nightly-benchmarks/scripts/launch-server.sh
Normal file
228
.buildkite/nightly-benchmarks/scripts/launch-server.sh
Normal file
@ -0,0 +1,228 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Currently FP8 benchmark is NOT enabled.
|
||||
|
||||
set -x
|
||||
server_params=$1
|
||||
common_params=$2
|
||||
|
||||
json2args() {
|
||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
||||
# example:
|
||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
||||
local json_string=$1
|
||||
local args=$(
|
||||
echo "$json_string" | jq -r '
|
||||
to_entries |
|
||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
||||
join(" ")
|
||||
'
|
||||
)
|
||||
echo "$args"
|
||||
}
|
||||
|
||||
launch_trt_server() {
|
||||
|
||||
model_path=$(echo "$common_params" | jq -r '.model')
|
||||
model_name="${model_path#*/}"
|
||||
model_type=$(echo "$server_params" | jq -r '.model_type')
|
||||
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
|
||||
model_tp_size=$(echo "$common_params" | jq -r '.tp')
|
||||
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
|
||||
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
|
||||
max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len')
|
||||
max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens')
|
||||
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
|
||||
|
||||
# create model caching directory
|
||||
cd ~
|
||||
rm -rf models
|
||||
mkdir -p models
|
||||
cd models
|
||||
models_dir=$(pwd)
|
||||
trt_model_path=${models_dir}/${model_name}-trt-ckpt
|
||||
trt_engine_path=${models_dir}/${model_name}-trt-engine
|
||||
|
||||
# clone tensorrt backend
|
||||
cd /
|
||||
rm -rf tensorrtllm_backend
|
||||
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
|
||||
git lfs install
|
||||
cd tensorrtllm_backend
|
||||
git checkout "$trt_llm_version"
|
||||
git submodule update --init --recursive
|
||||
|
||||
# build trtllm engine
|
||||
cd /tensorrtllm_backend
|
||||
cd "./tensorrt_llm/examples/${model_type}"
|
||||
python3 convert_checkpoint.py \
|
||||
--model_dir "${model_path}" \
|
||||
--dtype "${model_dtype}" \
|
||||
--tp_size "${model_tp_size}" \
|
||||
--output_dir "${trt_model_path}"
|
||||
trtllm-build \
|
||||
--checkpoint_dir "${trt_model_path}" \
|
||||
--use_fused_mlp \
|
||||
--reduce_fusion disable \
|
||||
--workers 8 \
|
||||
--gpt_attention_plugin "${model_dtype}" \
|
||||
--gemm_plugin "${model_dtype}" \
|
||||
--tp_size "${model_tp_size}" \
|
||||
--max_batch_size "${max_batch_size}" \
|
||||
--max_input_len "${max_input_len}" \
|
||||
--max_seq_len "${max_seq_len}" \
|
||||
--max_num_tokens "${max_num_tokens}" \
|
||||
--output_dir "${trt_engine_path}"
|
||||
|
||||
# handle triton protobuf files and launch triton server
|
||||
cd /tensorrtllm_backend
|
||||
mkdir triton_model_repo
|
||||
cp -r all_models/inflight_batcher_llm/* triton_model_repo/
|
||||
cd triton_model_repo
|
||||
rm -rf ./tensorrt_llm/1/*
|
||||
cp -r "${trt_engine_path}"/* ./tensorrt_llm/1
|
||||
python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
|
||||
python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5"
|
||||
python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false"
|
||||
python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size"
|
||||
python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1"
|
||||
cd /tensorrtllm_backend
|
||||
python3 scripts/launch_triton_server.py \
|
||||
--world_size="${model_tp_size}" \
|
||||
--model_repo=/tensorrtllm_backend/triton_model_repo &
|
||||
|
||||
}
|
||||
|
||||
launch_tgi_server() {
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
server_args=$(json2args "$server_params")
|
||||
|
||||
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
||||
echo "Key 'fp8' exists in common params."
|
||||
server_command="/tgi-entrypoint.sh \
|
||||
--model-id $model \
|
||||
--num-shard $tp \
|
||||
--port $port \
|
||||
--quantize fp8 \
|
||||
$server_args"
|
||||
else
|
||||
echo "Key 'fp8' does not exist in common params."
|
||||
server_command="/tgi-entrypoint.sh \
|
||||
--model-id $model \
|
||||
--num-shard $tp \
|
||||
--port $port \
|
||||
$server_args"
|
||||
fi
|
||||
|
||||
echo "Server command: $server_command"
|
||||
eval "$server_command" &
|
||||
|
||||
}
|
||||
|
||||
launch_lmdeploy_server() {
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
server_args=$(json2args "$server_params")
|
||||
|
||||
server_command="lmdeploy serve api_server $model \
|
||||
--tp $tp \
|
||||
--server-port $port \
|
||||
$server_args"
|
||||
|
||||
# run the server
|
||||
echo "Server command: $server_command"
|
||||
bash -c "$server_command" &
|
||||
}
|
||||
|
||||
launch_sglang_server() {
|
||||
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
server_args=$(json2args "$server_params")
|
||||
|
||||
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
||||
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
|
||||
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
|
||||
server_command="python3 \
|
||||
-m sglang.launch_server \
|
||||
--tp $tp \
|
||||
--model-path $model \
|
||||
--port $port \
|
||||
$server_args"
|
||||
else
|
||||
echo "Key 'fp8' does not exist in common params."
|
||||
server_command="python3 \
|
||||
-m sglang.launch_server \
|
||||
--tp $tp \
|
||||
--model-path $model \
|
||||
--port $port \
|
||||
$server_args"
|
||||
fi
|
||||
|
||||
# run the server
|
||||
echo "Server command: $server_command"
|
||||
eval "$server_command" &
|
||||
}
|
||||
|
||||
launch_vllm_server() {
|
||||
|
||||
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
|
||||
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
server_args=$(json2args "$server_params")
|
||||
|
||||
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
||||
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
|
||||
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
|
||||
server_command="python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
-tp $tp \
|
||||
--model $model \
|
||||
--port $port \
|
||||
$server_args"
|
||||
else
|
||||
echo "Key 'fp8' does not exist in common params."
|
||||
server_command="python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
-tp $tp \
|
||||
--model $model \
|
||||
--port $port \
|
||||
$server_args"
|
||||
fi
|
||||
|
||||
# run the server
|
||||
echo "Server command: $server_command"
|
||||
eval "$server_command" &
|
||||
}
|
||||
|
||||
main() {
|
||||
|
||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then
|
||||
launch_trt_server
|
||||
fi
|
||||
|
||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then
|
||||
launch_tgi_server
|
||||
fi
|
||||
|
||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
|
||||
launch_lmdeploy_server
|
||||
fi
|
||||
|
||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then
|
||||
launch_sglang_server
|
||||
fi
|
||||
|
||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == *"vllm"* ]]; then
|
||||
launch_vllm_server
|
||||
fi
|
||||
}
|
||||
|
||||
main
|
||||
@ -1,102 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
|
||||
server_params=$1
|
||||
common_params=$2
|
||||
|
||||
|
||||
|
||||
model_path=$(echo "$common_params" | jq -r '.model')
|
||||
model_name="${model_path#*/}"
|
||||
model_type=$(echo "$server_params" | jq -r '.model_type')
|
||||
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
|
||||
model_tp_size=$(echo "$common_params" | jq -r '.tp')
|
||||
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
|
||||
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
|
||||
max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
|
||||
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
|
||||
|
||||
cd ~
|
||||
rm -rf models
|
||||
mkdir -p models
|
||||
cd models
|
||||
models_dir=$(pwd)
|
||||
trt_model_path=${models_dir}/${model_name}-trt-ckpt
|
||||
trt_engine_path=${models_dir}/${model_name}-trt-engine
|
||||
|
||||
cd ~
|
||||
rm -rf tensorrt-demo
|
||||
git clone https://github.com/neuralmagic/tensorrt-demo.git
|
||||
cd tensorrt-demo
|
||||
tensorrt_demo_dir=$(pwd)
|
||||
|
||||
# make sure the parameter inside tensorrt_demo is consistent to envvar
|
||||
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
|
||||
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
|
||||
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
|
||||
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
|
||||
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
|
||||
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
|
||||
|
||||
|
||||
cd /
|
||||
rm -rf tensorrtllm_backend
|
||||
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
|
||||
git lfs install
|
||||
cd tensorrtllm_backend
|
||||
git checkout $trt_llm_version
|
||||
tensorrtllm_backend_dir=$(pwd)
|
||||
git submodule update --init --recursive
|
||||
cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
|
||||
|
||||
cd /tensorrtllm_backend
|
||||
cd ./tensorrt_llm/examples/${model_type}
|
||||
|
||||
|
||||
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
|
||||
|
||||
echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py"
|
||||
echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
|
||||
python ../quantization/quantize.py \
|
||||
--model_dir ${model_path} \
|
||||
--dtype ${model_dtype} \
|
||||
--tp_size ${model_tp_size} \
|
||||
--output_dir ${trt_model_path} \
|
||||
--qformat fp8 \
|
||||
--kv_cache_dtype fp8 \
|
||||
--calib_size 2
|
||||
|
||||
else
|
||||
|
||||
echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
|
||||
python3 convert_checkpoint.py \
|
||||
--model_dir ${model_path} \
|
||||
--dtype ${model_dtype} \
|
||||
--tp_size ${model_tp_size} \
|
||||
--output_dir ${trt_model_path}
|
||||
|
||||
fi
|
||||
|
||||
|
||||
|
||||
trtllm-build \
|
||||
--checkpoint_dir=${trt_model_path} \
|
||||
--gpt_attention_plugin=${model_dtype} \
|
||||
--gemm_plugin=${model_dtype} \
|
||||
--remove_input_padding=enable \
|
||||
--paged_kv_cache=enable \
|
||||
--tp_size=${model_tp_size} \
|
||||
--max_batch_size=${max_batch_size} \
|
||||
--max_input_len=${max_input_len} \
|
||||
--max_output_len=${max_output_len} \
|
||||
--max_num_tokens=${max_output_len} \
|
||||
--opt_num_tokens=${max_output_len} \
|
||||
--output_dir=${trt_engine_path}
|
||||
|
||||
cd /tensorrtllm_backend/triton_model_repo
|
||||
rm -rf ./tensorrt_llm/1/*
|
||||
cp -r ${trt_engine_path}/* ./tensorrt_llm/1
|
||||
cd /tensorrtllm_backend
|
||||
python3 scripts/launch_triton_server.py \
|
||||
--world_size=${model_tp_size} \
|
||||
--model_repo=/tensorrtllm_backend/triton_model_repo &
|
||||
@ -8,6 +8,7 @@ main() {
|
||||
|
||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||
(which jq) || (apt-get update && apt-get -y install jq)
|
||||
(which zip) || (apt-get install -y zip)
|
||||
|
||||
if [ ! -f /workspace/buildkite-agent ]; then
|
||||
echo "buildkite-agent binary not found. Skip plotting the results."
|
||||
@ -15,26 +16,63 @@ main() {
|
||||
fi
|
||||
|
||||
# initial annotation
|
||||
description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
|
||||
#description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
|
||||
|
||||
# download results
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
mkdir -p results/
|
||||
/workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
|
||||
ls
|
||||
ls results/
|
||||
|
||||
# generate figures
|
||||
python3 -m pip install tabulate pandas matplotlib
|
||||
python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
|
||||
--description $description \
|
||||
--results-folder results/
|
||||
# upload benchmark results
|
||||
zip -r results.zip results/
|
||||
/workspace/buildkite-agent artifact upload "results.zip"
|
||||
|
||||
# upload benchmarking scripts
|
||||
cd "$VLLM_SOURCE_CODE_LOC/"
|
||||
zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
|
||||
/workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
|
||||
|
||||
cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
|
||||
# upload benchmarking pipeline
|
||||
/workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
|
||||
|
||||
cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
|
||||
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
|
||||
|
||||
# upload results and figures
|
||||
/workspace/buildkite-agent artifact upload "nightly_results.png"
|
||||
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
|
||||
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
|
||||
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
|
||||
|
||||
|
||||
# The figures should be genereated by a separate process outside the CI/CD pipeline
|
||||
|
||||
# # generate figures
|
||||
# python3 -m pip install tabulate pandas matplotlib
|
||||
|
||||
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
|
||||
# --description $description \
|
||||
# --results-folder results/
|
||||
|
||||
|
||||
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
|
||||
# --description $description \
|
||||
# --results-folder results/ \
|
||||
# --dataset sharegpt
|
||||
|
||||
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
|
||||
# --description $description \
|
||||
# --results-folder results/ \
|
||||
# --dataset sonnet_2048_128
|
||||
|
||||
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
|
||||
# --description $description \
|
||||
# --results-folder results/ \
|
||||
# --dataset sonnet_128_2048
|
||||
|
||||
# # upload results and figures
|
||||
# /workspace/buildkite-agent artifact upload "nightly_results*.png"
|
||||
# /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
|
||||
# /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
|
||||
# /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
|
||||
}
|
||||
|
||||
main "$@"
|
||||
main "$@"
|
||||
|
||||
@ -1,135 +0,0 @@
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
from pathlib import Path
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
from tabulate import tabulate
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(
|
||||
description=
|
||||
'Parse command line arguments for summary-nightly-results script.')
|
||||
parser.add_argument('--results-folder',
|
||||
type=str,
|
||||
required=True,
|
||||
help='The folder where the results are stored.')
|
||||
parser.add_argument('--description',
|
||||
type=str,
|
||||
required=True,
|
||||
help='Description of the results.')
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def main(args):
|
||||
bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00']
|
||||
results_folder = Path(args.results_folder)
|
||||
|
||||
results = []
|
||||
|
||||
# collect results
|
||||
for test_file in results_folder.glob("*_nightly_results.json"):
|
||||
with open(test_file, "r") as f:
|
||||
results = results + json.loads(f.read())
|
||||
|
||||
# generate markdown table
|
||||
df = pd.DataFrame.from_dict(results)
|
||||
|
||||
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
|
||||
|
||||
with open(args.description, "r") as f:
|
||||
description = f.read()
|
||||
|
||||
description = description.format(
|
||||
nightly_results_benchmarking_table=md_table)
|
||||
|
||||
with open("nightly_results.md", "w") as f:
|
||||
f.write(description)
|
||||
|
||||
plt.rcParams.update({'font.size': 20})
|
||||
|
||||
# plot results
|
||||
fig, axes = plt.subplots(3, 3, figsize=(16, 14))
|
||||
fig.subplots_adjust(hspace=1)
|
||||
methods = ["vllm", "trt", "lmdeploy", "tgi"]
|
||||
for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
|
||||
for j, metric in enumerate(["TTFT", "ITL"]):
|
||||
means, stds = [], []
|
||||
for method in methods:
|
||||
target = df['Test name'].str.contains(model)
|
||||
target = target & df['Engine'].str.contains(method)
|
||||
filtered_df = df[target]
|
||||
|
||||
if filtered_df.empty:
|
||||
means.append(0.)
|
||||
stds.append(0.)
|
||||
else:
|
||||
means.append(filtered_df[f"Mean {metric} (ms)"].values[0])
|
||||
std = filtered_df[f"Std {metric} (ms)"].values[0]
|
||||
success = filtered_df["Successful req."].values[0]
|
||||
stds.append(std / math.sqrt(success))
|
||||
|
||||
print(model, metric)
|
||||
print(means, stds)
|
||||
|
||||
ax = axes[i, j + 1]
|
||||
|
||||
bars = ax.bar(
|
||||
["vllm", "trt", "lmdeploy", "tgi"],
|
||||
means,
|
||||
yerr=stds,
|
||||
capsize=10,
|
||||
)
|
||||
for idx, bar in enumerate(bars):
|
||||
bar.set_color(bar_colors[idx])
|
||||
ax.set_ylim(bottom=0)
|
||||
|
||||
ax.set_ylabel(f"{metric} (ms)")
|
||||
ax.set_title(f"{model} {metric}")
|
||||
ax.grid(axis='y')
|
||||
|
||||
metric = "Tput"
|
||||
j = 0
|
||||
if True:
|
||||
tputs = []
|
||||
for method in methods:
|
||||
target = df['Test name'].str.contains(model)
|
||||
target = target & df['Engine'].str.contains(method)
|
||||
filtered_df = df[target]
|
||||
|
||||
if filtered_df.empty:
|
||||
tputs.append(0.)
|
||||
else:
|
||||
input_tput = filtered_df["Input Tput (tok/s)"].values[0]
|
||||
output_tput = filtered_df["Output Tput (tok/s)"].values[0]
|
||||
tputs.append(input_tput + output_tput)
|
||||
|
||||
print(model, metric)
|
||||
print(tputs)
|
||||
|
||||
ax = axes[i, j]
|
||||
|
||||
bars = ax.bar(
|
||||
["vllm", "trt", "lmdeploy", "tgi"],
|
||||
tputs,
|
||||
)
|
||||
for idx, bar in enumerate(bars):
|
||||
bar.set_color(bar_colors[idx])
|
||||
|
||||
ax.set_ylim(bottom=0)
|
||||
|
||||
ax.set_ylabel("Tput (token/s)")
|
||||
ax.set_title(f"{model} {metric}")
|
||||
ax.grid(axis='y')
|
||||
|
||||
fig.tight_layout()
|
||||
fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_arguments()
|
||||
main(args)
|
||||
@ -1,218 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -o pipefail
|
||||
|
||||
check_gpus() {
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
kill_gpu_processes() {
|
||||
pkill lmdeploy || true
|
||||
# waiting for GPU processes to be fully killed
|
||||
sleep 10
|
||||
# Print the GPU memory usage
|
||||
# so that we know if all GPU processes are killed.
|
||||
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
|
||||
# The memory usage should be 0 MB.
|
||||
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
|
||||
}
|
||||
|
||||
json2args() {
|
||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
||||
# example:
|
||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
||||
local json_string=$1
|
||||
local args=$(
|
||||
echo "$json_string" | jq -r '
|
||||
to_entries |
|
||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
||||
join(" ")
|
||||
'
|
||||
)
|
||||
echo "$args"
|
||||
}
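As a standalone sanity check for json2args, a small sketch assuming bash and jq are installed and the function above has been sourced:

    # Input taken from the comment above; expected output:
    #   --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
    params='{"model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1}'
    json2args "$params"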
|
||||
|
||||
wait_for_server() {
|
||||
# wait for vllm server to start
|
||||
# return 1 if vllm server crashes
|
||||
timeout 1200 bash -c '
|
||||
until curl -s localhost:8000/v1/completions > /dev/null; do
|
||||
sleep 1
|
||||
done' && return 0 || return 1
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
# run serving tests using `benchmark_serving.py`
|
||||
# $1: a json file specifying serving test cases
|
||||
|
||||
local serving_test_file
|
||||
serving_test_file=$1
|
||||
|
||||
# Iterate over serving tests
|
||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# append lmdeploy to the test name
|
||||
test_name=lmdeploy_$test_name
|
||||
|
||||
# get common parameters
|
||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||
|
||||
|
||||
|
||||
# get client and server arguments
|
||||
server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
|
||||
client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
|
||||
server_args=$(json2args "$server_params")
|
||||
client_args=$(json2args "$client_params")
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# prepare tokenizer
|
||||
rm -rf /tokenizer_cache
|
||||
mkdir /tokenizer_cache
|
||||
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
|
||||
--model "$model" \
|
||||
--cachedir /tokenizer_cache
|
||||
|
||||
server_command="lmdeploy serve api_server $model \
|
||||
--tp $tp \
|
||||
--server-port $port \
|
||||
$server_args"
|
||||
|
||||
# run the server
|
||||
echo "Running test case $test_name"
|
||||
echo "Server command: $server_command"
|
||||
bash -c "$server_command" &
|
||||
|
||||
# wait until the server is alive
|
||||
wait_for_server
|
||||
if [ $? -eq 0 ]; then
|
||||
echo ""
|
||||
echo "lmdeploy server is up and running."
|
||||
else
|
||||
echo ""
|
||||
echo "lmdeploy failed to start within the timeout period."
|
||||
break
|
||||
fi
|
||||
|
||||
# get model name
|
||||
model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
|
||||
|
||||
# iterate over different QPS
|
||||
for qps in $qps_list; do
|
||||
# remove the surrounding single quote from qps
|
||||
if [[ "$qps" == *"inf"* ]]; then
|
||||
echo "qps was $qps"
|
||||
qps="inf"
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
|
||||
client_command="python3 benchmark_serving.py \
|
||||
--backend lmdeploy \
|
||||
--tokenizer /tokenizer_cache \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--num-prompts $num_prompts \
|
||||
--port $port \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
--model \"$model_name\" \
|
||||
$client_args"
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
|
||||
eval "$client_command"
|
||||
|
||||
# record the benchmarking commands
|
||||
jq_output=$(jq -n \
|
||||
--arg server "$server_command" \
|
||||
--arg client "$client_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
--arg engine "lmdeploy" \
|
||||
'{
|
||||
server_command: $server,
|
||||
client_command: $client,
|
||||
gpu_type: $gpu,
|
||||
engine: $engine
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
done
|
||||
|
||||
# clean up
|
||||
kill_gpu_processes
|
||||
rm -rf /root/.cache/huggingface/*
|
||||
done
|
||||
}
|
||||
|
||||
|
||||
upload_to_buildkite() {
|
||||
# upload the benchmarking results to buildkite
|
||||
|
||||
# if the agent binary is not found, skip uploading the results, exit 0
|
||||
if [ ! -f /workspace/buildkite-agent ]; then
|
||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
||||
return 0
|
||||
fi
|
||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
|
||||
main() {
|
||||
|
||||
check_gpus
|
||||
# enter vllm directory
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
|
||||
declare -g RESULTS_FOLDER=results/
|
||||
mkdir -p $RESULTS_FOLDER
|
||||
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
||||
|
||||
python -m pip install transformers==4.41.2
|
||||
|
||||
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
|
||||
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
||||
python -m pip install tabulate pandas
|
||||
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
||||
upload_to_buildkite
|
||||
|
||||
}
|
||||
|
||||
main "$@"
|
||||
355 .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh Normal file
@@ -0,0 +1,355 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -o pipefail
|
||||
set -x
|
||||
|
||||
check_gpus() {
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
check_hf_token() {
|
||||
# check if HF_TOKEN is available and valid
|
||||
if [[ -z "$HF_TOKEN" ]]; then
|
||||
echo "Error: HF_TOKEN is not set."
|
||||
exit 1
|
||||
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
|
||||
echo "Error: HF_TOKEN does not start with 'hf_'."
|
||||
exit 1
|
||||
else
|
||||
echo "HF_TOKEN is set and valid."
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
upload_to_buildkite() {
|
||||
# upload the benchmarking results to buildkite
|
||||
|
||||
# if the agent binary is not found, skip uploading the results, exit 0
|
||||
if [ ! -f /workspace/buildkite-agent ]; then
|
||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
||||
return 0
|
||||
fi
|
||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
|
||||
get_current_llm_serving_engine() {
|
||||
|
||||
if which lmdeploy >/dev/null; then
|
||||
echo "Container: lmdeploy"
|
||||
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
|
||||
return
|
||||
fi
|
||||
|
||||
if [ -e /tgi-entrypoint.sh ]; then
|
||||
echo "Container: tgi"
|
||||
export CURRENT_LLM_SERVING_ENGINE=tgi
|
||||
return
|
||||
fi
|
||||
|
||||
if which trtllm-build >/dev/null; then
|
||||
echo "Container: tensorrt-llm"
|
||||
export CURRENT_LLM_SERVING_ENGINE=trt
|
||||
return
|
||||
fi
|
||||
|
||||
if [ -e /sgl-workspace ]; then
|
||||
echo "Container: sglang"
|
||||
export CURRENT_LLM_SERVING_ENGINE=sglang
|
||||
return
|
||||
fi
|
||||
|
||||
if [ -e /vllm-workspace ]; then
|
||||
echo "Container: vllm"
|
||||
# move to a completely irrelevant directory, to avoid import vllm from current folder
|
||||
export CURRENT_LLM_SERVING_ENGINE=vllm
|
||||
|
||||
return
|
||||
fi
|
||||
}
|
||||
|
||||
json2args() {
|
||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
||||
# example:
|
||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
||||
local json_string=$1
|
||||
local args=$(
|
||||
echo "$json_string" | jq -r '
|
||||
to_entries |
|
||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
||||
join(" ")
|
||||
'
|
||||
)
|
||||
echo "$args"
|
||||
}
|
||||
|
||||
kill_gpu_processes() {
|
||||
pkill -f python
|
||||
pkill -f python3
|
||||
pkill -f tritonserver
|
||||
pkill -f pt_main_thread
|
||||
pkill -f text-generation
|
||||
pkill -f lmdeploy
|
||||
|
||||
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||
sleep 1
|
||||
done
|
||||
}
|
||||
|
||||
wait_for_server() {
|
||||
# wait for vllm server to start
|
||||
# return 1 if vllm server crashes
|
||||
timeout 1200 bash -c '
|
||||
until curl -s localhost:8000/v1/completions > /dev/null; do
|
||||
sleep 1
|
||||
done' && return 0 || return 1
|
||||
}
|
||||
|
||||
ensure_installed() {
|
||||
# Ensure that the given command is installed by apt-get
|
||||
local cmd=$1
|
||||
if ! which "$cmd" >/dev/null; then
|
||||
apt-get update && apt-get install -y "$cmd"
|
||||
fi
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
# run serving tests using `benchmark_serving.py`
|
||||
# $1: a json file specifying serving test cases
|
||||
|
||||
local serving_test_file
|
||||
serving_test_file=$1
|
||||
|
||||
# Iterate over serving tests
|
||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# prepend the current serving engine to the test name
|
||||
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
|
||||
|
||||
# get common parameters
|
||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||
reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
|
||||
|
||||
# get client and server arguments
|
||||
server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
|
||||
client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
|
||||
client_args=$(json2args "$client_params")
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
if [[ $reuse_server == "true" ]]; then
|
||||
echo "Reuse previous server for test case $test_name"
|
||||
else
|
||||
kill_gpu_processes
|
||||
bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
|
||||
"$server_params" "$common_params"
|
||||
fi
|
||||
|
||||
if wait_for_server; then
|
||||
echo ""
|
||||
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
|
||||
else
|
||||
echo ""
|
||||
echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
|
||||
break
|
||||
fi
|
||||
|
||||
# prepare tokenizer
|
||||
# this is required for lmdeploy.
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
rm -rf /tokenizer_cache
|
||||
mkdir /tokenizer_cache
|
||||
python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
|
||||
--model "$model" \
|
||||
--cachedir /tokenizer_cache
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
|
||||
|
||||
# change model name for lmdeploy (it will not follow standard hf name)
|
||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
|
||||
model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
|
||||
fi
|
||||
|
||||
# iterate over different QPS
|
||||
for qps in $qps_list; do
|
||||
# remove the surrounding single quote from qps
|
||||
if [[ "$qps" == *"inf"* ]]; then
|
||||
echo "qps was $qps"
|
||||
qps="inf"
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
|
||||
backend=$CURRENT_LLM_SERVING_ENGINE
|
||||
|
||||
if [[ $backend = "trt" ]]; then
|
||||
backend="tensorrt-llm"
|
||||
fi
|
||||
|
||||
if [[ "$backend" == *"vllm"* ]]; then
|
||||
backend="vllm"
|
||||
fi
|
||||
|
||||
if [[ "$dataset_name" = "sharegpt" ]]; then
|
||||
|
||||
client_command="python3 benchmark_serving.py \
|
||||
--backend $backend \
|
||||
--tokenizer /tokenizer_cache \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--num-prompts $num_prompts \
|
||||
--port $port \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
--ignore-eos \
|
||||
$client_args"
|
||||
|
||||
elif [[ "$dataset_name" = "sonnet" ]]; then
|
||||
|
||||
sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
|
||||
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
|
||||
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
|
||||
|
||||
client_command="python3 benchmark_serving.py \
|
||||
--backend $backend \
|
||||
--tokenizer /tokenizer_cache \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--num-prompts $num_prompts \
|
||||
--sonnet-input-len $sonnet_input_len \
|
||||
--sonnet-output-len $sonnet_output_len \
|
||||
--sonnet-prefix-len $sonnet_prefix_len \
|
||||
--port $port \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
--ignore-eos \
|
||||
$client_args"
|
||||
|
||||
else
|
||||
|
||||
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
|
||||
exit 1
|
||||
|
||||
fi
|
||||
|
||||
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
|
||||
eval "$client_command"
|
||||
|
||||
server_command="None"
|
||||
|
||||
# record the benchmarking commands
|
||||
jq_output=$(jq -n \
|
||||
--arg server "$server_command" \
|
||||
--arg client "$client_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
--arg engine "$CURRENT_LLM_SERVING_ENGINE" \
|
||||
'{
|
||||
server_command: $server,
|
||||
client_command: $client,
|
||||
gpu_type: $gpu,
|
||||
engine: $engine
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
done
|
||||
|
||||
done
|
||||
|
||||
kill_gpu_processes
|
||||
}
|
||||
|
||||
|
||||
prepare_dataset() {
|
||||
|
||||
# download sharegpt dataset
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
|
||||
# duplicate sonnet by 4x, to allow benchmarking with input length 2048
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
echo "" > sonnet_4x.txt
|
||||
for _ in {1..4}
|
||||
do
|
||||
cat sonnet.txt >> sonnet_4x.txt
|
||||
done
|
||||
|
||||
}
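An optional check (a sketch, not part of the script) to confirm the 4x duplication above produced enough text for the long sonnet prompts:

    # sonnet_4x.txt should hold roughly four times the lines of sonnet.txt
    wc -l sonnet.txt sonnet_4x.txt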
|
||||
|
||||
main() {
|
||||
|
||||
# check if the environment variable is successfully injected from yaml
|
||||
|
||||
check_gpus
|
||||
check_hf_token
|
||||
get_current_llm_serving_engine
|
||||
|
||||
pip install -U transformers
|
||||
|
||||
# check storage
|
||||
df -h
|
||||
|
||||
ensure_installed wget
|
||||
ensure_installed curl
|
||||
ensure_installed jq
|
||||
|
||||
prepare_dataset
|
||||
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
declare -g RESULTS_FOLDER=results/
|
||||
mkdir -p $RESULTS_FOLDER
|
||||
BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
|
||||
|
||||
# run the test
|
||||
run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
|
||||
|
||||
# upload benchmark results to buildkite
|
||||
python3 -m pip install tabulate pandas
|
||||
python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
|
||||
upload_to_buildkite
|
||||
|
||||
}
|
||||
|
||||
main "$@"
|
||||
@@ -17,7 +17,7 @@ check_gpus() {
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
||||
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
@@ -93,7 +93,7 @@ kill_gpu_processes() {
|
||||
|
||||
|
||||
# wait until GPU memory usage smaller than 1GB
|
||||
while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
|
||||
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||
sleep 1
|
||||
done
|
||||
|
||||
@@ -117,7 +117,7 @@ upload_to_buildkite() {
|
||||
fi
|
||||
|
||||
# Use the determined command to annotate and upload artifacts
|
||||
$BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md
|
||||
$BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md"
|
||||
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
@@ -150,7 +150,7 @@ run_latency_tests() {
|
||||
# check if there is enough GPU to run the test
|
||||
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
@@ -206,9 +206,9 @@ run_throughput_tests() {
|
||||
throughput_args=$(json2args "$throughput_params")
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
tp=$(echo $throughput_params | jq -r '.tensor_parallel_size')
|
||||
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
@@ -270,7 +270,7 @@ run_serving_tests() {
|
||||
# check if there is enough GPU to run the test
|
||||
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
@@ -278,7 +278,7 @@ run_serving_tests() {
|
||||
server_model=$(echo "$server_params" | jq -r '.model')
|
||||
client_model=$(echo "$client_params" | jq -r '.model')
|
||||
if [[ $server_model != "$client_model" ]]; then
|
||||
echo "Server model and client model must be the same. Skip testcase $testname."
|
||||
echo "Server model and client model must be the same. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
@@ -293,8 +293,7 @@ run_serving_tests() {
|
||||
server_pid=$!
|
||||
|
||||
# wait until the server is alive
|
||||
wait_for_server
|
||||
if [ $? -eq 0 ]; then
|
||||
if wait_for_server; then
|
||||
echo ""
|
||||
echo "vllm server is up and running."
|
||||
else
|
||||
|
||||
@@ -1,216 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -o pipefail
|
||||
|
||||
check_gpus() {
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
kill_gpu_processes() {
|
||||
pkill text-generation || true
|
||||
# waiting for GPU processes to be fully killed
|
||||
sleep 10
|
||||
# Print the GPU memory usage
|
||||
# so that we know if all GPU processes are killed.
|
||||
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
|
||||
# The memory usage should be 0 MB.
|
||||
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
|
||||
}
|
||||
|
||||
json2args() {
|
||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
||||
# example:
|
||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
||||
local json_string=$1
|
||||
local args=$(
|
||||
echo "$json_string" | jq -r '
|
||||
to_entries |
|
||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
||||
join(" ")
|
||||
'
|
||||
)
|
||||
echo "$args"
|
||||
}
|
||||
|
||||
wait_for_server() {
|
||||
timeout 1200 bash -c '
|
||||
until curl -s localhost:8000/generate_stream > /dev/null; do
|
||||
sleep 1
|
||||
done' && return 0 || return 1
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
# run serving tests using `benchmark_serving.py`
|
||||
# $1: a json file specifying serving test cases
|
||||
|
||||
local serving_test_file
|
||||
serving_test_file=$1
|
||||
|
||||
# Iterate over serving tests
|
||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# append tgi to the test name
|
||||
test_name=tgi_$test_name
|
||||
|
||||
# get common parameters
|
||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||
|
||||
# get client and server arguments
|
||||
server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
|
||||
client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
|
||||
server_args=$(json2args "$server_params")
|
||||
client_args=$(json2args "$client_params")
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
|
||||
echo "Key 'fp8' exists in common params."
|
||||
server_command="/tgi-entrypoint.sh \
|
||||
--model-id $model \
|
||||
--num-shard $tp \
|
||||
--port $port \
|
||||
--quantize fp8 \
|
||||
$server_args"
|
||||
else
|
||||
echo "Key 'fp8' does not exist in common params."
|
||||
server_command="/tgi-entrypoint.sh \
|
||||
--model-id $model \
|
||||
--num-shard $tp \
|
||||
--port $port \
|
||||
$server_args"
|
||||
fi
|
||||
|
||||
|
||||
|
||||
|
||||
# run the server
|
||||
echo "Running test case $test_name"
|
||||
echo "Server command: $server_command"
|
||||
eval "$server_command" &
|
||||
|
||||
# wait until the server is alive
|
||||
wait_for_server
|
||||
if [ $? -eq 0 ]; then
|
||||
echo ""
|
||||
echo "tgi server is up and running."
|
||||
else
|
||||
echo ""
|
||||
echo "tgi failed to start within the timeout period."
|
||||
break
|
||||
fi
|
||||
|
||||
# iterate over different QPS
|
||||
for qps in $qps_list; do
|
||||
# remove the surrounding single quote from qps
|
||||
if [[ "$qps" == *"inf"* ]]; then
|
||||
echo "qps was $qps"
|
||||
qps="inf"
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
|
||||
client_command="python3 benchmark_serving.py \
|
||||
--backend tgi \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--num-prompts $num_prompts \
|
||||
--port $port \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
$client_args"
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
|
||||
eval "$client_command"
|
||||
|
||||
# record the benchmarking commands
|
||||
jq_output=$(jq -n \
|
||||
--arg server "$server_command" \
|
||||
--arg client "$client_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
--arg engine "tgi" \
|
||||
'{
|
||||
server_command: $server,
|
||||
client_command: $client,
|
||||
gpu_type: $gpu,
|
||||
engine: $engine
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
done
|
||||
|
||||
# clean up
|
||||
kill_gpu_processes
|
||||
rm -rf /root/.cache/huggingface/*
|
||||
done
|
||||
}
|
||||
|
||||
|
||||
|
||||
upload_to_buildkite() {
|
||||
# upload the benchmarking results to buildkite
|
||||
|
||||
# if the agent binary is not found, skip uploading the results, exit 0
|
||||
if [ ! -f /workspace/buildkite-agent ]; then
|
||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
||||
return 0
|
||||
fi
|
||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
main() {
|
||||
|
||||
check_gpus
|
||||
# enter vllm directory
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
declare -g RESULTS_FOLDER=results/
|
||||
mkdir -p $RESULTS_FOLDER
|
||||
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
||||
|
||||
export CURRENT_LLM_SERVING_ENGINE=tgi
|
||||
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
||||
python -m pip install tabulate pandas
|
||||
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
||||
upload_to_buildkite
|
||||
|
||||
}
|
||||
|
||||
main "$@"
|
||||
@@ -1,214 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -o pipefail
|
||||
|
||||
check_gpus() {
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
kill_gpu_processes() {
|
||||
pkill tritonserver || true
|
||||
# waiting for GPU processes to be fully killed
|
||||
sleep 20
|
||||
# Print the GPU memory usage
|
||||
# so that we know if all GPU processes are killed.
|
||||
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
|
||||
# The memory usage should be 0 MB.
|
||||
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
|
||||
}
|
||||
|
||||
json2args() {
|
||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
||||
# example:
|
||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
||||
local json_string=$1
|
||||
local args=$(
|
||||
echo "$json_string" | jq -r '
|
||||
to_entries |
|
||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
||||
join(" ")
|
||||
'
|
||||
)
|
||||
echo "$args"
|
||||
}
|
||||
|
||||
wait_for_server() {
|
||||
timeout 1200 bash -c '
|
||||
until curl -s localhost:8000/generate_stream > /dev/null; do
|
||||
sleep 1
|
||||
done' && return 0 || return 1
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
# run serving tests using `benchmark_serving.py`
|
||||
# $1: a json file specifying serving test cases
|
||||
|
||||
local serving_test_file
|
||||
serving_test_file=$1
|
||||
|
||||
# Iterate over serving tests
|
||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# append trt to the test name
|
||||
test_name=trt_$test_name
|
||||
|
||||
# get common parameters
|
||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||
|
||||
# get client and server arguments
|
||||
server_params=$(echo "$params" | jq -r '.trt_server_parameters')
|
||||
client_params=$(echo "$params" | jq -r '.trt_client_parameters')
|
||||
client_args=$(json2args "$client_params")
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
|
||||
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
|
||||
|
||||
echo "Running test case $test_name"
|
||||
bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
|
||||
|
||||
# wait until the server is alive
|
||||
wait_for_server
|
||||
if [ $? -eq 0 ]; then
|
||||
echo ""
|
||||
echo "trt server is up and running."
|
||||
else
|
||||
echo ""
|
||||
echo "trt failed to start within the timeout period."
|
||||
break
|
||||
fi
|
||||
|
||||
# prepare tokenizer
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
rm -rf /tokenizer_cache
|
||||
mkdir /tokenizer_cache
|
||||
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
|
||||
--model "$model" \
|
||||
--cachedir /tokenizer_cache
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
|
||||
|
||||
# iterate over different QPS
|
||||
for qps in $qps_list; do
|
||||
# remove the surrounding single quote from qps
|
||||
if [[ "$qps" == *"inf"* ]]; then
|
||||
echo "qps was $qps"
|
||||
qps="inf"
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
|
||||
client_command="python3 benchmark_serving.py \
|
||||
--backend tensorrt-llm \
|
||||
--tokenizer /tokenizer_cache \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--num-prompts $num_prompts \
|
||||
--port $port \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
$client_args"
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
|
||||
eval "$client_command"
|
||||
|
||||
server_command=""
|
||||
# record the benchmarking commands
|
||||
jq_output=$(jq -n \
|
||||
--arg server "$server_command" \
|
||||
--arg client "$client_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
--arg engine "trt" \
|
||||
'{
|
||||
server_command: $server,
|
||||
client_command: $client,
|
||||
gpu_type: $gpu,
|
||||
engine: $engine
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
done
|
||||
|
||||
# clean up
|
||||
kill_gpu_processes
|
||||
rm -rf /root/.cache/huggingface/*
|
||||
done
|
||||
}
|
||||
|
||||
upload_to_buildkite() {
|
||||
# upload the benchmarking results to buildkite
|
||||
|
||||
# if the agent binary is not found, skip uploading the results, exit 0
|
||||
if [ ! -f /workspace/buildkite-agent ]; then
|
||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
||||
return 0
|
||||
fi
|
||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
|
||||
main() {
|
||||
|
||||
check_gpus
|
||||
|
||||
|
||||
# enter vllm directory
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
|
||||
declare -g RESULTS_FOLDER=results/
|
||||
mkdir -p $RESULTS_FOLDER
|
||||
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
||||
|
||||
# update transformers package, to make sure mixtral tokenizer is available
|
||||
python -m pip install transformers -U
|
||||
|
||||
export CURRENT_LLM_SERVING_ENGINE=trt
|
||||
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
||||
python -m pip install tabulate pandas
|
||||
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
||||
upload_to_buildkite
|
||||
|
||||
}
|
||||
|
||||
main "$@"
|
||||
@@ -1,221 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -o pipefail
|
||||
|
||||
check_gpus() {
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
kill_gpu_processes() {
|
||||
# kill all processes on GPU.
|
||||
pkill pt_main_thread
|
||||
sleep 10
|
||||
|
||||
# remove vllm config file
|
||||
rm -rf ~/.config/vllm
|
||||
|
||||
# Print the GPU memory usage
|
||||
# so that we know if all GPU processes are killed.
|
||||
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
|
||||
# The memory usage should be 0 MB.
|
||||
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
|
||||
}
|
||||
|
||||
json2args() {
|
||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
||||
# example:
|
||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
||||
local json_string=$1
|
||||
local args=$(
|
||||
echo "$json_string" | jq -r '
|
||||
to_entries |
|
||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
||||
join(" ")
|
||||
'
|
||||
)
|
||||
echo "$args"
|
||||
}
|
||||
|
||||
wait_for_server() {
|
||||
# wait for vllm server to start
|
||||
# return 1 if vllm server crashes
|
||||
timeout 1200 bash -c '
|
||||
until curl -s localhost:8000/v1/completions > /dev/null; do
|
||||
sleep 1
|
||||
done' && return 0 || return 1
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
# run serving tests using `benchmark_serving.py`
|
||||
# $1: a json file specifying serving test cases
|
||||
|
||||
local serving_test_file
|
||||
serving_test_file=$1
|
||||
|
||||
# Iterate over serving tests
|
||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# append vllm to the test name
|
||||
test_name=vllm_$test_name
|
||||
|
||||
|
||||
# get common parameters
|
||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||
|
||||
# get client and server arguments
|
||||
server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
|
||||
client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
|
||||
server_args=$(json2args "$server_params")
|
||||
client_args=$(json2args "$client_params")
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
|
||||
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
|
||||
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
|
||||
server_command="python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
-tp $tp \
|
||||
--model $model \
|
||||
--port $port \
|
||||
$server_args"
|
||||
else
|
||||
echo "Key 'fp8' does not exist in common params."
|
||||
server_command="python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
-tp $tp \
|
||||
--model $model \
|
||||
--port $port \
|
||||
$server_args"
|
||||
fi
|
||||
|
||||
# run the server
|
||||
echo "Running test case $test_name"
|
||||
echo "Server command: $server_command"
|
||||
eval "$server_command" &
|
||||
|
||||
# wait until the server is alive
|
||||
wait_for_server
|
||||
if [ $? -eq 0 ]; then
|
||||
echo ""
|
||||
echo "vllm server is up and running."
|
||||
else
|
||||
echo ""
|
||||
echo "vllm failed to start within the timeout period."
|
||||
break
|
||||
fi
|
||||
|
||||
# iterate over different QPS
|
||||
for qps in $qps_list; do
|
||||
# remove the surrounding single quote from qps
|
||||
if [[ "$qps" == *"inf"* ]]; then
|
||||
echo "qps was $qps"
|
||||
qps="inf"
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
|
||||
client_command="python3 benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--num-prompts $num_prompts \
|
||||
--port $port \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
$client_args"
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
|
||||
eval "$client_command"
|
||||
|
||||
# record the benchmarking commands
|
||||
jq_output=$(jq -n \
|
||||
--arg server "$server_command" \
|
||||
--arg client "$client_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
--arg engine "vllm" \
|
||||
'{
|
||||
server_command: $server,
|
||||
client_command: $client,
|
||||
gpu_type: $gpu,
|
||||
engine: $engine
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
done
|
||||
|
||||
# clean up
|
||||
kill_gpu_processes
|
||||
rm -rf /root/.cache/huggingface/*
|
||||
done
|
||||
}
|
||||
|
||||
|
||||
upload_to_buildkite() {
|
||||
# upload the benchmarking results to buildkite
|
||||
|
||||
# if the agent binary is not found, skip uploading the results, exit 0
|
||||
if [ ! -f /workspace/buildkite-agent ]; then
|
||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
||||
return 0
|
||||
fi
|
||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
main() {
|
||||
|
||||
check_gpus
|
||||
# enter vllm directory
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
declare -g RESULTS_FOLDER=results/
|
||||
mkdir -p $RESULTS_FOLDER
|
||||
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
||||
|
||||
export CURRENT_LLM_SERVING_ENGINE=vllm
|
||||
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
||||
|
||||
python3 -m pip install tabulate pandas
|
||||
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
||||
upload_to_buildkite
|
||||
|
||||
}
|
||||
|
||||
main "$@"
|
||||
@@ -17,10 +17,17 @@ serving_column_mapping = {
|
||||
"request_throughput": "Tput (req/s)",
|
||||
"mean_ttft_ms": "Mean TTFT (ms)",
|
||||
"std_ttft_ms": "Std TTFT (ms)",
|
||||
"median_ttft_ms": "Median TTFT (ms)",
|
||||
"mean_itl_ms": "Mean ITL (ms)",
|
||||
"std_itl_ms": "Std ITL (ms)",
|
||||
"input_throughput": "Input Tput (tok/s)",
|
||||
"median_itl_ms": "Median ITL (ms)",
|
||||
"mean_tpot_ms": "Mean TPOT (ms)",
|
||||
"std_tpot_ms": "Std TPOT (ms)",
|
||||
"median_tpot_ms": "Median TPOT (ms)",
|
||||
"total_token_throughput": "Total Token Tput (tok/s)",
|
||||
"output_throughput": "Output Tput (tok/s)",
|
||||
"total_input_tokens": "Total input tokens",
|
||||
"total_output_tokens": "Total output tokens",
|
||||
"engine": "Engine",
|
||||
}
|
||||
|
||||
@ -29,11 +36,11 @@ if __name__ == "__main__":
|
||||
# collect results
|
||||
for test_file in results_folder.glob("*.json"):
|
||||
|
||||
with open(test_file, "r") as f:
|
||||
with open(test_file) as f:
|
||||
raw_result = json.loads(f.read())
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
with open(test_file.with_suffix(".commands"), "r") as f:
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
raw_result.update(command)
|
||||
|
||||
|
||||
@@ -2,9 +2,11 @@
|
||||
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
|
||||
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
|
||||
|
||||
TIMEOUT_SECONDS=10
|
||||
|
||||
retries=0
|
||||
while [ $retries -lt 1000 ]; do
|
||||
if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
|
||||
if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
@@ -14,4 +16,4 @@ while [ $retries -lt 1000 ]; do
|
||||
sleep 5
|
||||
done
|
||||
|
||||
exit 1
|
||||
exit 1
|
||||
|
||||
@@ -1,16 +1,18 @@
|
||||
[
|
||||
{
|
||||
"test_name": "llama8B_tp1",
|
||||
"qps_list": [4],
|
||||
"test_name": "llama8B_tp1_sharegpt",
|
||||
"qps_list": [4,8,16,32,"inf"],
|
||||
"common_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-8B",
|
||||
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
"tp": 1,
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 500,
|
||||
"port": 8000
|
||||
"port": 8000,
|
||||
"reuse_server": false
|
||||
},
|
||||
"lmdeploy_server_parameters": {
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"lmdeploy_client_parameters": {
|
||||
},
|
||||
@@ -21,34 +23,158 @@
|
||||
},
|
||||
"trt_server_parameters": {
|
||||
"model_type": "llama",
|
||||
"model_dtype": "float16",
|
||||
"max_batch_size": 256,
|
||||
"model_dtype": "bfloat16",
|
||||
"max_batch_size": 2048,
|
||||
"max_input_len": 4096,
|
||||
"max_output_len": 4096,
|
||||
"trt_llm_version": "r24.04"
|
||||
"max_seq_len": 6144,
|
||||
"max_num_tokens": 16384,
|
||||
"trt_llm_version": "v0.11.0"
|
||||
},
|
||||
"trt_client_parameters": {
|
||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||
},
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": ""
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"vllm_client_parameters": {
|
||||
},
|
||||
"sglang_server_parameters": {
|
||||
"disable_radix_cache": "",
|
||||
"enable_torch_compile": "",
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"sglang_client_parameters": {
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "llama70B_tp4",
|
||||
"qps_list": [2],
|
||||
"test_name": "llama8B_tp1_sonnet_512_16",
|
||||
"qps_list": [4,8,16,32,"inf"],
|
||||
"common_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
"tp": 1,
|
||||
"dataset_name": "sonnet",
|
||||
"dataset_path": "./sonnet_4x.txt",
|
||||
"num_prompts": 500,
|
||||
"port": 8000,
|
||||
"sonnet_input_len": 512,
|
||||
"sonnet_output_len": 16,
|
||||
"sonnet_prefix_len": 50,
|
||||
"reuse_server": true
|
||||
},
|
||||
"lmdeploy_server_parameters": {
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"lmdeploy_client_parameters": {
|
||||
},
|
||||
"tgi_server_parameters": {
|
||||
},
|
||||
"tgi_client_parameters": {
|
||||
"endpoint": "/generate_stream"
|
||||
},
|
||||
"trt_server_parameters": {
|
||||
"model_type": "llama",
|
||||
"model_dtype": "bfloat16",
|
||||
"max_batch_size": 2048,
|
||||
"max_input_len": 4096,
|
||||
"max_seq_len": 6144,
|
||||
"max_num_tokens": 16384,
|
||||
"trt_llm_version": "v0.11.0"
|
||||
},
|
||||
"trt_client_parameters": {
|
||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"vllm_client_parameters": {
|
||||
},
|
||||
"sglang_server_parameters": {
|
||||
"disable_radix_cache": "",
|
||||
"enable_torch_compile": "",
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"sglang_client_parameters": {
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "llama8B_tp1_sonnet_512_256",
|
||||
"qps_list": [4,8,16,32,"inf"],
|
||||
"common_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
"tp": 1,
|
||||
"dataset_name": "sonnet",
|
||||
"dataset_path": "./sonnet_4x.txt",
|
||||
"num_prompts": 500,
|
||||
"port": 8000,
|
||||
"sonnet_input_len": 512,
|
||||
"sonnet_output_len": 256,
|
||||
"sonnet_prefix_len": 50,
|
||||
"reuse_server": true
|
||||
},
|
||||
"lmdeploy_server_parameters": {
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"lmdeploy_client_parameters": {
|
||||
},
|
||||
"tgi_server_parameters": {
|
||||
},
|
||||
"tgi_client_parameters": {
|
||||
"endpoint": "/generate_stream"
|
||||
},
|
||||
"trt_server_parameters": {
|
||||
"model_type": "llama",
|
||||
"model_dtype": "bfloat16",
|
||||
"max_batch_size": 2048,
|
||||
"max_input_len": 4096,
|
||||
"max_seq_len": 6144,
|
||||
"max_num_tokens": 16384,
|
||||
"trt_llm_version": "v0.11.0"
|
||||
},
|
||||
"trt_client_parameters": {
|
||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"vllm_client_parameters": {
|
||||
},
|
||||
"sglang_server_parameters": {
|
||||
"disable_radix_cache": "",
|
||||
"enable_torch_compile": "",
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"sglang_client_parameters": {
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "llama70B_tp4_sharegpt",
|
||||
"qps_list": [4,8,16,32,"inf"],
|
||||
"common_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||
"tp": 4,
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 500,
|
||||
"port": 8000
|
||||
"port": 8000,
|
||||
"reuse_server": false
|
||||
},
|
||||
"lmdeploy_server_parameters": {
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"lmdeploy_client_parameters": {
|
||||
},
|
||||
@@ -59,34 +185,50 @@
|
||||
},
|
||||
"trt_server_parameters": {
|
||||
"model_type": "llama",
|
||||
"model_dtype": "float16",
|
||||
"max_batch_size": 256,
|
||||
"model_dtype": "bfloat16",
|
||||
"max_batch_size": 2048,
|
||||
"max_input_len": 4096,
|
||||
"max_output_len": 4096,
|
||||
"trt_llm_version": "r24.04"
|
||||
"max_seq_len": 6144,
|
||||
"max_num_tokens": 16384,
|
||||
"trt_llm_version": "v0.11.0"
|
||||
},
|
||||
"trt_client_parameters": {
|
||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||
},
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": ""
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"vllm_client_parameters": {
|
||||
},
|
||||
"sglang_server_parameters": {
|
||||
"disable_radix_cache": "",
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"sglang_client_parameters": {
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "mixtral8x7B_tp2",
|
||||
"qps_list": [2],
|
||||
"test_name": "llama70B_tp4_sonnet_512_16",
|
||||
"qps_list": [4,8,16,32,"inf"],
|
||||
"common_parameters": {
|
||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||
"tp": 2,
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||
"tp": 4,
|
||||
"dataset_name": "sonnet",
|
||||
"dataset_path": "./sonnet_4x.txt",
|
||||
"num_prompts": 500,
|
||||
"port": 8000
|
||||
"port": 8000,
|
||||
"sonnet_input_len": 512,
|
||||
"sonnet_output_len": 16,
|
||||
"sonnet_prefix_len": 50,
|
||||
"reuse_server": true
|
||||
},
|
||||
"lmdeploy_server_parameters": {
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"lmdeploy_client_parameters": {
|
||||
},
|
||||
@@ -97,20 +239,85 @@
|
||||
},
|
||||
"trt_server_parameters": {
|
||||
"model_type": "llama",
|
||||
"model_dtype": "float16",
|
||||
"max_batch_size": 256,
|
||||
"model_dtype": "bfloat16",
|
||||
"max_batch_size": 2048,
|
||||
"max_input_len": 4096,
|
||||
"max_output_len": 4096,
|
||||
"trt_llm_version": "r24.04"
|
||||
"max_seq_len": 6144,
|
||||
"max_num_tokens": 16384,
|
||||
"trt_llm_version": "v0.11.0"
|
||||
},
|
||||
"trt_client_parameters": {
|
||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||
},
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": ""
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"vllm_client_parameters": {
|
||||
},
|
||||
"sglang_server_parameters": {
|
||||
"disable_radix_cache": "",
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"sglang_client_parameters": {
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "llama70B_tp4_sonnet_512_256",
|
||||
"qps_list": [4,8,16,32,"inf"],
|
||||
"common_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||
"tp": 4,
|
||||
"dataset_name": "sonnet",
|
||||
"dataset_path": "./sonnet_4x.txt",
|
||||
"num_prompts": 500,
|
||||
"port": 8000,
|
||||
"sonnet_input_len": 512,
|
||||
"sonnet_output_len": 256,
|
||||
"sonnet_prefix_len": 50,
|
||||
"reuse_server": true
|
||||
},
|
||||
"lmdeploy_server_parameters": {
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"lmdeploy_client_parameters": {
|
||||
},
|
||||
"tgi_server_parameters": {
|
||||
},
|
||||
"tgi_client_parameters": {
|
||||
"endpoint": "/generate_stream"
|
||||
},
|
||||
"trt_server_parameters": {
|
||||
"model_type": "llama",
|
||||
"model_dtype": "bfloat16",
|
||||
"max_batch_size": 2048,
|
||||
"max_input_len": 4096,
|
||||
"max_seq_len": 6144,
|
||||
"max_num_tokens": 16384,
|
||||
"trt_llm_version": "v0.11.0"
|
||||
},
|
||||
"trt_client_parameters": {
|
||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"vllm_client_parameters": {
|
||||
},
|
||||
"sglang_server_parameters": {
|
||||
"disable_radix_cache": "",
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"sglang_client_parameters": {
|
||||
}
|
||||
}
|
||||
]
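The runner scripts above consume this file with jq; a hedged one-liner (field names taken from the entries above, file path assumed to be tests/nightly-tests.json) to preview what would be executed:

    # Print each test name together with its QPS sweep
    jq -r '.[] | "\(.test_name): qps=\(.qps_list | @json)"' tests/nightly-tests.json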
|
||||
@@ -3,30 +3,26 @@ steps:
|
||||
agents:
|
||||
queue: cpu_queue
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
# rename the files to change linux -> manylinux1
|
||||
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
|
||||
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
|
||||
- "bash .buildkite/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build CUDA 11.8 wheel"
|
||||
key: block-build-cu118-wheel
|
||||
|
||||
# Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
|
||||
# However, this block can be uncommented to save some compute hours.
|
||||
# - block: "Build CUDA 11.8 wheel"
|
||||
# key: block-build-cu118-wheel
|
||||
|
||||
- label: "Build wheel - CUDA 11.8"
|
||||
depends_on: block-build-cu118-wheel
|
||||
# depends_on: block-build-cu118-wheel
|
||||
agents:
|
||||
queue: cpu_queue
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
# rename the files to change linux -> manylinux1
|
||||
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
|
||||
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
|
||||
- "bash .buildkite/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script runs test inside the corresponding ROCm docker container.
|
||||
set -o pipefail
|
||||
|
||||
@ -31,8 +33,8 @@ cleanup_docker() {
|
||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||
# Remove dangling images (those that are not tagged and not used by any container)
|
||||
docker image prune -f
|
||||
# Remove unused volumes
|
||||
docker volume prune -f
|
||||
# Remove unused volumes / force the system prune for old images as well.
|
||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||
echo "Docker images and volumes cleanup completed."
|
||||
else
|
||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||
@ -57,17 +59,17 @@ done
|
||||
echo "--- Pulling container"
|
||||
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||
docker pull ${image_name}
|
||||
docker pull "${image_name}"
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f ${container_name} || docker image rm -f ${image_name} || true
|
||||
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
echo "--- Running container"
|
||||
|
||||
HF_CACHE="$(realpath ~)/huggingface"
|
||||
mkdir -p ${HF_CACHE}
|
||||
mkdir -p "${HF_CACHE}"
|
||||
HF_MOUNT="/root/.cache/huggingface"
|
||||
|
||||
commands=$@
|
||||
@ -83,6 +85,7 @@ if [[ $commands == *" kernels "* ]]; then
|
||||
--ignore=kernels/test_encoder_decoder_attn.py \
|
||||
--ignore=kernels/test_flash_attn.py \
|
||||
--ignore=kernels/test_flashinfer.py \
|
||||
--ignore=kernels/test_gguf.py \
|
||||
--ignore=kernels/test_int8_quant.py \
|
||||
--ignore=kernels/test_machete_gemm.py \
|
||||
--ignore=kernels/test_mamba_ssm.py \
|
||||
@ -93,38 +96,49 @@ if [[ $commands == *" kernels "* ]]; then
|
||||
--ignore=kernels/test_sampler.py"
|
||||
fi
|
||||
|
||||
#ignore certain Entrypoints tests
|
||||
if [[ $commands == *" entrypoints/openai "* ]]; then
|
||||
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
|
||||
--ignore=entrypoints/openai/test_accuracy.py \
|
||||
--ignore=entrypoints/openai/test_audio.py \
|
||||
--ignore=entrypoints/openai/test_encoder_decoder.py \
|
||||
--ignore=entrypoints/openai/test_embedding.py \
|
||||
--ignore=entrypoints/openai/test_oot_registration.py "}
|
||||
fi
|
||||
|
||||
PARALLEL_JOB_COUNT=8
|
||||
# check if the command contains shard flag, we will run all shards in parallel because the host has 8 GPUs.
|
||||
if [[ $commands == *"--shard-id="* ]]; then
|
||||
# assign job count as the number of shards used
|
||||
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
|
||||
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
|
||||
#replace shard arguments
|
||||
commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
|
||||
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
|
||||
echo "Shard ${GPU} commands:$commands"
|
||||
# assign shard-id for each shard
|
||||
commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
|
||||
echo "Shard ${GPU} commands:$commands_gpu"
|
||||
docker run \
|
||||
--device /dev/kfd --device /dev/dri \
|
||||
--network host \
|
||||
--shm-size=16gb \
|
||||
--rm \
|
||||
-e HIP_VISIBLE_DEVICES=${GPU} \
|
||||
-e HIP_VISIBLE_DEVICES="${GPU}" \
|
||||
-e HF_TOKEN \
|
||||
-v ${HF_CACHE}:${HF_MOUNT} \
|
||||
-e HF_HOME=${HF_MOUNT} \
|
||||
--name ${container_name}_${GPU} \
|
||||
${image_name} \
|
||||
/bin/bash -c "${commands}" \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
--name "${container_name}_${GPU}" \
|
||||
"${image_name}" \
|
||||
/bin/bash -c "${commands_gpu}" \
|
||||
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
|
||||
PIDS+=($!)
|
||||
done
|
||||
#wait for all processes to finish and collect exit codes
|
||||
for pid in ${PIDS[@]}; do
|
||||
wait ${pid}
|
||||
for pid in "${PIDS[@]}"; do
|
||||
wait "${pid}"
|
||||
STATUS+=($?)
|
||||
done
|
||||
for st in ${STATUS[@]}; do
|
||||
for st in "${STATUS[@]}"; do
|
||||
if [[ ${st} -ne 0 ]]; then
|
||||
echo "One of the processes failed with $st"
|
||||
exit ${st}
|
||||
exit "${st}"
|
||||
fi
|
||||
done
|
||||
else
|
||||
@ -135,9 +149,9 @@ else
|
||||
--rm \
|
||||
-e HIP_VISIBLE_DEVICES=0 \
|
||||
-e HF_TOKEN \
|
||||
-v ${HF_CACHE}:${HF_MOUNT} \
|
||||
-e HF_HOME=${HF_MOUNT} \
|
||||
--name ${container_name} \
|
||||
${image_name} \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
--name "${container_name}" \
|
||||
"${image_name}" \
|
||||
/bin/bash -c "${commands}"
|
||||
fi
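The shard-fanning logic in this script relies on the bash ${var//pattern/replacement} idiom to stamp a per-GPU --shard-id into an otherwise shared command string. A standalone sketch of that substitution pattern (values are made up for illustration; this is not the CI script itself):

#!/bin/bash
# Placeholder-substitution sketch with assumed values.
commands="pytest -v -s kernels --shard-id= --num-shards= "
PARALLEL_JOB_COUNT=8
for GPU in $(seq 0 $((PARALLEL_JOB_COUNT - 1))); do
  # ${var//pattern/replacement} rewrites every occurrence of the pattern,
  # so each worker gets its own shard id while the shard count stays fixed.
  commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
  commands_gpu=${commands_gpu//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
  echo "Shard ${GPU}: ${commands_gpu}"
done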
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script is run by buildkite to run the benchmarks and upload the results to buildkite
|
||||
|
||||
set -ex
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the CPU docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
@ -13,21 +15,38 @@ remove_docker_container
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
source /etc/environment
|
||||
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test
|
||||
|
||||
# Run basic model test
|
||||
docker exec cpu-test bash -c "
|
||||
pip install pytest matplotlib einops transformers_stream_generator
|
||||
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
|
||||
function cpu_tests() {
|
||||
set -e
|
||||
|
||||
# online inference
|
||||
docker exec cpu-test bash -c "
|
||||
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
|
||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||
python3 benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model facebook/opt-125m \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions \
|
||||
--tokenizer facebook/opt-125m"
|
||||
# Run basic model test
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
pip install pytest pytest-asyncio \
|
||||
decord einops librosa peft Pillow sentence-transformers soundfile \
|
||||
transformers_stream_generator matplotlib datamodel_code_generator
|
||||
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
|
||||
pytest -v -s tests/models/decoder_only/language -m cpu_model
|
||||
pytest -v -s tests/models/embedding/language -m cpu_model
|
||||
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
|
||||
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
|
||||
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
|
||||
|
||||
# online inference
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
|
||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||
python3 benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model facebook/opt-125m \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions \
|
||||
--tokenizer facebook/opt-125m"
|
||||
}
|
||||
|
||||
# All of CPU tests are expected to be finished less than 25 mins.
|
||||
export -f cpu_tests
|
||||
timeout 25m bash -c "cpu_tests"
|
||||
|
||||
@ -1,10 +1,16 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the CPU docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
|
||||
# allow to bind to different cores
|
||||
CORE_RANGE=${CORE_RANGE:-48-95}
|
||||
NUMA_NODE=${NUMA_NODE:-1}
|
||||
|
||||
# Try building the docker image
|
||||
numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu .
|
||||
numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
|
||||
@ -12,38 +18,61 @@ trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
|
||||
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
|
||||
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
||||
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
||||
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
|
||||
|
||||
# offline inference
|
||||
docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
|
||||
function cpu_tests() {
|
||||
set -e
|
||||
|
||||
# Run basic model test
|
||||
docker exec cpu-test bash -c "
|
||||
pip install pytest matplotlib einops transformers_stream_generator
|
||||
pytest -v -s tests/models/decoder_only/language \
|
||||
--ignore=tests/models/test_fp8.py \
|
||||
--ignore=tests/models/decoder_only/language/test_jamba.py \
|
||||
--ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
|
||||
# offline inference
|
||||
docker exec cpu-test-avx2 bash -c "
|
||||
set -e
|
||||
python3 examples/offline_inference.py"
|
||||
|
||||
# Run compressed-tensor test
|
||||
docker exec cpu-test bash -c "
|
||||
pytest -s -v \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
|
||||
# Run basic model test
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
pip install pytest pytest-asyncio \
|
||||
decord einops librosa peft Pillow sentence-transformers soundfile \
|
||||
transformers_stream_generator matplotlib datamodel_code_generator
|
||||
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
|
||||
pytest -v -s tests/models/decoder_only/language -m cpu_model
|
||||
pytest -v -s tests/models/embedding/language -m cpu_model
|
||||
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
|
||||
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
|
||||
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
|
||||
|
||||
# online inference
|
||||
docker exec cpu-test bash -c "
|
||||
export VLLM_CPU_KVCACHE_SPACE=10
|
||||
export VLLM_CPU_OMP_THREADS_BIND=48-92
|
||||
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
|
||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||
python3 benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model facebook/opt-125m \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions \
|
||||
--tokenizer facebook/opt-125m"
|
||||
# Run compressed-tensor test
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
pytest -s -v \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
|
||||
|
||||
# Run AWQ test
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
pytest -s -v \
|
||||
tests/quantization/test_ipex_quant.py"
|
||||
|
||||
# online inference
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
export VLLM_CPU_KVCACHE_SPACE=10
|
||||
export VLLM_CPU_OMP_THREADS_BIND=$1
|
||||
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
|
||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||
python3 benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model facebook/opt-125m \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions \
|
||||
--tokenizer facebook/opt-125m"
|
||||
}
|
||||
|
||||
# All of CPU tests are expected to be finished less than 25 mins.
|
||||
export -f cpu_tests
|
||||
timeout 25m bash -c "cpu_tests $CORE_RANGE"
|
||||
|
||||
.buildkite/run-hpu-test.sh (new file, 16 lines)
@ -0,0 +1,16 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the CPU docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
|
||||
# Try building the docker image
|
||||
docker build -t hpu-test-env -f Dockerfile.hpu .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() { docker rm -f hpu-test || true; }
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image and launch offline inference
|
||||
docker run --runtime=habana --name=hpu-test --network=host -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
|
||||
@ -14,7 +14,7 @@ DOCKER_IMAGE=$4
|
||||
|
||||
shift 4
|
||||
COMMANDS=("$@")
|
||||
if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
|
||||
if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then
|
||||
echo "The number of commands must be equal to the number of nodes."
|
||||
echo "Number of nodes: $NUM_NODES"
|
||||
echo "Number of commands: ${#COMMANDS[@]}"
|
||||
@ -23,7 +23,7 @@ fi
|
||||
|
||||
echo "List of commands"
|
||||
for command in "${COMMANDS[@]}"; do
|
||||
echo $command
|
||||
echo "$command"
|
||||
done
|
||||
|
||||
start_network() {
|
||||
@ -36,7 +36,7 @@ start_nodes() {
|
||||
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
|
||||
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
|
||||
GPU_DEVICES+=$(($DEVICE_NUM))
|
||||
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
|
||||
if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
|
||||
GPU_DEVICES+=','
|
||||
fi
|
||||
done
|
||||
@ -49,17 +49,20 @@ start_nodes() {
|
||||
# 3. map the huggingface cache directory to the container
|
||||
# 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
|
||||
# starting from 192.168.10.11)
|
||||
docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"
|
||||
docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
|
||||
--network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
|
||||
/bin/bash -c "tail -f /dev/null"
|
||||
|
||||
# organize containers into a ray cluster
|
||||
if [ $node -eq 0 ]; then
|
||||
if [ "$node" -eq 0 ]; then
|
||||
# start the ray head node
|
||||
docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
|
||||
docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block"
|
||||
# wait for the head node to be ready
|
||||
sleep 10
|
||||
else
|
||||
# start the ray worker nodes, and connect them to the head node
|
||||
docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
|
||||
docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
|
||||
fi
|
||||
done
|
||||
|
||||
@ -79,22 +82,22 @@ run_nodes() {
|
||||
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
|
||||
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
|
||||
GPU_DEVICES+=$(($DEVICE_NUM))
|
||||
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
|
||||
if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
|
||||
GPU_DEVICES+=','
|
||||
fi
|
||||
done
|
||||
GPU_DEVICES+='"'
|
||||
echo "Running node$node with GPU devices: $GPU_DEVICES"
|
||||
if [ $node -ne 0 ]; then
|
||||
docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
|
||||
if [ "$node" -ne 0 ]; then
|
||||
docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
|
||||
else
|
||||
docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
|
||||
docker exec "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
|
||||
fi
|
||||
done
|
||||
}
|
||||
cleanup() {
|
||||
for node in $(seq 0 $(($NUM_NODES-1))); do
|
||||
docker stop node$node
|
||||
docker stop "node$node"
|
||||
done
|
||||
docker network rm docker-net
|
||||
}
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the Neuron docker image and run the API server inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -e
|
||||
@ -12,10 +14,10 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
|
||||
current_time=$(date +%s)
|
||||
if [ $((current_time - last_build)) -gt 86400 ]; then
|
||||
docker system prune -f
|
||||
echo $current_time > /tmp/neuron-docker-build-timestamp
|
||||
echo "$current_time" > /tmp/neuron-docker-build-timestamp
|
||||
fi
|
||||
else
|
||||
echo $(date +%s) > /tmp/neuron-docker-build-timestamp
|
||||
date "+%s" > /tmp/neuron-docker-build-timestamp
|
||||
fi
|
||||
|
||||
docker build -t neuron -f Dockerfile.neuron .
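For context, the hunk above implements a once-per-day throttle: a timestamp file records the last prune, and docker system prune only runs again when more than 86400 seconds (one day) have passed. A self-contained sketch of the pattern, with the stamp path chosen here only for illustration:

#!/bin/bash
# Once-per-day cleanup throttle sketch (the stamp path is an assumed example).
stamp=/tmp/last-prune-timestamp
now=$(date +%s)
if [ -f "$stamp" ]; then
  last=$(cat "$stamp")
  if [ $((now - last)) -gt 86400 ]; then
    docker system prune -f        # expensive cleanup, at most once per day
    echo "$now" > "$stamp"
  fi
else
  echo "$now" > "$stamp"          # first run: just record the time
fi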
|
||||
@ -34,7 +36,7 @@ wait_for_server_to_start() {
|
||||
timeout=300
|
||||
counter=0
|
||||
|
||||
while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
|
||||
while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
|
||||
sleep 1
|
||||
counter=$((counter + 1))
|
||||
if [ $counter -ge $timeout ]; then
|
||||
|
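The truncated hunk above, like the timeout 600 bash -c 'until curl ...' lines in the CPU tests, is the same readiness pattern: poll an HTTP endpoint until it returns 200, and fail if a deadline passes first. A compact sketch with the endpoint and limit as assumed example values:

#!/bin/bash
# Readiness-poll sketch; localhost:8000/health and the 300s limit are assumptions.
timeout=300
counter=0
until [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" = "200" ]; do
  sleep 1
  counter=$((counter + 1))
  if [ "$counter" -ge "$timeout" ]; then
    echo "Server did not become healthy within ${timeout}s" >&2
    exit 1
  fi
done
echo "Server is up"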
||||
@ -1,3 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the OpenVINO docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
@ -11,4 +13,4 @@ trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image and launch offline inference
|
||||
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
|
||||
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
# Build the docker image.
|
||||
@ -12,4 +14,4 @@ remove_docker_container
|
||||
# For HF_TOKEN.
|
||||
source /etc/environment
|
||||
# Run a simple end-to-end example.
|
||||
docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
|
||||
docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the CPU docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
@ -11,4 +13,4 @@ trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image and launch offline inference
|
||||
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
|
||||
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py
|
||||
|
||||
@ -9,6 +9,8 @@
|
||||
# label(str): the name of the test. emoji allowed.
|
||||
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
|
||||
# fast_check_only(bool): run this test on fastcheck pipeline only
|
||||
# nightly(bool): run this test in nightly pipeline only
|
||||
# optional(bool): never run this test by default (i.e. need to unblock manually)
|
||||
# command(str): the single command to run for tests. incompatible with commands.
|
||||
# commands(list): the list of commands to run for test. incompatible with command.
|
||||
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
|
||||
@ -39,17 +41,19 @@ steps:
|
||||
# Check API reference (if it fails, you may have missing mock imports)
|
||||
- grep \"sig sig-object py\" build/html/dev/sampling_params.html
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker Test # 15min
|
||||
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
||||
fast_check: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/mq_llm_engine
|
||||
- tests/async_engine
|
||||
- tests/test_inputs
|
||||
- tests/multimodal
|
||||
- tests/test_utils
|
||||
- tests/worker
|
||||
commands:
|
||||
- pytest -v -s async_engine # Async Engine
|
||||
- pytest -v -s mq_llm_engine # MQLLMEngine
|
||||
- pytest -v -s async_engine # AsyncLLMEngine
|
||||
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s multimodal
|
||||
@ -61,14 +65,22 @@ steps:
|
||||
fast_check: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/basic_correctness
|
||||
- tests/basic_correctness/test_basic_correctness
|
||||
- tests/basic_correctness/test_cpu_offload
|
||||
- tests/basic_correctness/test_preemption
|
||||
commands:
|
||||
- pytest -v -s basic_correctness/test_basic_correctness.py
|
||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||
|
||||
- label: Chunked Prefill Test
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/basic_correctness/test_chunked_prefill
|
||||
commands:
|
||||
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||
|
||||
|
||||
- label: Core Test # 10min
|
||||
mirror_hardwares: [amd]
|
||||
fast_check: true
|
||||
@ -79,18 +91,21 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s core
|
||||
|
||||
- label: Entrypoints Test # 20min
|
||||
- label: Entrypoints Test # 40min
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
fast_check: true
|
||||
#mirror_hardwares: [amd]
|
||||
mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
commands:
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
|
||||
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/openai
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
|
||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
@ -103,7 +118,10 @@ steps:
|
||||
- vllm/core/
|
||||
- tests/distributed
|
||||
- tests/spec_decode/e2e/test_integration_dist_tp4
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s distributed/test_utils.py
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
- pytest -v -s distributed/test_pynccl.py
|
||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
||||
|
||||
@ -131,7 +149,9 @@ steps:
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_regression
|
||||
command: pytest -v -s test_regression.py
|
||||
commands:
|
||||
- pip install modelscope
|
||||
- pytest -v -s test_regression.py
|
||||
working_dir: "/vllm-workspace/tests" # optional
|
||||
|
||||
- label: Engine Test # 10min
|
||||
@ -145,7 +165,15 @@ steps:
|
||||
# OOM in the CI unless we run this separately
|
||||
- pytest -v -s tokenization
|
||||
|
||||
- label: Examples Test # 12min
|
||||
- label: V1 Test
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
- pytest -v -s v1
|
||||
|
||||
- label: Examples Test # 15min
|
||||
working_dir: "/vllm-workspace/examples"
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
@ -162,15 +190,9 @@ steps:
|
||||
- python3 offline_inference_vision_language_multi_image.py
|
||||
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference_encoder_decoder.py
|
||||
- python3 offline_profile.py --model facebook/opt-125m
|
||||
|
||||
- label: torch compile integration test
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
commands:
|
||||
- pytest -v -s ./compile/test_full_graph.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
|
||||
- label: Prefix Caching Test # 7min
|
||||
- label: Prefix Caching Test # 9min
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@ -178,7 +200,7 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s prefix_caching
|
||||
|
||||
- label: Samplers Test # 18min
|
||||
- label: Samplers Test # 36min
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers
|
||||
- vllm/sampling_metadata.py
|
||||
@ -194,17 +216,15 @@ steps:
|
||||
- tests/test_logits_processor
|
||||
command: pytest -v -s test_logits_processor.py
|
||||
|
||||
- label: Speculative decoding tests # 22min
|
||||
- label: Speculative decoding tests # 30min
|
||||
source_file_dependencies:
|
||||
- vllm/spec_decode
|
||||
- tests/spec_decode
|
||||
commands:
|
||||
# See https://github.com/vllm-project/vllm/issues/5152
|
||||
- export VLLM_ATTENTION_BACKEND=XFORMERS
|
||||
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
|
||||
- pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
|
||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
|
||||
|
||||
- label: LoRA Test %N # 30min each
|
||||
- label: LoRA Test %N # 15min each
|
||||
mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
@ -212,7 +232,25 @@ steps:
|
||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
|
||||
parallelism: 4
|
||||
|
||||
- label: Kernels Test %N # 30min each
|
||||
- label: "PyTorch Fullgraph Smoke Test" # 9min
|
||||
fast_check: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
# these tests need to be separated, cannot combine
|
||||
- pytest -v -s compile/piecewise/test_simple.py
|
||||
- pytest -v -s compile/piecewise/test_toy_llama.py
|
||||
|
||||
- label: "PyTorch Fullgraph Test" # 18min
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_full_graph.py
|
||||
|
||||
- label: Kernels Test %N # 1h each
|
||||
mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
@ -239,15 +277,14 @@ steps:
|
||||
source_file_dependencies:
|
||||
- benchmarks/
|
||||
commands:
|
||||
- pip install aiohttp
|
||||
- bash run-benchmarks.sh
|
||||
|
||||
- label: Quantization Test # 15min
|
||||
- label: Quantization Test # 33min
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/quantization
|
||||
command: pytest -v -s quantization
|
||||
command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
@ -255,10 +292,16 @@ steps:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pip install lm-eval
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- bash ./run-tests.sh -c configs/models-small.txt -t 1
|
||||
|
||||
- label: Encoder Decoder tests # 5min
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/encoder_decoder
|
||||
commands:
|
||||
- pytest -v -s encoder_decoder
|
||||
|
||||
- label: OpenAI-Compatible Tool Use # 20 min
|
||||
fast_check: false
|
||||
mirror_hardwares: [ amd ]
|
||||
@ -270,42 +313,79 @@ steps:
|
||||
|
||||
##### models test #####
|
||||
|
||||
- label: Basic Models Test # 3min
|
||||
- label: Basic Models Test # 30min
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models
|
||||
commands:
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s models/*.py --ignore=models/test_oot_registration.py
|
||||
- pytest -v -s models/test_registry.py
|
||||
- pytest -v -s models/test_initialization.py
|
||||
|
||||
- label: Decoder-only Language Models Test # 1h3min
|
||||
- label: Language Models Test (Standard) # 42min
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/decoder_only/language
|
||||
- tests/models/embedding/language
|
||||
- tests/models/encoder_decoder/language
|
||||
commands:
|
||||
- pytest -v -s models/decoder_only/language
|
||||
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
|
||||
- pytest -v -s models/embedding/language -m core_model
|
||||
- pytest -v -s models/embedding/vision_language -m core_model
|
||||
|
||||
- label: Decoder-only Multi-Modal Models Test # 56min
|
||||
- label: Language Models Test (Extended) # 50min
|
||||
nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/decoder_only/language
|
||||
- tests/models/embedding/language
|
||||
- tests/models/encoder_decoder/language
|
||||
commands:
|
||||
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
|
||||
- pytest -v -s models/embedding/language -m 'not core_model'
|
||||
- pytest -v -s models/embedding/vision_language -m 'not core_model'
|
||||
|
||||
- label: Multi-Modal Models Test (Standard) # 26min
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/decoder_only/audio_language
|
||||
- tests/models/decoder_only/vision_language
|
||||
- tests/models/embedding/vision_language
|
||||
- tests/models/encoder_decoder/vision_language
|
||||
commands:
|
||||
- pytest -v -s models/decoder_only/audio_language
|
||||
- pytest -v -s models/decoder_only/vision_language
|
||||
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
|
||||
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
|
||||
- pytest -v -s models/encoder_decoder/language -m core_model
|
||||
- pytest -v -s models/encoder_decoder/vision_language -m core_model
|
||||
|
||||
- label: Other Models Test # 5min
|
||||
#mirror_hardwares: [amd]
|
||||
- label: Multi-Modal Models Test (Extended) # 1h15m
|
||||
nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/embedding/language
|
||||
- tests/models/encoder_decoder/language
|
||||
- tests/models/decoder_only/audio_language
|
||||
- tests/models/decoder_only/vision_language
|
||||
- tests/models/embedding/vision_language
|
||||
- tests/models/encoder_decoder/vision_language
|
||||
commands:
|
||||
- pytest -v -s models/embedding/language
|
||||
- pytest -v -s models/encoder_decoder/language
|
||||
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
|
||||
# HACK - run phi3v tests separately to sidestep this transformers bug
|
||||
# https://github.com/huggingface/transformers/issues/34307
|
||||
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
|
||||
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
|
||||
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
|
||||
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
|
||||
|
||||
# This test is used only in PR development phase to test individual models and should never run on main
|
||||
- label: Custom Models Test
|
||||
optional: true
|
||||
commands:
|
||||
- echo 'Testing custom models...'
|
||||
# PR authors can temporarily add commands below to test individual models
|
||||
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
|
||||
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
|
||||
|
||||
##### 1 GPU test #####
|
||||
##### multi gpus test #####
|
||||
@ -338,7 +418,7 @@ steps:
|
||||
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
|
||||
|
||||
- label: Distributed Tests (2 GPUs) # 28min
|
||||
- label: Distributed Tests (2 GPUs) # 40min
|
||||
#mirror_hardwares: [amd]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
@ -348,18 +428,22 @@ steps:
|
||||
- vllm/executor/
|
||||
- vllm/model_executor/models/
|
||||
- tests/distributed/
|
||||
- vllm/compilation
|
||||
commands:
|
||||
- pytest -v -s ./compile/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
|
||||
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
|
||||
# Avoid importing model tests that cause CUDA reinitialization error
|
||||
- pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
|
||||
- pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
|
||||
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
|
||||
- pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus
|
||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pytest -v -s distributed/test_distributed_oot.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
|
||||
|
||||
- label: Multi-step Tests (4 GPUs) # 21min
|
||||
- label: Multi-step Tests (4 GPUs) # 36min
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
@ -377,7 +461,7 @@ steps:
|
||||
- pytest -v -s multi_step/test_correctness_async_llm.py
|
||||
- pytest -v -s multi_step/test_correctness_llm.py
|
||||
|
||||
- label: Pipeline Parallelism Test # 23min
|
||||
- label: Pipeline Parallelism Test # 45min
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
@ -403,7 +487,7 @@ steps:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s -x lora/test_long_context.py
|
||||
|
||||
- label: Weight Loading Multiple GPU Test
|
||||
- label: Weight Loading Multiple GPU Test # 33min
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
@ -436,7 +520,8 @@ steps:
|
||||
# NOTE: don't test llama model here, it seems hf implementation is buggy
|
||||
# see https://github.com/vllm-project/vllm/pull/5689 for details
|
||||
- pytest -v -s distributed/test_custom_all_reduce.py
|
||||
- TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
|
||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
|
||||
- pytest -v -s -x lora/test_mixtral.py
|
||||
|
||||
- label: LM Eval Large Models # optional
|
||||
@ -447,6 +532,5 @@ steps:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pip install lm-eval
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- bash ./run-tests.sh -c configs/models-large.txt -t 4
|
||||
|
||||
.buildkite/upload-wheels.sh (new file, 38 lines)
@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -ex
|
||||
|
||||
# Assume wheels are in artifacts/dist/*.whl
|
||||
wheel_files=(artifacts/dist/*.whl)
|
||||
|
||||
# Check that exactly one wheel is found
|
||||
if [[ ${#wheel_files[@]} -ne 1 ]]; then
|
||||
echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Get the single wheel file
|
||||
wheel="${wheel_files[0]}"
|
||||
|
||||
# Rename 'linux' to 'manylinux1' in the wheel filename
|
||||
new_wheel="${wheel/linux/manylinux1}"
|
||||
mv -- "$wheel" "$new_wheel"
|
||||
wheel="$new_wheel"
|
||||
|
||||
# Extract the version from the wheel
|
||||
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
||||
echo "Version: $version"
|
||||
|
||||
# If the version contains "dev", rename it to v1.0.0.dev for consistency
|
||||
if [[ $version == *dev* ]]; then
|
||||
new_version="1.0.0.dev"
|
||||
new_wheel="${wheel/$version/$new_version}"
|
||||
mv -- "$wheel" "$new_wheel"
|
||||
wheel="$new_wheel"
|
||||
version="$new_version"
|
||||
fi
|
||||
|
||||
# Upload the wheel to S3
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
|
||||
@ -1,4 +1,33 @@
|
||||
vllm/*.so
|
||||
/.venv
|
||||
/build
|
||||
dist
|
||||
vllm/*.so
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
.mypy_cache
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
/build/
|
||||
cmake-build-*/
|
||||
CMakeUserPresets.json
|
||||
develop-eggs/
|
||||
/dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
.github/CODEOWNERS (vendored, new file, 30 lines)
@ -0,0 +1,30 @@
|
||||
# See https://help.github.com/articles/about-codeowners/
|
||||
# for more info about CODEOWNERS file
|
||||
|
||||
# This lists cover the "core" components of vLLM that require careful review
|
||||
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
|
||||
/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
|
||||
/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
|
||||
/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
|
||||
/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
|
||||
/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
|
||||
/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
|
||||
CMakeLists.txt @tlrmchlsmth @WoosukKwon
|
||||
|
||||
# Test ownership
|
||||
/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
|
||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||
/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
|
||||
/tests/models @DarkLight1337 @ywang96
|
||||
/tests/multimodal @DarkLight1337 @ywang96
|
||||
/tests/prefix_caching @comaniac @KuntaiDu
|
||||
/tests/spec_decode @njhill @LiuXiaoxuanPKU
|
||||
/tests/kernels @tlrmchlsmth @WoosukKwon
|
||||
/tests/quantization @mgoin @robertgshaw2-neuralmagic
|
||||
/.buildkite/lm-eval-harness @mgoin @simon-mo
|
||||
/tests/distributed/test_multi_node_assignment.py @youkaichao
|
||||
/tests/distributed/test_pipeline_parallel.py @youkaichao
|
||||
/tests/distributed/test_same_node.py @youkaichao
|
||||
/tests/multi_step @alexm-neuralmagic @comaniac
|
||||
/tests/weight_loading @mgoin @youkaichao
|
||||
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
|
||||
.github/dependabot.yml (vendored, new file, 32 lines)
@ -0,0 +1,32 @@
|
||||
version: 2
|
||||
updates:
|
||||
# Maintain dependencies for GitHub Actions
|
||||
- package-ecosystem: "github-actions"
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
- package-ecosystem: "pip"
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
labels: ["dependencies"]
|
||||
open-pull-requests-limit: 5
|
||||
reviewers: ["khluu", "simon-mo"]
|
||||
allow:
|
||||
- dependency-type: "all"
|
||||
ignore:
|
||||
- dependency-name: "torch"
|
||||
- dependency-name: "torchvision"
|
||||
- dependency-name: "xformers"
|
||||
- dependency-name: "lm-format-enforcer"
|
||||
- dependency-name: "gguf"
|
||||
- dependency-name: "compressed-tensors"
|
||||
- dependency-name: "ray[adag]"
|
||||
- dependency-name: "lm-eval"
|
||||
groups:
|
||||
patch-update:
|
||||
applies-to: version-updates
|
||||
update-types: ["patch"]
|
||||
minor-update:
|
||||
applies-to: version-updates
|
||||
update-types: ["minor"]
|
||||
.github/mergify.yml (vendored, new file, 60 lines)
@ -0,0 +1,60 @@
|
||||
pull_request_rules:
|
||||
- name: label-documentation
|
||||
description: Automatically apply documentation label
|
||||
conditions:
|
||||
- or:
|
||||
- files~=^[^/]+\.md$
|
||||
- files~=^docs/
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- documentation
|
||||
|
||||
- name: label-ci-build
|
||||
description: Automatically apply ci/build label
|
||||
conditions:
|
||||
- or:
|
||||
- files~=^\.github/
|
||||
- files~=\.buildkite/
|
||||
- files~=^cmake/
|
||||
- files=CMakeLists.txt
|
||||
- files~=^Dockerfile
|
||||
- files~=^requirements.*\.txt
|
||||
- files=setup.py
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- ci/build
|
||||
|
||||
- name: label-frontend
|
||||
description: Automatically apply frontend label
|
||||
conditions:
|
||||
- files~=^vllm/entrypoints/
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- frontend
|
||||
|
||||
- name: ping author on conflicts and add 'needs-rebase' label
|
||||
conditions:
|
||||
- conflict
|
||||
- -closed
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- needs-rebase
|
||||
comment:
|
||||
message: |
|
||||
This pull request has merge conflicts that must be resolved before it can be
|
||||
merged. Please rebase the PR, @{{author}}.
|
||||
|
||||
https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
|
||||
|
||||
- name: remove 'needs-rebase' label when conflict is resolved
|
||||
conditions:
|
||||
- -conflict
|
||||
- -closed
|
||||
actions:
|
||||
label:
|
||||
remove:
|
||||
- needs-rebase
|
||||
.github/scripts/cleanup_pr_body.sh (vendored, new executable file, 33 lines)
@ -0,0 +1,33 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -eu
|
||||
|
||||
# ensure 1 argument is passed
|
||||
if [ "$#" -ne 1 ]; then
|
||||
echo "Usage: $0 <pr_number>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
PR_NUMBER=$1
|
||||
OLD=/tmp/orig_pr_body.txt
|
||||
NEW=/tmp/new_pr_body.txt
|
||||
|
||||
gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
|
||||
cp "${OLD}" "${NEW}"
|
||||
|
||||
# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
|
||||
sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE\*\*/,$d' "${NEW}"
|
||||
|
||||
# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
|
||||
sed -i '/FIX #xxxx.*$/d' "${NEW}"
|
||||
|
||||
# Remove "FILL IN THE PR DESCRIPTION HERE"
|
||||
sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
|
||||
|
||||
# Run this only if ${NEW} is different than ${OLD}
|
||||
if ! cmp -s "${OLD}" "${NEW}"; then
|
||||
echo "Updating PR body"
|
||||
gh pr edit --body-file "${NEW}" "${PR_NUMBER}"
|
||||
else
|
||||
echo "No changes needed"
|
||||
fi
|
||||
.github/workflows/actionlint.yml (vendored, new file, 40 lines)
@ -0,0 +1,40 @@
|
||||
name: Lint GitHub Actions workflows
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- "main"
|
||||
paths:
|
||||
- '.github/workflows/*.ya?ml'
|
||||
- '.github/workflows/actionlint.*'
|
||||
- '.github/workflows/matchers/actionlint.json'
|
||||
pull_request:
|
||||
branches:
|
||||
- "main"
|
||||
paths:
|
||||
- '.github/workflows/*.ya?ml'
|
||||
- '.github/workflows/actionlint.*'
|
||||
- '.github/workflows/matchers/actionlint.json'
|
||||
|
||||
env:
|
||||
LC_ALL: en_US.UTF-8
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
actionlint:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: "Checkout"
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: "Run actionlint"
|
||||
run: |
|
||||
echo "::add-matcher::.github/workflows/matchers/actionlint.json"
|
||||
tools/actionlint.sh -color
|
||||
.github/workflows/add_label_automerge.yml (vendored, 2 lines changed)
@ -8,7 +8,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Add label
|
||||
uses: actions/github-script@v5
|
||||
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
|
||||
with:
|
||||
script: |
|
||||
github.rest.issues.addLabels({
|
||||
|
||||
.github/workflows/clang-format.yml (vendored, 18 lines changed)
@ -6,9 +6,21 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- '**/*.h'
|
||||
- '**/*.cpp'
|
||||
- '**/*.cu'
|
||||
- '**/*.cuh'
|
||||
- '.github/workflows/clang-format.yml'
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- '**/*.h'
|
||||
- '**/*.cpp'
|
||||
- '**/*.cu'
|
||||
- '**/*.cuh'
|
||||
- '.github/workflows/clang-format.yml'
|
||||
|
||||
jobs:
|
||||
clang-format:
|
||||
@ -17,9 +29,9 @@ jobs:
|
||||
matrix:
|
||||
python-version: ["3.11"]
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v2
|
||||
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install dependencies
|
||||
@ -38,4 +50,4 @@ jobs:
|
||||
)
|
||||
find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
|
||||
| grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
|
||||
| xargs clang-format --dry-run --Werror
|
||||
| xargs clang-format --dry-run --Werror
|
||||
|
||||
.github/workflows/cleanup_pr_body.yml (vendored, new file, 26 lines)
@ -0,0 +1,26 @@
|
||||
name: Cleanup PR Body
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened, reopened, edited]
|
||||
|
||||
permissions:
|
||||
pull-requests: write
|
||||
|
||||
jobs:
|
||||
update-description:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
|
||||
with:
|
||||
python-version: '3.12'
|
||||
|
||||
- name: Update PR description
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
|
||||
.github/workflows/codespell.yml (vendored, new file, 45 lines)
@ -0,0 +1,45 @@
|
||||
name: codespell
|
||||
|
||||
on:
|
||||
# Trigger the workflow on push or pull request,
|
||||
# but only for the main branch
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "**/*.py"
|
||||
- "**/*.md"
|
||||
- "**/*.rst"
|
||||
- pyproject.toml
|
||||
- requirements-lint.txt
|
||||
- .github/workflows/codespell.yml
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "**/*.py"
|
||||
- "**/*.md"
|
||||
- "**/*.rst"
|
||||
- pyproject.toml
|
||||
- requirements-lint.txt
|
||||
- .github/workflows/codespell.yml
|
||||
|
||||
jobs:
|
||||
codespell:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.12"]
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install -r requirements-lint.txt
|
||||
- name: Spelling check with codespell
|
||||
run: |
|
||||
codespell --toml pyproject.toml
|
||||
.github/workflows/matchers/actionlint.json (vendored, new file, 17 lines)
@ -0,0 +1,17 @@
|
||||
{
|
||||
"problemMatcher": [
|
||||
{
|
||||
"owner": "actionlint",
|
||||
"pattern": [
|
||||
{
|
||||
"regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$",
|
||||
"file": 1,
|
||||
"line": 2,
|
||||
"column": 3,
|
||||
"message": 4,
|
||||
"code": 5
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
.github/workflows/matchers/mypy.json (vendored, new file, 16 lines)
@ -0,0 +1,16 @@
|
||||
{
|
||||
"problemMatcher": [
|
||||
{
|
||||
"owner": "mypy",
|
||||
"pattern": [
|
||||
{
|
||||
"regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$",
|
||||
"file": 1,
|
||||
"line": 2,
|
||||
"severity": 3,
|
||||
"message": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
.github/workflows/matchers/ruff.json (vendored, new file, 17 lines)
@ -0,0 +1,17 @@
|
||||
{
|
||||
"problemMatcher": [
|
||||
{
|
||||
"owner": "ruff",
|
||||
"pattern": [
|
||||
{
|
||||
"regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$",
|
||||
"file": 1,
|
||||
"line": 2,
|
||||
"column": 3,
|
||||
"code": 4,
|
||||
"message": 5
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
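The regexp in this matcher splits a ruff-style diagnostic into file, line, column, code, and message capture groups. A quick way to sanity-check it (the sample line is invented, and grep -P needs a PCRE-enabled grep):

# Hedged check of the matcher pattern against a made-up diagnostic line.
echo "vllm/config.py:42:8: F401: 'os' imported but unused" \
  | grep -P '^(.+?):(\d+):(\d+): (\w+): (.+)$'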
|
||||
.github/workflows/mypy.yaml (vendored, 37 lines changed)
@ -6,20 +6,35 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- '**/*.py'
|
||||
- '.github/workflows/mypy.yaml'
|
||||
- 'tools/mypy.sh'
|
||||
- 'pyproject.toml'
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
# This workflow is only relevant when one of the following files changes.
|
||||
# However, we have github configured to expect and require this workflow
|
||||
# to run and pass before github will auto-merge a pull request. Until github
|
||||
# allows more flexible auto-merge policy, we can just run this on every PR.
|
||||
# It doesn't take that long to run, anyway.
|
||||
#paths:
|
||||
# - '**/*.py'
|
||||
# - '.github/workflows/mypy.yaml'
|
||||
# - 'tools/mypy.sh'
|
||||
# - 'pyproject.toml'
|
||||
|
||||
jobs:
|
||||
ruff:
|
||||
mypy:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
|
||||
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v2
|
||||
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install dependencies
|
||||
@ -32,15 +47,5 @@ jobs:
|
||||
pip install types-setuptools
|
||||
- name: Mypy
|
||||
run: |
|
||||
mypy
|
||||
mypy tests --follow-imports skip
|
||||
mypy vllm/attention --follow-imports skip
|
||||
mypy vllm/distributed --follow-imports skip
|
||||
mypy vllm/engine --follow-imports skip
|
||||
mypy vllm/executor --follow-imports skip
|
||||
mypy vllm/lora --follow-imports skip
|
||||
mypy vllm/model_executor --follow-imports skip
|
||||
mypy vllm/prompt_adapter --follow-imports skip
|
||||
mypy vllm/spec_decode --follow-imports skip
|
||||
mypy vllm/worker --follow-imports skip
|
||||
|
||||
echo "::add-matcher::.github/workflows/matchers/mypy.json"
|
||||
tools/mypy.sh 1 ${{ matrix.python-version }}
|
||||
|
||||
22  .github/workflows/publish.yml  vendored
@@ -21,16 +21,16 @@ jobs:
      upload_url: ${{ steps.create_release.outputs.upload_url }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Extract branch info
        shell: bash
        run: |
          echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
          echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"

      - name: Create Release
        id: create_release
        uses: "actions/github-script@v6"
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        env:
          RELEASE_TAG: ${{ env.release_tag }}
        with:
@@ -48,16 +48,16 @@ jobs:
      fail-fast: false
      matrix:
        os: ['ubuntu-20.04']
        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
        python-version: ['3.9', '3.10', '3.11', '3.12']
        pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements-cuda.txt.
        cuda-version: ['11.8', '12.1']

    steps:
      - name: Checkout
        uses: actions/checkout@v3
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Setup ccache
        uses: hendrikmuhs/ccache-action@v1.2
        uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
        with:
          create-symlink: true
          key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
@@ -68,7 +68,7 @@ jobs:
          bash -x .github/workflows/scripts/env.sh

      - name: Set up Python
        uses: actions/setup-python@v4
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        with:
          python-version: ${{ matrix.python-version }}

@@ -86,13 +86,13 @@ jobs:
          CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
        run: |
          bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
          wheel_name=$(ls dist/*whl | xargs -n 1 basename)
          wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
          asset_name=${wheel_name//"linux"/"manylinux1"}
          echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
          echo "asset_name=${asset_name}" >> $GITHUB_ENV
          echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
          echo "asset_name=${asset_name}" >> "$GITHUB_ENV"

      - name: Upload Release Asset
        uses: actions/upload-release-asset@v1
        uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
2  .github/workflows/reminder_comment.yml  vendored
@@ -8,7 +8,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Remind to run full CI on PR
        uses: actions/github-script@v6
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        with:
          script: |
            github.rest.issues.createComment({
53  .github/workflows/ruff.yml  vendored
@@ -6,32 +6,47 @@ on:
  push:
    branches:
      - main
    paths:
      - "**/*.py"
      - pyproject.toml
      - requirements-lint.txt
      - .github/workflows/matchers/ruff.json
      - .github/workflows/ruff.yml
  pull_request:
    branches:
      - main
    # This workflow is only relevant when one of the following files changes.
    # However, we have github configured to expect and require this workflow
    # to run and pass before github with auto-merge a pull request. Until github
    # allows more flexible auto-merge policy, we can just run this on every PR.
    # It doesn't take that long to run, anyway.
    #paths:
    # - "**/*.py"
    # - pyproject.toml
    # - requirements-lint.txt
    # - .github/workflows/matchers/ruff.json
    # - .github/workflows/ruff.yml

jobs:
  ruff:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
        python-version: ["3.12"]
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
    - name: Analysing the code with ruff
      run: |
        ruff .
    - name: Spelling check with codespell
      run: |
        codespell --toml pyproject.toml
    - name: Run isort
      run: |
        isort . --check-only
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements-lint.txt
    - name: Analysing the code with ruff
      run: |
        echo "::add-matcher::.github/workflows/matchers/ruff.json"
        ruff check --output-format github .
    - name: Run isort
      run: |
        isort . --check-only
8  .github/workflows/scripts/build.sh  vendored
@@ -1,4 +1,5 @@
#!/bin/bash
set -eux

python_executable=python$1
cuda_home=/usr/local/cuda-$2
@@ -8,12 +9,15 @@ PATH=${cuda_home}/bin:$PATH
LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH

# Install requirements
$python_executable -m pip install wheel packaging
$python_executable -m pip install -r requirements-cuda.txt
$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt

# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1
# Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"

bash tools/check_repo.sh

# Build
$python_executable setup.py bdist_wheel --dist-dir=dist
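As the script shows, `$1` selects the `python$1` interpreter and `$2` the CUDA toolkit under `/usr/local/cuda-$2`. A hedged local invocation, assuming CUDA 12.1 is installed at the usual path:

```bash
# Build a release wheel with Python 3.12 against /usr/local/cuda-12.1
bash .github/workflows/scripts/build.sh 3.12 12.1
```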
8  .github/workflows/scripts/cuda-install.sh  vendored
@@ -1,16 +1,16 @@
#!/bin/bash

# Replace '.' with '-' ex: 11.8 -> 11-8
cuda_version=$(echo $1 | tr "." "-")
cuda_version=$(echo "$1" | tr "." "-")
# Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
OS=$(echo $2 | tr -d ".\-")
OS=$(echo "$2" | tr -d ".\-")

# Installs CUDA
wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb
wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb"
sudo dpkg -i cuda-keyring_1.1-1_all.deb
rm cuda-keyring_1.1-1_all.deb
sudo apt -qq update
sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version}
sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}"
sudo apt clean

# Test nvcc
2  .github/workflows/scripts/pytorch-install.sh  vendored
@@ -6,7 +6,7 @@ cuda_version=$3

# Install torch
$python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
$python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./}
$python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}"

# Print version information
$python_executable --version
37  .github/workflows/shellcheck.yml  vendored  Normal file
@@ -0,0 +1,37 @@
name: Lint shell scripts
on:
  push:
    branches:
      - "main"
    paths:
      - '**/*.sh'
      - '.github/workflows/shellcheck.yml'
  pull_request:
    branches:
      - "main"
    paths:
      - '**/*.sh'
      - '.github/workflows/shellcheck.yml'

env:
  LC_ALL: en_US.UTF-8

defaults:
  run:
    shell: bash

permissions:
  contents: read

jobs:
  shellcheck:
    runs-on: ubuntu-latest
    steps:
      - name: "Checkout"
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0

      - name: "Check shell scripts"
        run: |
          tools/shellcheck.sh
52  .github/workflows/stale.yml  vendored  Normal file
@@ -0,0 +1,52 @@
name: 'Close inactive issues and PRs'

on:
  schedule:
    # Daily at 1:30 AM UTC
    - cron: '30 1 * * *'

jobs:
  close-issues-and-pull-requests:
    permissions:
      issues: write
      pull-requests: write
      actions: write
    runs-on: ubuntu-latest
    steps:
      - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months
          operations-per-run: 1000

          exempt-draft-pr: true
          exempt-issue-labels: 'keep-open'
          exempt-pr-labels: 'keep-open'

          labels-to-add-when-unstale: 'unstale'
          labels-to-remove-when-stale: 'unstale'

          days-before-issue-stale: 90
          days-before-issue-close: 30
          stale-issue-label: 'stale'
          stale-issue-message: >
            This issue has been automatically marked as stale because it has not
            had any activity within 90 days. It will be automatically closed if no
            further activity occurs within 30 days. Leave a comment if
            you feel this issue should remain open. Thank you!
          close-issue-message: >
            This issue has been automatically closed due to inactivity. Please
            feel free to reopen if you feel it is still relevant. Thank you!

          days-before-pr-stale: 90
          days-before-pr-close: 30
          stale-pr-label: 'stale'
          stale-pr-message: >
            This pull request has been automatically marked as stale because it
            has not had any activity within 90 days. It will be automatically
            closed if no further activity occurs within 30 days. Leave a comment
            if you feel this pull request should remain open. Thank you!
          close-pr-message: >
            This pull request has been automatically closed due to inactivity.
            Please feel free to reopen if you intend to continue working on it.
            Thank you!
35  .github/workflows/yapf.yml  vendored
@@ -6,26 +6,33 @@ on:
  push:
    branches:
      - main
    paths:
      - "**/*.py"
      - .github/workflows/yapf.yml
  pull_request:
    branches:
      - main
    paths:
      - "**/*.py"
      - .github/workflows/yapf.yml

jobs:
  yapf:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
        python-version: ["3.12"]
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install yapf==0.32.0
        pip install toml==0.10.2
    - name: Running yapf
      run: |
        yapf --diff --recursive .
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install yapf==0.32.0
        pip install toml==0.10.2
    - name: Running yapf
      run: |
        yapf --diff --recursive .
14  .gitignore  vendored
@@ -1,5 +1,8 @@
# vllm commit id, generated by setup.py
vllm/commit_id.py
# version file generated by setuptools-scm
/vllm/_version.py

# vllm-flash-attn built from source
vllm/vllm_flash_attn/

# Byte-compiled / optimized / DLL files
__pycache__/
@@ -12,6 +15,8 @@ __pycache__/
# Distribution / packaging
.Python
build/
cmake-build-*/
CMakeUserPresets.json
develop-eggs/
dist/
downloads/
@@ -28,6 +33,7 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
/.deps/

# PyInstaller
# Usually these files are written by a python script from a template
@@ -193,3 +199,7 @@ hip_compat.h

# Benchmark dataset
benchmarks/*.json

# Linting
actionlint
shellcheck*/
@@ -6,17 +6,16 @@ version: 2
build:
  os: ubuntu-22.04
  tools:
    python: "3.8"
    python: "3.12"

sphinx:
  configuration: docs/source/conf.py
  fail_on_warning: true
  configuration: docs/source/conf.py
  fail_on_warning: true

# If using Sphinx, optionally build your docs in additional formats such as PDF
formats:
  - pdf
formats: []

# Optionally declare the Python requirements required to build your docs
python:
  install:
    - requirements: docs/requirements-docs.txt
  install:
    - requirements: docs/requirements-docs.txt
9  .shellcheckrc  Normal file
@@ -0,0 +1,9 @@
# rules currently disabled:
#
# SC1091 (info): Not following: <sourced file> was not specified as input (see shellcheck -x)
# SC2004 (style): $/${} is unnecessary on arithmetic variables.
# SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects.
# SC2155 (warning): Declare and assign separately to avoid masking return values.
# SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails.
#
disable=SC1091,SC2004,SC2129,SC2155,SC2164
427  CMakeLists.txt
@@ -1,5 +1,16 @@
cmake_minimum_required(VERSION 3.26)

# When building directly using CMake, make sure you run the install step
# (it places the .so files in the correct location).
#
# Example:
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_INSTALL_PREFIX=.. ..
# cmake --build . --target install
#
# If you want to only build one target, make sure to install it manually:
# cmake --build . --target _C
# cmake --install . --component _C
project(vllm_extensions LANGUAGES CXX)

# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
@@ -13,17 +24,20 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
# Suppress potential warnings about unused manually-specified variables
set(ignoreMe "${VLLM_PYTHON_PATH}")

# Prevent installation of dependencies (cutlass) by default.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)

#
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11" "3.12")
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")

# Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")

# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")

#
# Supported/expected torch versions for CUDA/ROCm.
@@ -35,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")
set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")

#
# Try to find python package with an executable that exactly matches
@@ -69,39 +83,6 @@ endif()
#
find_package(Torch REQUIRED)

#
# Add the `default` target which detects which extensions should be
# built based on platform/architecture. This is the same logic that
# setup.py uses to select which extensions should be built and should
# be kept in sync.
#
# The `default` target makes direct use of cmake easier since knowledge
# of which extensions are supported has been factored in, e.g.
#
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
# cmake --build . --target default
#
add_custom_target(default)
message(STATUS "Enabling core extension.")

# Define _core_C extension
# built for (almost) every target platform, (excludes TPU and Neuron)

set(VLLM_EXT_SRC
  "csrc/core/torch_bindings.cpp")

define_gpu_extension_target(
  _core_C
  DESTINATION vllm
  LANGUAGE CXX
  SOURCES ${VLLM_EXT_SRC}
  COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
  USE_SABI 3
  WITH_SOABI)

add_dependencies(default _core_C)

#
# Forward the non-CUDA device extensions to external CMake scripts.
#
@@ -144,14 +125,32 @@ else()
  message(FATAL_ERROR "Can't find CUDA or HIP installation.")
endif()

#
# Override the GPU architectures detected by cmake/torch and filter them by
# the supported versions for the current language.
# The final set of arches is stored in `VLLM_GPU_ARCHES`.
#
override_gpu_arches(VLLM_GPU_ARCHES
  ${VLLM_GPU_LANG}
  "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")

if(VLLM_GPU_LANG STREQUAL "CUDA")
  #
  # For cuda we want to be able to control which architectures we compile for on
  # a per-file basis in order to cut down on compile time. So here we extract
  # the set of architectures we want to compile for and remove the from the
  # CMAKE_CUDA_FLAGS so that they are not applied globally.
  #
  clear_cuda_arches(CUDA_ARCH_FLAGS)
  extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
  message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
  # Filter the target architectures by the supported supported archs
  # since for some files we will build for all CUDA_ARCHS.
  cuda_archs_loose_intersection(CUDA_ARCHS
    "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
  message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
else()
  #
  # For other GPU targets override the GPU architectures detected by cmake/torch
  # and filter them by the supported versions for the current language.
  # The final set of arches is stored in `VLLM_GPU_ARCHES`.
  #
  override_gpu_arches(VLLM_GPU_ARCHES
    ${VLLM_GPU_LANG}
    "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
endif()

#
# Query torch for additional GPU compilation flags for the given
@@ -167,6 +166,17 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif()


#
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
# Each dependency that produces build artifacts should override its BINARY_DIR to avoid
# conflicts between build types. It should instead be set to ${CMAKE_BINARY_DIR}/<dependency>.
#
include(FetchContent)
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")

#
# Define other extension targets
#
@@ -177,21 +187,25 @@ endif()

set(VLLM_EXT_SRC
  "csrc/cache_kernels.cu"
  "csrc/attention/attention_kernels.cu"
  "csrc/attention/paged_attention_v1.cu"
  "csrc/attention/paged_attention_v2.cu"
  "csrc/pos_encoding_kernels.cu"
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
  "csrc/layernorm_quant_kernels.cu"
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
  "csrc/quantization/fp8/common.cu"
  "csrc/cuda_utils_kernels.cu"
  "csrc/moe_align_block_size_kernels.cu"
  "csrc/prepare_inputs/advance_step.cu"
  "csrc/torch_bindings.cpp")

if(VLLM_GPU_LANG STREQUAL "CUDA")
  include(FetchContent)
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

  # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
  set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")

  FetchContent_Declare(
    cutlass
    GIT_REPOSITORY https://github.com/nvidia/cutlass.git
@@ -210,29 +224,89 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    "csrc/mamba/causal_conv1d/causal_conv1d.cu"
    "csrc/quantization/aqlm/gemm_kernels.cu"
    "csrc/quantization/awq/gemm_kernels.cu"
    "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
    "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
    "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
    "csrc/quantization/gptq_marlin/gptq_marlin.cu"
    "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
    "csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
    "csrc/quantization/gguf/gguf_kernel.cu"
    "csrc/quantization/fp8/fp8_marlin.cu"
    "csrc/custom_all_reduce.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
    "csrc/permute_cols.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")

  set_gencode_flags_for_srcs(
    SRCS "${VLLM_EXT_SRC}"
    CUDA_ARCHS "${CUDA_ARCHS}")

  # Only build Marlin kernels if we are building for at least some compatible archs.
  # Keep building Marlin for 9.0 as there are some group sizes and shapes that
  # are not supported by Machete yet.
  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
  if (MARLIN_ARCHS)
    set(MARLIN_SRCS
      "csrc/quantization/fp8/fp8_marlin.cu"
      "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
      "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
      "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
      "csrc/quantization/gptq_marlin/gptq_marlin.cu"
      "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
      "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
    set_gencode_flags_for_srcs(
      SRCS "${MARLIN_SRCS}"
      CUDA_ARCHS "${MARLIN_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
    message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
  else()
    message(STATUS "Not building Marlin kernels as no compatible archs found"
                   " in CUDA target architectures")
  endif()

  #
  # The CUTLASS kernels for Hopper require sm90a to be enabled.
  # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
  # That adds an extra 17MB to compiled binary, so instead we selectively enable it.
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
    set_source_files_properties(
      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
      PROPERTIES
      COMPILE_FLAGS
      "-gencode arch=compute_90a,code=sm_90a")
  # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
  # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
    message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
      message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                     "later if you intend on running FP8 quantized models on "
                     "Hopper.")
    else()
      message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
                     "in CUDA target architectures")
    endif()

    # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
    # build any 3x kernels
    set(SCALED_MM_3X_ARCHS)
  endif()

  #
  # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
  # kernels for the remaining archs that are not already built for 3x.
  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
    "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
  # subtract out the archs that are already built for 3x
  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
  if (SCALED_MM_2X_ARCHS)
    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1")
    message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}")
  else()
    if (SCALED_MM_3X_ARCHS)
      message(STATUS "Not building scaled_mm_c2x as all archs are already built"
                     " for and covered by scaled_mm_c3x")
    else()
      message(STATUS "Not building scaled_mm_c2x as no compatible archs found "
                     "in CUDA target architectures")
    endif()
  endif()


@@ -240,49 +314,75 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # Machete kernels

  # The machete kernels only work on hopper and require CUDA 12.0 or later.
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
  # Only build Machete kernels if we are building for something compatible with sm90a
  cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
    #
    # For the Machete kernels we automatically generate sources for various
    # For the Machete kernels we automatically generate sources for various
    # preselected input type pairs and schedules.
    # Generate sources:
    execute_process(
      COMMAND ${CMAKE_COMMAND} -E env
      PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
      ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py
      RESULT_VARIABLE machete_generation_result
      OUTPUT_VARIABLE machete_generation_output
      OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
      ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
    )
    set(MACHETE_GEN_SCRIPT
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
    file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)

    if (NOT machete_generation_result EQUAL 0)
      message(FATAL_ERROR "Machete generation failed."
              " Result: \"${machete_generation_result}\""
              "\nCheck the log for details: "
              "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
    message(STATUS "Machete generation script hash: ${MACHETE_GEN_SCRIPT_HASH}")
    message(STATUS "Last run machete generate script hash: $CACHE{MACHETE_GEN_SCRIPT_HASH}")

    if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
        OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
        ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
        RESULT_VARIABLE machete_generation_result
        OUTPUT_VARIABLE machete_generation_output
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
      )

      if (NOT machete_generation_result EQUAL 0)
        message(FATAL_ERROR "Machete generation failed."
                " Result: \"${machete_generation_result}\""
                "\nCheck the log for details: "
                "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
      else()
        set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
            CACHE STRING "Last run machete generate script hash" FORCE)
        message(STATUS "Machete generation completed successfully.")
      endif()
    else()
      message(STATUS "Machete generation completed successfully.")
      message(STATUS "Machete generation script has not changed, skipping generation.")
    endif()

    # Add machete generated sources
    file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
    list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})
    message(STATUS "Machete generated sources: ${MACHETE_GEN_SOURCES}")

    set_source_files_properties(
      ${MACHETE_GEN_SOURCES}
      PROPERTIES
      COMPILE_FLAGS
      "-gencode arch=compute_90a,code=sm_90a")
    # forward compatible
    set_gencode_flags_for_srcs(
      SRCS "${MACHETE_GEN_SOURCES}"
      CUDA_ARCHS "${MACHETE_ARCHS}")

    list(APPEND VLLM_EXT_SRC
      csrc/quantization/machete/machete_pytorch.cu)

    message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
        AND MACHETE_ARCHS)
      message(STATUS "Not building Machete kernels as CUDA Compiler version is "
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                     "later if you intend on running w4a16 quantized models on "
                     "Hopper.")
    else()
      message(STATUS "Not building Machete kernels as no compatible archs "
                     "found in CUDA target architectures")
    endif()
  endif()

  # Add pytorch binding for machete (add on even CUDA < 12.0 so that we can
  # raise an error if the user that this was built with an incompatible
  # CUDA version)
  list(APPEND VLLM_EXT_SRC
    csrc/quantization/machete/machete_pytorch.cu)
# if CUDA endif
endif()

message(STATUS "Enabling C extension.")
define_gpu_extension_target(
  _C
  DESTINATION vllm
@@ -294,8 +394,8 @@ define_gpu_extension_target(
  USE_SABI 3
  WITH_SOABI)

# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
# driver API. This causes problems when linking with earlier versions of CUDA.
# Setting this variable sidesteps the issue by calling the driver directly.
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
@@ -306,13 +406,39 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)

set(VLLM_MOE_EXT_SRC
  "csrc/moe/torch_bindings.cpp"
  "csrc/moe/moe_align_sum_kernels.cu"
  "csrc/moe/topk_softmax_kernels.cu")

set_gencode_flags_for_srcs(
  SRCS "${VLLM_MOE_EXT_SRC}"
  CUDA_ARCHS "${CUDA_ARCHS}")

if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_MOE_EXT_SRC
    "csrc/moe/marlin_moe_ops.cu")
  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
  if (MARLIN_MOE_ARCHS)
    set(MARLIN_MOE_SRC
      "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
      "csrc/moe/marlin_moe_ops.cu")

    set_gencode_flags_for_srcs(
      SRCS "${MARLIN_MOE_SRC}"
      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")

    list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
  else()
    message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
                   " in CUDA target architectures")
  endif()
endif()

message(STATUS "Enabling moe extension.")
define_gpu_extension_target(
  _moe_C
  DESTINATION vllm
@@ -323,13 +449,98 @@ define_gpu_extension_target(
  USE_SABI 3
  WITH_SOABI)

if(VLLM_GPU_LANG STREQUAL "HIP")
  #
  # _rocm_C extension
  #
  set(VLLM_ROCM_EXT_SRC
    "csrc/rocm/torch_bindings.cpp"
    "csrc/rocm/attention.cu")


if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
  message(STATUS "Enabling C extension.")
  add_dependencies(default _C)

  message(STATUS "Enabling moe extension.")
  add_dependencies(default _moe_C)

  define_gpu_extension_target(
    _rocm_C
    DESTINATION vllm
    LANGUAGE ${VLLM_GPU_LANG}
    SOURCES ${VLLM_ROCM_EXT_SRC}
    COMPILE_FLAGS ${VLLM_GPU_FLAGS}
    ARCHITECTURES ${VLLM_GPU_ARCHES}
    USE_SABI 3
    WITH_SOABI)
endif()

# vllm-flash-attn currently only supported on CUDA
if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
  return()
endif ()

# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
# arches in the CUDA case (and instead set the gencodes on a per file basis)
# we need to manually set VLLM_GPU_ARCHES here.
if(VLLM_GPU_LANG STREQUAL "CUDA")
  foreach(_ARCH ${CUDA_ARCHS})
    string(REPLACE "." "" _ARCH "${_ARCH}")
    list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
  endforeach()
endif()

#
# Build vLLM flash attention from source
#
# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs.
# They should be identical but if they aren't, this is a massive footgun.
#
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
# To only install vllm-flash-attn, use --component vllm_flash_attn_c.
# If no component is specified, vllm-flash-attn is still installed.

# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
# This is to enable local development of vllm-flash-attn within vLLM.
# It can be set as an environment variable or passed as a cmake argument.
# The environment variable takes precedence.
if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
  set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
endif()

if(VLLM_FLASH_ATTN_SRC_DIR)
  FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR})
else()
  FetchContent_Declare(
    vllm-flash-attn
    GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
    GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9
    GIT_PROGRESS TRUE
    # Don't share the vllm-flash-attn build between build types
    BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
  )
endif()

# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
set(VLLM_PARENT_BUILD ON)

# Ensure the vllm/vllm_flash_attn directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c)

# Make sure vllm-flash-attn install rules are nested under vllm/
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c)

# Fetch the vllm-flash-attn library
FetchContent_MakeAvailable(vllm-flash-attn)
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")

# Restore the install prefix
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)

# Copy over the vllm-flash-attn python files
install(
  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
  DESTINATION vllm/vllm_flash_attn
  COMPONENT vllm_flash_attn_c
  FILES_MATCHING PATTERN "*.py"
)

# Nothing after vllm-flash-attn, see comment about macros above
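The `VLLM_FLASH_ATTN_SRC_DIR` hook in the CMake changes above is the intended route for local vllm-flash-attn development. A rough sketch of how it might be used (the checkout path is illustrative, not from the source):

```bash
# Point the vLLM build at a local flash-attention checkout instead of the
# pinned GIT_TAG; the environment variable takes precedence over -D arguments.
export VLLM_FLASH_ATTN_SRC_DIR="$HOME/src/flash-attention"  # hypothetical path
pip install -e .
```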
@@ -1,56 +1,3 @@
# Contributing to vLLM

Thank you for your interest in contributing to vLLM!
Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large.
There are several ways you can contribute to the project:

- Identify and report any issues or bugs.
- Request or add a new model.
- Suggest or implement new features.

However, remember that contributions aren't just about code.
We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions.

Finally, one of the most impactful ways to support us is by raising awareness about vLLM.
Talk about it in your blog posts, highlighting how it's driving your incredible projects.
Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository.


## Setup for development

### Build from source

```bash
pip install -e .  # This may take several minutes.
```

### Testing

```bash
pip install -r requirements-dev.txt

# linting and formatting
bash format.sh
# Static type checking
mypy
# Unit tests
pytest tests/
```
**Note:** Currently, the repository does not pass the mypy tests.


## Contributing Guidelines

### Issue Reporting

If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it.
If not, please file a new issue, providing as much relevant information as possible.

### Pull Requests & Code Reviews

Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for detailed guide for contribution.

### Thank You

Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
Your contributions make vLLM a great tool for everyone!
You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).
34  DCO  Normal file
@@ -0,0 +1,34 @@
Developer Certificate of Origin
Version 1.1

Copyright (C) 2004, 2006 The Linux Foundation and its contributors.

Everyone is permitted to copy and distribute verbatim copies of this
license document, but changing it is not allowed.


Developer's Certificate of Origin 1.1

By making a contribution to this project, I certify that:

(a) The contribution was created in whole or in part by me and I
    have the right to submit it under the open source license
    indicated in the file; or

(b) The contribution is based upon previous work that, to the best
    of my knowledge, is covered under an appropriate open source
    license and I have the right under that license to submit that
    work with modifications, whether created in whole or in part
    by me, under the same open source license (unless I am
    permitted to submit under a different license), as indicated
    in the file; or

(c) The contribution was provided directly to me by some other
    person who certified (a), (b) or (c) and I have not modified
    it.

(d) I understand and agree that this project and the contribution
    are public and that a record of the contribution (including all
    personal information I submit with it, including my sign-off) is
    maintained indefinitely and may be redistributed consistent with
    this project or the open source license(s) involved.
48  Dockerfile
@@ -27,6 +27,14 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
RUN <<EOF
gcc --version
EOF

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
@@ -48,6 +56,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
# see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Override the arch list for flash-attn to reduce the binary size
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
#################### BASE BUILD IMAGE ####################

#################### WHEEL BUILD IMAGE ####################
@@ -59,15 +70,10 @@ COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -r requirements-build.txt

# files and directories related to build wheels
COPY csrc csrc
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml
COPY vllm vllm
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

# max jobs used by Ninja to build extensions
ARG max_jobs=2
@@ -76,14 +82,13 @@ ENV MAX_JOBS=${max_jobs}
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads

ARG buildkite_commit
ENV BUILDKITE_COMMIT=${buildkite_commit}

ARG USE_SCCACHE
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=.git,target=.git \
    if [ "$USE_SCCACHE" = "1" ]; then \
        echo "Installing sccache..." \
        && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
@@ -92,6 +97,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
        && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
        && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
        && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
        && export SCCACHE_IDLE_TIMEOUT=0 \
        && export CMAKE_BUILD_TYPE=Release \
        && sccache --show-stats \
@@ -102,6 +108,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=.git,target=.git \
    if [ "$USE_SCCACHE" != "1" ]; then \
        python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
    fi
@@ -131,7 +138,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
#################### DEV IMAGE ####################
#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace
@@ -169,6 +176,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
RUN --mount=type=cache,target=/root/.cache/pip \
    . /etc/environment && \
    python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
COPY examples examples
#################### vLLM installation IMAGE ####################


@@ -180,13 +188,17 @@ FROM vllm-base AS test
ADD . /vllm-workspace/

# install development dependencies (for testing)
# A newer setuptools is required for installing some test dependencies from source that do not publish python 3.12 wheels
# This installation must complete before the test dependencies are collected and installed.
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install "setuptools>=74.1.1"
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -r requirements-dev.txt

# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1

# Copy in the v1 package for testing (it isn't distributed yet)
COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1

# doc requires source code
# we hide them inside `test_docs/` , so that this source code
# will not be imported by other tests
@@ -202,7 +214,7 @@ FROM vllm-base AS vllm-openai

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install accelerate hf_transfer 'modelscope!=1.15.0'
    pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' timm==0.9.10

ENV VLLM_USAGE_SOURCE production-docker-image
@@ -22,27 +22,17 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li

RUN echo 'ulimit -c 0' >> ~/.bashrc

RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl
RUN pip install intel_extension_for_pytorch==2.5.0

ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu
WORKDIR /workspace

ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
    pip install --upgrade pip && \
    pip install -r requirements-build.txt

# install oneDNN
RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git

RUN --mount=type=cache,target=/root/.cache/ccache \
    cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
    -DONEDNN_BUILD_DOC=OFF \
    -DONEDNN_BUILD_EXAMPLES=OFF \
    -DONEDNN_BUILD_TESTS=OFF \
    -DONEDNN_BUILD_GRAPH=OFF \
    -DONEDNN_ENABLE_WORKLOAD=INFERENCE \
    -DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
    cmake --build ./oneDNN/build --target install --config Release

FROM cpu-test-1 AS build

WORKDIR /workspace/vllm
@@ -52,7 +42,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
    pip install -v -r requirements-cpu.txt

COPY ./ ./
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
@@ -60,8 +53,10 @@ ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}

RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=bind,source=.git,target=.git \
    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
    pip install dist/*.whl
    pip install dist/*.whl && \
    rm -rf dist

WORKDIR /workspace/
18  Dockerfile.hpu  Normal file
@@ -0,0 +1,18 @@
FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

RUN pip install -v -r requirements-hpu.txt

ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install

WORKDIR /workspace/

RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
@@ -1,38 +1,41 @@
# default base image
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.19.1-ubuntu20.04"
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04"

FROM $BASE_IMAGE

RUN echo "Base image is $BASE_IMAGE"

# Install some basic utilities
RUN apt-get update \
    && apt-get install python3 python3-pip -y \
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1
RUN apt-get update && \
    apt-get install -y \
        git \
        python3 \
        python3-pip \
        ffmpeg libsm6 libxext6 libgl1

### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}
WORKDIR ${APP_MOUNT}/vllm

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U

COPY ./vllm /app/vllm/vllm
COPY ./setup.py /app/vllm/setup.py
COPY ./requirements-common.txt /app/vllm/requirements-common.txt
COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

RUN cd /app/vllm \
    && python3 -m pip install -U -r requirements-neuron.txt
RUN python3 -m pip install -U \
    'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
    -r requirements-neuron.txt

ENV VLLM_TARGET_DEVICE neuron
RUN cd /app/vllm \
    && pip install -e . \
    && cd ..
RUN --mount=type=bind,source=.git,target=.git \
    pip install --no-build-isolation -v -e .

CMD ["/bin/bash"]
@@ -4,27 +4,22 @@
FROM ubuntu:22.04 AS dev

RUN apt-get update -y && \
    apt-get install -y python3-pip git && \
    apt-get install -y ffmpeg libsm6 libxext6 libgl1
    apt-get install -y \
        git python3-pip \
        ffmpeg libsm6 libxext6 libgl1
WORKDIR /workspace

# copy requirements
COPY requirements-build.txt /workspace/vllm/
COPY requirements-common.txt /workspace/vllm/
COPY requirements-openvino.txt /workspace/vllm/

COPY vllm/ /workspace/vllm/vllm
COPY csrc/core /workspace/vllm/csrc/core
COPY cmake/utils.cmake /workspace/vllm/cmake/
COPY CMakeLists.txt /workspace/vllm/
COPY setup.py /workspace/vllm/
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

# install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt
# build vLLM with OpenVINO backend
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace

COPY examples/ /workspace/vllm/examples
COPY benchmarks/ /workspace/vllm/benchmarks
COPY examples/ /workspace/examples
COPY benchmarks/ /workspace/benchmarks

CMD ["/bin/bash"]
@ -14,14 +14,23 @@ RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p
|
||||
COPY ./ /workspace/vllm
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
|
||||
|
||||
# These packages will be in rocketce eventually
|
||||
RUN pip install -v cmake xformers torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
|
||||
'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
|
||||
torch==2.3.1 \
|
||||
-r requirements-cpu.txt \
|
||||
xformers uvloop==0.20.0
|
||||
|
||||
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
VLLM_TARGET_DEVICE=cpu python3 setup.py install
|
||||
|
||||
WORKDIR /workspace/
|
||||
|
||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
||||
|
||||
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
# Default ROCm 6.1 base image
|
||||
ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
|
||||
# Default ROCm 6.2 base image
|
||||
ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0"
|
||||
|
||||
# Default ROCm ARCHes to build vLLM for.
|
||||
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
|
||||
@ -7,18 +7,12 @@ ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
|
||||
# Whether to install CK-based flash-attention
|
||||
# If 0, will not install flash-attention
|
||||
ARG BUILD_FA="1"
|
||||
# If `TRY_FA_WHEEL=1`, we will try installing flash-attention from `FA_WHEEL_URL`
|
||||
# If this succeeds, we use the downloaded wheel and skip building flash-attention.
|
||||
# Otherwise, ROCm flash-attention from `FA_BRANCH` will be built for the
|
||||
# architectures specified in `FA_GFX_ARCHS`
|
||||
ARG TRY_FA_WHEEL="1"
|
||||
ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl"
|
||||
ARG FA_GFX_ARCHS="gfx90a;gfx942"
|
||||
ARG FA_BRANCH="23a2b1c2"
|
||||
ARG FA_BRANCH="3cea2fb"
|
||||
|
||||
# Whether to build triton on rocm
|
||||
ARG BUILD_TRITON="1"
|
||||
ARG TRITON_BRANCH="e0fc12c"
|
||||
ARG TRITON_BRANCH="e192dba"
|
||||
|
||||
### Base image build stage
|
||||
FROM $BASE_IMAGE AS base
|
||||
@ -50,14 +44,17 @@ RUN python3 -m pip install --upgrade pip
|
||||
# Remove sccache so it doesn't interfere with ccache
|
||||
# TODO: implement sccache support across components
|
||||
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
|
||||
# Install torch == 2.5.0 on ROCm
|
||||
RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
|
||||
*"rocm-6.1"*) \
|
||||
|
||||
# Install torch == 2.6.0 on ROCm
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
|
||||
*"rocm-6.2"*) \
|
||||
python3 -m pip uninstall -y torch torchvision \
|
||||
&& python3 -m pip install --no-cache-dir --pre \
|
||||
torch==2.5.0.dev20240726 \
|
||||
torchvision==0.20.0.dev20240726 \
|
||||
--index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
|
||||
&& python3 -m pip install --pre \
|
||||
torch==2.6.0.dev20240918 \
|
||||
'setuptools-scm>=8' \
|
||||
torchvision==0.20.0.dev20240918 \
|
||||
--extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
|
||||
*) ;; esac
|
||||
|
||||
ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
|
||||
@ -79,25 +76,18 @@ RUN cd /opt/rocm/share/amd_smi \
|
||||
### Flash-Attention wheel build stage
|
||||
FROM base AS build_fa
|
||||
ARG BUILD_FA
|
||||
ARG TRY_FA_WHEEL
|
||||
ARG FA_WHEEL_URL
|
||||
ARG FA_GFX_ARCHS
|
||||
ARG FA_BRANCH
|
||||
# Build ROCm flash-attention wheel if `BUILD_FA = 1`
|
||||
RUN --mount=type=cache,target=${CCACHE_DIR} \
|
||||
if [ "$BUILD_FA" = "1" ]; then \
|
||||
if [ "${TRY_FA_WHEEL}" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \
|
||||
# If a suitable wheel exists, we download it instead of building FA
|
||||
mkdir -p /install && wget -N "${FA_WHEEL_URL}" -P /install; \
|
||||
else \
|
||||
mkdir -p libs \
|
||||
&& cd libs \
|
||||
&& git clone https://github.com/ROCm/flash-attention.git \
|
||||
&& cd flash-attention \
|
||||
&& git checkout "${FA_BRANCH}" \
|
||||
&& git submodule update --init \
|
||||
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
|
||||
fi; \
|
||||
mkdir -p libs \
|
||||
&& cd libs \
|
||||
&& git clone https://github.com/ROCm/flash-attention.git \
|
||||
&& cd flash-attention \
|
||||
&& git checkout "${FA_BRANCH}" \
|
||||
&& git submodule update --init \
|
||||
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
|
||||
# Create an empty directory otherwise as later build stages expect one
|
||||
else mkdir -p /install; \
|
||||
fi
|
||||
@ -112,6 +102,7 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \
|
||||
if [ "$BUILD_TRITON" = "1" ]; then \
|
||||
mkdir -p libs \
|
||||
&& cd libs \
|
||||
&& python3 -m pip install ninja cmake wheel pybind11 \
|
||||
&& git clone https://github.com/OpenAI/triton.git \
|
||||
&& cd triton \
|
||||
&& git checkout "${TRITON_BRANCH}" \
|
||||
@ -126,10 +117,15 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \
|
||||
FROM base AS final
|
||||
# Import the vLLM development directory from the build context
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
||||
|
||||
RUN python3 -m pip install --upgrade pip
|
||||
|
||||
# Package upgrades for useful functionality or to avoid dependency issues
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
python3 -m pip install --upgrade numba scipy huggingface-hub[cli]
|
||||
python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard
|
||||
|
||||
|
||||
# Workaround for ray >= 2.10.0
|
||||
@ -138,15 +134,9 @@ ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
||||
ENV TOKENIZERS_PARALLELISM=false
|
||||
|
||||
RUN --mount=type=cache,target=${CCACHE_DIR} \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
--mount=type=cache,target=/root/.cache/pip \
|
||||
python3 -m pip install -Ur requirements-rocm.txt \
|
||||
&& case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
|
||||
*"rocm-6.1"*) \
|
||||
# Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
|
||||
wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib \
|
||||
# Prevent interference if torch bundles its own HIP runtime
|
||||
&& rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
|
||||
*) ;; esac \
|
||||
&& python3 setup.py clean --all \
|
||||
&& python3 setup.py develop
|
||||
|
||||
|
||||
@ -1,20 +1,25 @@
|
||||
ARG NIGHTLY_DATE="20240828"
|
||||
ARG NIGHTLY_DATE="20241017"
|
||||
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
|
||||
|
||||
FROM $BASE_IMAGE
|
||||
WORKDIR /workspace
|
||||
WORKDIR /workspace/vllm
|
||||
|
||||
# Install some basic utilities
|
||||
RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libgl1
|
||||
|
||||
# Install the TPU and Pallas dependencies.
|
||||
RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
|
||||
RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
||||
RUN apt-get update && apt-get install -y \
|
||||
git \
|
||||
ffmpeg libsm6 libxext6 libgl1
|
||||
|
||||
# Build vLLM.
|
||||
COPY . /workspace/vllm
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
|
||||
|
||||
ENV VLLM_TARGET_DEVICE="tpu"
|
||||
RUN cd /workspace/vllm && python3 -m pip install -r requirements-tpu.txt
|
||||
RUN cd /workspace/vllm && python3 setup.py develop
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
python3 -m pip install \
|
||||
-r requirements-tpu.txt
|
||||
RUN python3 setup.py develop
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
@ -1,21 +1,68 @@
|
||||
FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04
|
||||
FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base
|
||||
|
||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
|
||||
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
|
||||
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
|
||||
rm /etc/apt/sources.list.d/intel-graphics.list && \
|
||||
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
|
||||
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
|
||||
chmod 644 /usr/share/keyrings/intel-graphics.gpg
|
||||
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
|
||||
COPY ./ /workspace/vllm
|
||||
RUN apt-get update -y && \
|
||||
apt-get install -y --no-install-recommends --fix-missing \
|
||||
curl \
|
||||
ffmpeg \
|
||||
git \
|
||||
libsndfile1 \
|
||||
libsm6 \
|
||||
libxext6 \
|
||||
libgl1 \
|
||||
lsb-release \
|
||||
numactl \
|
||||
python3 \
|
||||
python3-dev \
|
||||
python3-pip \
|
||||
# vim \
|
||||
wget
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt
|
||||
COPY requirements-common.txt /workspace/vllm/requirements-common.txt
|
||||
|
||||
RUN pip install -v -r requirements-xpu.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip install --no-cache-dir \
|
||||
-r requirements-xpu.txt
|
||||
|
||||
RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install
|
||||
RUN git clone https://github.com/intel/pti-gpu && \
|
||||
cd pti-gpu/sdk && \
|
||||
git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \
|
||||
mkdir build && \
|
||||
cd build && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \
|
||||
make -j && \
|
||||
cmake --install . --config Release --prefix "/usr/local"
|
||||
|
||||
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
|
||||
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
|
||||
|
||||
ENV VLLM_TARGET_DEVICE=xpu
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
python3 setup.py install
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
FROM vllm-base AS vllm-openai
|
||||
|
||||
# install additional dependencies for openai api server
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip install accelerate hf_transfer 'modelscope!=1.15.0'
|
||||
|
||||
ENV VLLM_USAGE_SOURCE production-docker-image \
|
||||
TRITON_XPU_PROFILE 1
|
||||
|
||||
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||
|
||||
README.md (20 lines changed)
@ -10,22 +10,15 @@ Easy, fast, and cheap LLM serving for everyone
|
||||
</h3>
|
||||
|
||||
<p align="center">
|
||||
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> |
|
||||
|
||||
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
|
||||
</p>
|
||||
|
||||
|
||||
---
|
||||
|
||||
**vLLM, AMD, Anyscale Meet & Greet at [Ray Summit 2024](http://raysummit.anyscale.com) (Monday, Sept 30th, 5-7pm PT) at Marriott Marquis San Francisco**
|
||||
|
||||
We are excited to announce our special vLLM event in collaboration with AMD and Anyscale.
|
||||
Join us to learn more about recent advancements of vLLM on MI300X.
|
||||
Register [here](https://lu.ma/db5ld9n5) and be a part of the event!
|
||||
|
||||
---
|
||||
|
||||
*Latest News* 🔥
|
||||
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing).
|
||||
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
|
||||
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users!
|
||||
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
|
||||
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
|
||||
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
|
||||
@ -51,7 +44,7 @@ vLLM is fast with:
|
||||
- Speculative decoding
|
||||
- Chunked prefill
|
||||
|
||||
**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
|
||||
**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.
|
||||
|
||||
vLLM is flexible and easy to use with:
|
||||
|
||||
@ -136,5 +129,6 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
|
||||
|
||||
* For technical questions and feature requests, please use Github issues or discussions.
|
||||
* For discussing with fellow users, please use Discord.
|
||||
* For coordinating contributions and development, please use Slack.
|
||||
* For security disclosures, please use Github's security advisory feature.
|
||||
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
|
||||
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
|
||||
|
||||
SECURITY.md (new file, 11 lines)
@ -0,0 +1,11 @@
# Security Policy

## Reporting a Vulnerability

If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.

Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).

---

Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
@ -6,3 +6,14 @@ You can download the dataset by running:
```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```

## Downloading the ShareGPT4V dataset

The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts will ignore a datapoint if the referred image is missing.

```bash
wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json
mkdir coco -p
wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
unzip coco/train2017.zip -d coco/
```
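For reference, a minimal sketch (not part of the repo) of the "skip missing images" behaviour described above; it assumes each JSON entry stores a relative image path under an `image` key:

```python
# Hypothetical helper: drop ShareGPT4V entries whose image file is not present
# locally, mirroring how the benchmark scripts ignore such datapoints.
import json
from pathlib import Path

def filter_available(json_path: str, image_root: str = ".") -> list:
    """Keep only entries whose referenced image exists on disk."""
    with open(json_path, encoding="utf-8") as f:
        data = json.load(f)
    root = Path(image_root)
    return [d for d in data if "image" in d and (root / d["image"]).is_file()]

print(len(filter_available("sharegpt4v_instruct_gpt4-vision_cap100k.json")))
```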
@ -23,8 +23,9 @@ class RequestFuncInput:
|
||||
output_len: int
|
||||
model: str
|
||||
best_of: int = 1
|
||||
use_beam_search: bool = False
|
||||
logprobs: Optional[int] = None
|
||||
multi_modal_content: Optional[dict] = None
|
||||
ignore_eos: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -47,13 +48,13 @@ async def async_request_tgi(
|
||||
assert api_url.endswith("generate_stream")
|
||||
|
||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
||||
assert not request_func_input.use_beam_search
|
||||
params = {
|
||||
"best_of": request_func_input.best_of,
|
||||
"max_new_tokens": request_func_input.output_len,
|
||||
"do_sample": True,
|
||||
"temperature": 0.01, # TGI does not accept 0.0 temperature.
|
||||
"top_p": 0.99, # TGI does not accept 1.0 top_p.
|
||||
# TGI does not accept ignore_eos flag.
|
||||
}
|
||||
payload = {
|
||||
"inputs": request_func_input.prompt,
|
||||
@ -78,7 +79,7 @@ async def async_request_tgi(
|
||||
# any data, we should skip it.
|
||||
if chunk_bytes.startswith(":"):
|
||||
continue
|
||||
chunk = remove_prefix(chunk_bytes, "data:")
|
||||
chunk = chunk_bytes.removeprefix("data:")
|
||||
|
||||
data = json.loads(chunk)
|
||||
timestamp = time.perf_counter()
|
||||
@ -118,7 +119,6 @@ async def async_request_trt_llm(
|
||||
assert api_url.endswith("generate_stream")
|
||||
|
||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
||||
assert not request_func_input.use_beam_search
|
||||
assert request_func_input.best_of == 1
|
||||
payload = {
|
||||
"accumulate_tokens": True,
|
||||
@ -128,6 +128,8 @@ async def async_request_trt_llm(
|
||||
"max_tokens": request_func_input.output_len,
|
||||
"stream": True,
|
||||
}
|
||||
if request_func_input.ignore_eos:
|
||||
payload["min_length"] = request_func_input.output_len
|
||||
output = RequestFuncOutput()
|
||||
output.prompt_len = request_func_input.prompt_len
|
||||
|
||||
@ -142,8 +144,8 @@ async def async_request_trt_llm(
|
||||
if not chunk_bytes:
|
||||
continue
|
||||
|
||||
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
|
||||
"data:")
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||
"data:")
|
||||
|
||||
data = json.loads(chunk)
|
||||
output.generated_text += data["text_output"]
|
||||
@ -182,7 +184,6 @@ async def async_request_deepspeed_mii(
|
||||
) -> RequestFuncOutput:
|
||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
||||
assert request_func_input.best_of == 1
|
||||
assert not request_func_input.use_beam_search
|
||||
|
||||
payload = {
|
||||
"prompt": request_func_input.prompt,
|
||||
@ -230,7 +231,6 @@ async def async_request_openai_completions(
|
||||
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
|
||||
|
||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
||||
assert not request_func_input.use_beam_search
|
||||
payload = {
|
||||
"model": request_func_input.model,
|
||||
"prompt": request_func_input.prompt,
|
||||
@ -239,6 +239,7 @@ async def async_request_openai_completions(
|
||||
"max_tokens": request_func_input.output_len,
|
||||
"logprobs": request_func_input.logprobs,
|
||||
"stream": True,
|
||||
"ignore_eos": request_func_input.ignore_eos,
|
||||
}
|
||||
headers = {
|
||||
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
|
||||
@ -255,13 +256,14 @@ async def async_request_openai_completions(
|
||||
async with session.post(url=api_url, json=payload,
|
||||
headers=headers) as response:
|
||||
if response.status == 200:
|
||||
first_chunk_received = False
|
||||
async for chunk_bytes in response.content:
|
||||
chunk_bytes = chunk_bytes.strip()
|
||||
if not chunk_bytes:
|
||||
continue
|
||||
|
||||
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
|
||||
"data: ")
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||
"data: ")
|
||||
if chunk == "[DONE]":
|
||||
latency = time.perf_counter() - st
|
||||
else:
|
||||
@ -273,7 +275,8 @@ async def async_request_openai_completions(
|
||||
if data["choices"][0]["text"]:
|
||||
timestamp = time.perf_counter()
|
||||
# First token
|
||||
if ttft == 0.0:
|
||||
if not first_chunk_received:
|
||||
first_chunk_received = True
|
||||
ttft = time.perf_counter() - st
|
||||
output.ttft = ttft
|
||||
|
||||
@ -284,9 +287,14 @@ async def async_request_openai_completions(
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
generated_text += data["choices"][0]["text"]
|
||||
|
||||
if first_chunk_received:
|
||||
output.success = True
|
||||
else:
|
||||
output.success = False
|
||||
output.error = (
|
||||
"Never received a valid chunk to calculate TTFT."
|
||||
"This response will be marked as failed!")
|
||||
output.generated_text = generated_text
|
||||
output.success = True
|
||||
output.latency = latency
|
||||
else:
|
||||
output.error = response.reason or ""
|
||||
@ -311,18 +319,21 @@ async def async_request_openai_chat_completions(
|
||||
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
|
||||
|
||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
||||
assert not request_func_input.use_beam_search
|
||||
content = [{"type": "text", "text": request_func_input.prompt}]
|
||||
if request_func_input.multi_modal_content:
|
||||
content.append(request_func_input.multi_modal_content)
|
||||
payload = {
|
||||
"model": request_func_input.model,
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": request_func_input.prompt,
|
||||
"content": content
|
||||
},
|
||||
],
|
||||
"temperature": 0.0,
|
||||
"max_tokens": request_func_input.output_len,
|
||||
"max_completion_tokens": request_func_input.output_len,
|
||||
"stream": True,
|
||||
"ignore_eos": request_func_input.ignore_eos,
|
||||
}
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
@ -345,8 +356,8 @@ async def async_request_openai_chat_completions(
|
||||
if not chunk_bytes:
|
||||
continue
|
||||
|
||||
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
|
||||
"data: ")
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||
"data: ")
|
||||
if chunk == "[DONE]":
|
||||
latency = time.perf_counter() - st
|
||||
else:
|
||||
@ -385,14 +396,6 @@ async def async_request_openai_chat_completions(
|
||||
return output
|
||||
|
||||
|
||||
# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
|
||||
# introduced in Python 3.9
|
||||
def remove_prefix(text: str, prefix: str) -> str:
|
||||
if text.startswith(prefix):
|
||||
return text[len(prefix):]
|
||||
return text
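The helper above is dropped, presumably because Python 3.8 support is no longer needed, so the built-in `str.removeprefix` (Python 3.9+) can be used directly. A quick illustration of the equivalence, for reference only:

```python
# str.removeprefix (Python 3.9+) behaves like the removed helper.
def remove_prefix(text: str, prefix: str) -> str:
    return text[len(prefix):] if text.startswith(prefix) else text

chunk = 'data: {"text_output": "hi"}'
assert chunk.removeprefix("data: ") == remove_prefix(chunk, "data: ")
assert "no-prefix".removeprefix("data: ") == "no-prefix"  # unchanged when the prefix is absent
```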
|
||||
|
||||
|
||||
def get_model(pretrained_model_name_or_path: str) -> str:
|
||||
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
|
||||
from modelscope import snapshot_download
|
||||
@ -426,4 +429,5 @@ ASYNC_REQUEST_FUNCS = {
|
||||
"openai-chat": async_request_openai_chat_completions,
|
||||
"tensorrt-llm": async_request_trt_llm,
|
||||
"scalellm": async_request_openai_completions,
|
||||
"sglang": async_request_openai_completions,
|
||||
}
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
"""Benchmark the latency of processing a single batch of requests."""
|
||||
import argparse
|
||||
import dataclasses
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
@ -10,50 +11,24 @@ import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
|
||||
from vllm.inputs import PromptInputs
|
||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.inputs import PromptType
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
print(args)
|
||||
|
||||
engine_args = EngineArgs.from_cli_args(args)
|
||||
|
||||
# NOTE(woosuk): If the request cannot be processed in a single batch,
|
||||
# the engine will automatically process the request in multiple batches.
|
||||
llm = LLM(
|
||||
model=args.model,
|
||||
speculative_model=args.speculative_model,
|
||||
num_speculative_tokens=args.num_speculative_tokens,
|
||||
speculative_draft_tensor_parallel_size=\
|
||||
args.speculative_draft_tensor_parallel_size,
|
||||
tokenizer=args.tokenizer,
|
||||
quantization=args.quantization,
|
||||
tensor_parallel_size=args.tensor_parallel_size,
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
dtype=args.dtype,
|
||||
max_model_len=args.max_model_len,
|
||||
enforce_eager=args.enforce_eager,
|
||||
kv_cache_dtype=args.kv_cache_dtype,
|
||||
quantization_param_path=args.quantization_param_path,
|
||||
device=args.device,
|
||||
ray_workers_use_nsight=args.ray_workers_use_nsight,
|
||||
use_v2_block_manager=args.use_v2_block_manager,
|
||||
enable_chunked_prefill=args.enable_chunked_prefill,
|
||||
download_dir=args.download_dir,
|
||||
block_size=args.block_size,
|
||||
gpu_memory_utilization=args.gpu_memory_utilization,
|
||||
load_format=args.load_format,
|
||||
distributed_executor_backend=args.distributed_executor_backend,
|
||||
otlp_traces_endpoint=args.otlp_traces_endpoint,
|
||||
enable_prefix_caching=args.enable_prefix_caching,
|
||||
)
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
n=args.n,
|
||||
temperature=0.0 if args.use_beam_search else 1.0,
|
||||
temperature=1.0,
|
||||
top_p=1.0,
|
||||
use_beam_search=args.use_beam_search,
|
||||
ignore_eos=True,
|
||||
max_tokens=args.output_len,
|
||||
)
|
||||
@ -61,7 +36,7 @@ def main(args: argparse.Namespace):
|
||||
dummy_prompt_token_ids = np.random.randint(10000,
|
||||
size=(args.batch_size,
|
||||
args.input_len))
|
||||
dummy_inputs: List[PromptInputs] = [{
|
||||
dummy_prompts: List[PromptType] = [{
|
||||
"prompt_token_ids": batch
|
||||
} for batch in dummy_prompt_token_ids.tolist()]
|
||||
|
||||
@ -74,13 +49,13 @@ def main(args: argparse.Namespace):
|
||||
],
|
||||
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
||||
str(profile_dir))) as p:
|
||||
llm.generate(dummy_inputs,
|
||||
llm.generate(dummy_prompts,
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=False)
|
||||
print(p.key_averages())
|
||||
else:
|
||||
start_time = time.perf_counter()
|
||||
llm.generate(dummy_inputs,
|
||||
llm.generate(dummy_prompts,
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=False)
|
||||
end_time = time.perf_counter()
|
||||
@ -127,19 +102,6 @@ if __name__ == '__main__':
|
||||
parser = FlexibleArgumentParser(
|
||||
description='Benchmark the latency of processing a single batch of '
|
||||
'requests till completion.')
|
||||
parser.add_argument('--model', type=str, default='facebook/opt-125m')
|
||||
parser.add_argument('--speculative-model', type=str, default=None)
|
||||
parser.add_argument('--num-speculative-tokens', type=int, default=None)
|
||||
parser.add_argument('--speculative-draft-tensor-parallel-size',
|
||||
'-spec-draft-tp',
|
||||
type=int,
|
||||
default=None)
|
||||
parser.add_argument('--tokenizer', type=str, default=None)
|
||||
parser.add_argument('--quantization',
|
||||
'-q',
|
||||
choices=[*QUANTIZATION_METHODS, None],
|
||||
default=None)
|
||||
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
|
||||
parser.add_argument('--input-len', type=int, default=32)
|
||||
parser.add_argument('--output-len', type=int, default=128)
|
||||
parser.add_argument('--batch-size', type=int, default=8)
|
||||
@ -156,45 +118,6 @@ if __name__ == '__main__':
|
||||
type=int,
|
||||
default=30,
|
||||
help='Number of iterations to run.')
|
||||
parser.add_argument('--trust-remote-code',
|
||||
action='store_true',
|
||||
help='trust remote code from huggingface')
|
||||
parser.add_argument(
|
||||
'--max-model-len',
|
||||
type=int,
|
||||
default=None,
|
||||
help='Maximum length of a sequence (including prompt and output). '
|
||||
'If None, will be derived from the model.')
|
||||
parser.add_argument(
|
||||
'--dtype',
|
||||
type=str,
|
||||
default='auto',
|
||||
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
|
||||
help='data type for model weights and activations. '
|
||||
'The "auto" option will use FP16 precision '
|
||||
'for FP32 and FP16 models, and BF16 precision '
|
||||
'for BF16 models.')
|
||||
parser.add_argument('--enforce-eager',
|
||||
action='store_true',
|
||||
help='enforce eager mode and disable CUDA graph')
|
||||
parser.add_argument(
|
||||
'--kv-cache-dtype',
|
||||
type=str,
|
||||
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
|
||||
default="auto",
|
||||
help='Data type for kv cache storage. If "auto", will use model '
|
||||
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
|
||||
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
|
||||
parser.add_argument(
|
||||
'--quantization-param-path',
|
||||
type=str,
|
||||
default=None,
|
||||
help='Path to the JSON file containing the KV cache scaling factors. '
|
||||
'This should generally be supplied, when KV cache dtype is FP8. '
|
||||
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
|
||||
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
|
||||
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
|
||||
'instead supported for common inference criteria.')
|
||||
parser.add_argument(
|
||||
'--profile',
|
||||
action='store_true',
|
||||
@ -205,79 +128,12 @@ if __name__ == '__main__':
|
||||
default=None,
|
||||
help=('path to save the pytorch profiler output. Can be visualized '
|
||||
'with ui.perfetto.dev or Tensorboard.'))
|
||||
parser.add_argument("--device",
|
||||
type=str,
|
||||
default="auto",
|
||||
choices=DEVICE_OPTIONS,
|
||||
help='device type for vLLM execution')
|
||||
parser.add_argument('--block-size',
|
||||
type=int,
|
||||
default=16,
|
||||
help='block size of key/value cache')
|
||||
parser.add_argument(
|
||||
'--enable-chunked-prefill',
|
||||
action='store_true',
|
||||
help='If True, the prefill requests can be chunked based on the '
|
||||
'max_num_batched_tokens')
|
||||
parser.add_argument("--enable-prefix-caching",
|
||||
action='store_true',
|
||||
help="Enable automatic prefix caching")
|
||||
parser.add_argument('--use-v2-block-manager', action='store_true')
|
||||
parser.add_argument(
|
||||
"--ray-workers-use-nsight",
|
||||
action='store_true',
|
||||
help="If specified, use nsight to profile ray workers",
|
||||
)
|
||||
parser.add_argument('--download-dir',
|
||||
type=str,
|
||||
default=None,
|
||||
help='directory to download and load the weights, '
|
||||
'default to the default cache dir of huggingface')
|
||||
parser.add_argument(
|
||||
'--output-json',
|
||||
type=str,
|
||||
default=None,
|
||||
help='Path to save the latency results in JSON format.')
|
||||
parser.add_argument('--gpu-memory-utilization',
|
||||
type=float,
|
||||
default=0.9,
|
||||
help='the fraction of GPU memory to be used for '
|
||||
'the model executor, which can range from 0 to 1.'
|
||||
'If unspecified, will use the default value of 0.9.')
|
||||
parser.add_argument(
|
||||
'--load-format',
|
||||
type=str,
|
||||
default=EngineArgs.load_format,
|
||||
choices=[
|
||||
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
|
||||
'bitsandbytes'
|
||||
],
|
||||
help='The format of the model weights to load.\n\n'
|
||||
'* "auto" will try to load the weights in the safetensors format '
|
||||
'and fall back to the pytorch bin format if safetensors format '
|
||||
'is not available.\n'
|
||||
'* "pt" will load the weights in the pytorch bin format.\n'
|
||||
'* "safetensors" will load the weights in the safetensors format.\n'
|
||||
'* "npcache" will load the weights in pytorch format and store '
|
||||
'a numpy cache to speed up the loading.\n'
|
||||
'* "dummy" will initialize the weights with random values, '
|
||||
'which is mainly for profiling.\n'
|
||||
'* "tensorizer" will load the weights using tensorizer from '
|
||||
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
|
||||
'section for more information.\n'
|
||||
'* "bitsandbytes" will load the weights using bitsandbytes '
|
||||
'quantization.\n')
|
||||
parser.add_argument(
|
||||
'--distributed-executor-backend',
|
||||
choices=['ray', 'mp'],
|
||||
default=None,
|
||||
help='Backend to use for distributed serving. When more than 1 GPU '
|
||||
'is used, will be automatically set to "ray" if installed '
|
||||
'or "mp" (multiprocessing) otherwise.')
|
||||
parser.add_argument(
|
||||
'--otlp-traces-endpoint',
|
||||
type=str,
|
||||
default=None,
|
||||
help='Target URL to which OpenTelemetry traces will be sent.')
|
||||
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
@ -25,6 +25,7 @@ ShareGPT example usage:
|
||||
--input-length-range 128:256
|
||||
"""
|
||||
|
||||
import dataclasses
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
@ -33,6 +34,7 @@ from typing import List, Optional, Tuple
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
try:
|
||||
@ -113,10 +115,10 @@ def repeat_and_sort_requests(requests: List[Tuple[str, int, int]],
|
||||
def main(args):
|
||||
tokenizer = get_tokenizer(args.model, trust_remote_code=True)
|
||||
input_length_range = tuple(map(int, args.input_length_range.split(':')))
|
||||
|
||||
random.seed(args.seed)
|
||||
if args.dataset_path is not None:
|
||||
print(f"Start to sample {args.num_prompts} prompts"
|
||||
"from {args.dataset_path}")
|
||||
f"from {args.dataset_path}")
|
||||
filtered_datasets = sample_requests(
|
||||
dataset_path=args.dataset_path,
|
||||
num_requests=args.num_prompts,
|
||||
@ -129,13 +131,9 @@ def main(args):
|
||||
filtered_datasets = [(PROMPT, prompt_len, args.output_len)
|
||||
] * args.num_prompts
|
||||
|
||||
llm = LLM(model=args.model,
|
||||
tokenizer_mode='auto',
|
||||
trust_remote_code=True,
|
||||
enforce_eager=True,
|
||||
use_v2_block_manager=args.use_v2_block_manager,
|
||||
tensor_parallel_size=args.tensor_parallel_size,
|
||||
enable_prefix_caching=args.enable_prefix_caching)
|
||||
engine_args = EngineArgs.from_cli_args(args)
|
||||
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
|
||||
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
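The benchmarks now delegate all engine flags to `EngineArgs.add_cli_args` and expand the parsed dataclass straight into the `LLM` constructor instead of hand-copying keyword arguments. A hedged sketch of that pattern (the model name and flags here are placeholders and a working vLLM install is assumed):

```python
import dataclasses

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser

parser = FlexibleArgumentParser(description="toy EngineArgs-driven example")
parser = EngineArgs.add_cli_args(parser)  # registers --model, --dtype, -tp, ... in one place
args = parser.parse_args(["--model", "facebook/opt-125m", "--enforce-eager"])

engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**dataclasses.asdict(engine_args))  # no hand-maintained keyword list
outputs = llm.generate(["Hello"], SamplingParams(max_tokens=8))
print(outputs[0].outputs[0].text)
```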
|
||||
|
||||
@ -144,13 +142,6 @@ def main(args):
|
||||
repeat_count=args.repeat_count,
|
||||
sort=args.sort)
|
||||
|
||||
print("------warm up------")
|
||||
test_prefix(
|
||||
llm=llm,
|
||||
prompts=prompts,
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
|
||||
print("------start generating------")
|
||||
test_prefix(
|
||||
llm=llm,
|
||||
@ -163,21 +154,11 @@ if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(
|
||||
description=
|
||||
'Benchmark the performance with or without automatic prefix caching.')
|
||||
parser.add_argument('--model',
|
||||
type=str,
|
||||
default='baichuan-inc/Baichuan2-13B-Chat')
|
||||
parser.add_argument("--dataset-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the dataset.")
|
||||
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
|
||||
parser.add_argument('--output-len', type=int, default=10)
|
||||
parser.add_argument('--enable-prefix-caching',
|
||||
action='store_true',
|
||||
help='enable prefix caching')
|
||||
parser.add_argument('--use-v2-block-manager',
|
||||
action='store_true',
|
||||
help='Use BlockSpaceMangerV2')
|
||||
parser.add_argument('--num-prompts',
|
||||
type=int,
|
||||
default=1,
|
||||
@ -194,5 +175,7 @@ if __name__ == "__main__":
|
||||
default='128:256',
|
||||
help='Range of input lengths for sampling prompts,'
|
||||
'specified as "min:max" (e.g., "128:256").')
|
||||
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
177
benchmarks/benchmark_prioritization.py
Normal file
177
benchmarks/benchmark_prioritization.py
Normal file
@ -0,0 +1,177 @@
|
||||
"""Benchmark offline prioritization."""
|
||||
import argparse
|
||||
import dataclasses
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
def sample_requests(
|
||||
dataset_path: str,
|
||||
num_requests: int,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
fixed_output_len: Optional[int],
|
||||
) -> List[Tuple[str, int, int, int]]:
|
||||
if fixed_output_len is not None and fixed_output_len < 4:
|
||||
raise ValueError("output_len too small")
|
||||
|
||||
# Load the dataset.
|
||||
with open(dataset_path) as f:
|
||||
dataset = json.load(f)
|
||||
# Filter out the conversations with less than 2 turns.
|
||||
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
|
||||
# Only keep the first two turns of each conversation.
|
||||
dataset = [(data["conversations"][0]["value"],
|
||||
data["conversations"][1]["value"]) for data in dataset]
|
||||
|
||||
# Shuffle the dataset.
|
||||
random.shuffle(dataset)
|
||||
|
||||
# Filter out sequences that are too long or too short
|
||||
filtered_dataset: List[Tuple[str, int, int]] = []
|
||||
for i in range(len(dataset)):
|
||||
if len(filtered_dataset) == num_requests:
|
||||
break
|
||||
|
||||
# Tokenize the prompts and completions.
|
||||
prompt = dataset[i][0]
|
||||
prompt_token_ids = tokenizer(prompt).input_ids
|
||||
completion = dataset[i][1]
|
||||
completion_token_ids = tokenizer(completion).input_ids
|
||||
prompt_len = len(prompt_token_ids)
|
||||
output_len = len(completion_token_ids
|
||||
) if fixed_output_len is None else fixed_output_len
|
||||
if prompt_len < 4 or output_len < 4:
|
||||
# Prune too short sequences.
|
||||
continue
|
||||
if prompt_len > 1024 or prompt_len + output_len > 2048:
|
||||
# Prune too long sequences.
|
||||
continue
|
||||
|
||||
# Select an equi-probable random priority
|
||||
priority = 0 if random.random() < 0.5 else 1
|
||||
|
||||
filtered_dataset.append((prompt, prompt_len, output_len, priority))
|
||||
|
||||
return filtered_dataset
|
||||
|
||||
|
||||
def run_vllm(
|
||||
requests: List[Tuple[str, int, int, int]],
|
||||
n: int,
|
||||
engine_args: EngineArgs,
|
||||
) -> float:
|
||||
from vllm import LLM, SamplingParams
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
|
||||
# Add the requests to the engine.
|
||||
prompts = []
|
||||
sampling_params = []
|
||||
priority = []
|
||||
for prompt, _, output_len, _priority in requests:
|
||||
prompts.append(prompt)
|
||||
priority.append(_priority)
|
||||
sampling_params.append(
|
||||
SamplingParams(
|
||||
n=n,
|
||||
temperature=1.0,
|
||||
top_p=1.0,
|
||||
ignore_eos=True,
|
||||
max_tokens=output_len,
|
||||
))
|
||||
|
||||
start = time.perf_counter()
|
||||
llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True)
|
||||
end = time.perf_counter()
|
||||
return end - start
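The only engine-facing change this benchmark exercises is the `priority=` argument to `llm.generate`. A hedged usage sketch follows; it assumes a priority-aware scheduler is enabled (for example `scheduling_policy="priority"` in recent vLLM releases) and that lower values are served earlier, matching the 0/1 priorities sampled above:

```python
from vllm import LLM, SamplingParams

# Assumption: without a priority-aware scheduling policy, per-request
# priorities are ignored or rejected by the engine.
llm = LLM(model="facebook/opt-125m", scheduling_policy="priority")
prompts = ["urgent request", "background request"]
priorities = [0, 1]  # one value per prompt; lower = more urgent (assumed convention)
params = SamplingParams(temperature=1.0, top_p=1.0, ignore_eos=True, max_tokens=16)

outputs = llm.generate(prompts, [params] * len(prompts), priority=priorities)
for out in outputs:
    print(out.outputs[0].text)
```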
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
print(args)
|
||||
random.seed(args.seed)
|
||||
|
||||
# Sample the requests.
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
args.tokenizer, trust_remote_code=args.trust_remote_code)
|
||||
if args.dataset is None:
|
||||
# Synthesize a prompt with the given input length.
|
||||
prompt = "hi" * (args.input_len - 1)
|
||||
requests = [(prompt, args.input_len, args.output_len)
|
||||
for _ in range(args.num_prompts)]
|
||||
else:
|
||||
requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
|
||||
args.output_len)
|
||||
|
||||
if args.backend == "vllm":
|
||||
elapsed_time = run_vllm(requests, args.n,
|
||||
EngineArgs.from_cli_args(args))
|
||||
else:
|
||||
raise ValueError(f"Unknown backend: {args.backend}")
|
||||
total_num_tokens = sum(prompt_len + output_len
|
||||
for _, prompt_len, output_len, priority in requests)
|
||||
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
|
||||
f"{total_num_tokens / elapsed_time:.2f} tokens/s")
|
||||
|
||||
# Output JSON results if specified
|
||||
if args.output_json:
|
||||
results = {
|
||||
"elapsed_time": elapsed_time,
|
||||
"num_requests": len(requests),
|
||||
"total_num_tokens": total_num_tokens,
|
||||
"requests_per_second": len(requests) / elapsed_time,
|
||||
"tokens_per_second": total_num_tokens / elapsed_time,
|
||||
}
|
||||
with open(args.output_json, "w") as f:
|
||||
json.dump(results, f, indent=4)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
|
||||
parser.add_argument("--backend",
|
||||
type=str,
|
||||
choices=["vllm", "hf", "mii"],
|
||||
default="vllm")
|
||||
parser.add_argument("--dataset",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the dataset.")
|
||||
parser.add_argument("--input-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Input prompt length for each request")
|
||||
parser.add_argument("--output-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Output length for each request. Overrides the "
|
||||
"output length from the dataset.")
|
||||
parser.add_argument("--n",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of generated sequences per prompt.")
|
||||
parser.add_argument("--num-prompts",
|
||||
type=int,
|
||||
default=200,
|
||||
help="Number of prompts to process.")
|
||||
parser.add_argument(
|
||||
'--output-json',
|
||||
type=str,
|
||||
default=None,
|
||||
help='Path to save the throughput results in JSON format.')
|
||||
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
args = parser.parse_args()
|
||||
if args.tokenizer is None:
|
||||
args.tokenizer = args.model
|
||||
if args.dataset is None:
|
||||
assert args.input_len is not None
|
||||
assert args.output_len is not None
|
||||
else:
|
||||
assert args.input_len is None
|
||||
|
||||
main(args)
|
||||
@ -1,4 +1,4 @@
|
||||
"""Benchmark online serving throughput.
|
||||
r"""Benchmark online serving throughput.
|
||||
|
||||
On the server side, run one of the following commands:
|
||||
vLLM OpenAI API server
|
||||
@ -24,6 +24,8 @@ On the client side, run:
|
||||
"""
|
||||
import argparse
|
||||
import asyncio
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
@ -31,11 +33,13 @@ import time
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
|
||||
from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
|
||||
RequestFuncOutput)
|
||||
from datasets import load_dataset
|
||||
from PIL.Image import Image
|
||||
from tqdm.asyncio import tqdm
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
@ -49,6 +53,8 @@ try:
|
||||
except ImportError:
|
||||
from argparse import ArgumentParser as FlexibleArgumentParser
|
||||
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
|
||||
|
||||
|
||||
@dataclass
|
||||
class BenchmarkMetrics:
|
||||
@ -56,6 +62,7 @@ class BenchmarkMetrics:
|
||||
total_input: int
|
||||
total_output: int
|
||||
request_throughput: float
|
||||
request_goodput: float
|
||||
output_throughput: float
|
||||
total_token_throughput: float
|
||||
mean_ttft_ms: float
|
||||
@ -84,11 +91,9 @@ def sample_sharegpt_requests(
|
||||
num_requests: int,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
fixed_output_len: Optional[int] = None,
|
||||
) -> List[Tuple[str, int, int]]:
|
||||
if fixed_output_len is not None and fixed_output_len < 4:
|
||||
raise ValueError("output_len too small")
|
||||
) -> List[Tuple[str, int, int, None]]:
|
||||
# Load the dataset.
|
||||
with open(dataset_path) as f:
|
||||
with open(dataset_path, encoding='utf-8') as f:
|
||||
dataset = json.load(f)
|
||||
# Filter out the conversations with less than 2 turns.
|
||||
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
|
||||
@ -113,13 +118,13 @@ def sample_sharegpt_requests(
|
||||
prompt_len = len(prompt_token_ids)
|
||||
output_len = len(completion_token_ids
|
||||
) if fixed_output_len is None else fixed_output_len
|
||||
if prompt_len < 4 or output_len < 4:
|
||||
if prompt_len < 4 or (fixed_output_len is None and output_len < 4):
|
||||
# Prune too short sequences.
|
||||
continue
|
||||
if prompt_len > 1024 or prompt_len + output_len > 2048:
|
||||
# Prune too long sequences.
|
||||
continue
|
||||
filtered_dataset.append((prompt, prompt_len, output_len))
|
||||
filtered_dataset.append((prompt, prompt_len, output_len, None))
|
||||
|
||||
return filtered_dataset
|
||||
|
||||
@ -131,13 +136,13 @@ def sample_sonnet_requests(
|
||||
output_len: int,
|
||||
prefix_len: int,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
) -> List[Tuple[str, str, int, int]]:
|
||||
) -> List[Tuple[str, str, int, int, None]]:
|
||||
assert (
|
||||
input_len > prefix_len
|
||||
), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."
|
||||
|
||||
# Load the dataset.
|
||||
with open(dataset_path) as f:
|
||||
with open(dataset_path, encoding='utf-8') as f:
|
||||
poem_lines = f.readlines()
|
||||
|
||||
# Tokenize the poem lines.
|
||||
@ -174,9 +179,9 @@ def sample_sonnet_requests(
|
||||
# Sample the rest of lines per request.
|
||||
sampled_requests: List[Tuple[str, int, int]] = []
|
||||
for _ in range(num_requests):
|
||||
sampled_lines = "".join(
|
||||
prefix_lines +
|
||||
random.sample(poem_lines, num_input_lines - num_prefix_lines))
|
||||
num_lines_needed = num_input_lines - num_prefix_lines
|
||||
sampled_lines = "".join(prefix_lines +
|
||||
random.choices(poem_lines, k=num_lines_needed))
|
||||
|
||||
prompt = f"{base_prompt}{sampled_lines}"
|
||||
message = [
|
||||
@ -189,7 +194,67 @@ def sample_sonnet_requests(
|
||||
message, add_generation_prompt=True, tokenize=False)
|
||||
prompt_len = len(tokenizer(prompt_formatted).input_ids)
|
||||
sampled_requests.append(
|
||||
(prompt, prompt_formatted, prompt_len, output_len))
|
||||
(prompt, prompt_formatted, prompt_len, output_len, None))
|
||||
|
||||
return sampled_requests
|
||||
|
||||
|
||||
def sample_hf_requests(
|
||||
dataset_path: str,
|
||||
dataset_subset: str,
|
||||
dataset_split: str,
|
||||
num_requests: int,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
random_seed: int,
|
||||
fixed_output_len: Optional[int] = None,
|
||||
) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
|
||||
dataset = load_dataset(dataset_path,
|
||||
name=dataset_subset,
|
||||
split=dataset_split,
|
||||
streaming=True)
|
||||
assert "conversations" in dataset.features, (
|
||||
"HF Dataset must have 'conversations' column.")
|
||||
filter_func = lambda x: len(x["conversations"]) >= 2
|
||||
filtered_dataset = dataset.shuffle(seed=random_seed).filter(filter_func)
|
||||
sampled_requests: List[Tuple[str, int, int, Dict[str,
|
||||
Collection[str]]]] = []
|
||||
for data in filtered_dataset:
|
||||
if len(sampled_requests) == num_requests:
|
||||
break
|
||||
|
||||
# Tokenize the prompts and completions.
|
||||
prompt = data["conversations"][0]["value"]
|
||||
prompt_token_ids = tokenizer(prompt).input_ids
|
||||
completion = data["conversations"][1]["value"]
|
||||
completion_token_ids = tokenizer(completion).input_ids
|
||||
prompt_len = len(prompt_token_ids)
|
||||
output_len = len(completion_token_ids
|
||||
) if fixed_output_len is None else fixed_output_len
|
||||
if fixed_output_len is None and (prompt_len < 4 or output_len < 4):
|
||||
# Prune too short sequences.
|
||||
continue
|
||||
if fixed_output_len is None and \
|
||||
(prompt_len > 1024 or prompt_len + output_len > 2048):
|
||||
# Prune too long sequences.
|
||||
continue
|
||||
|
||||
if "image" in data and isinstance(data["image"], Image):
|
||||
image: Image = data["image"]
|
||||
image = image.convert("RGB")
|
||||
image_data = io.BytesIO()
|
||||
image.save(image_data, format='JPEG')
|
||||
image_base64 = base64.b64encode(
|
||||
image_data.getvalue()).decode("utf-8")
|
||||
mm_content = {
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image/jpeg;base64,{image_base64}"
|
||||
},
|
||||
}
|
||||
else:
|
||||
mm_content = None
|
||||
|
||||
sampled_requests.append((prompt, prompt_len, output_len, mm_content))
|
||||
|
||||
return sampled_requests
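To make the multi-modal plumbing concrete, here is a small self-contained sketch (illustrative only, using Pillow and a synthetic image) of how an `mm_content` entry like the one built above ends up inside the OpenAI-chat message that the benchmark sends:

```python
import base64
import io

from PIL import Image  # assumes Pillow is installed

img = Image.new("RGB", (8, 8), "red")  # stand-in for a dataset image
buf = io.BytesIO()
img.save(buf, format="JPEG")
image_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")

# Same shape as the mm_content dict produced by sample_hf_requests.
mm_content = {"type": "image_url",
              "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}}
content = [{"type": "text", "text": "Describe the image."}, mm_content]
messages = [{"role": "user", "content": content}]
print(messages[0]["content"][1]["type"])  # image_url
```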
|
||||
|
||||
@ -223,8 +288,8 @@ def sample_random_requests(
|
||||
[(offsets[i] + i + j) % tokenizer.vocab_size
|
||||
for j in range(input_lens[i])])
|
||||
|
||||
input_requests.append(
|
||||
(prompt, int(prefix_len + input_lens[i]), int(output_lens[i])))
|
||||
input_requests.append((prompt, int(prefix_len + input_lens[i]),
|
||||
int(output_lens[i]), None))
|
||||
|
||||
return input_requests
|
||||
|
||||
@ -232,8 +297,33 @@ def sample_random_requests(
|
||||
async def get_request(
|
||||
input_requests: List[Tuple[str, int, int]],
|
||||
request_rate: float,
|
||||
burstiness: float = 1.0,
|
||||
) -> AsyncGenerator[Tuple[str, int, int], None]:
|
||||
"""
|
||||
Asynchronously generates requests at a specified rate
|
||||
with OPTIONAL burstiness.
|
||||
|
||||
Args:
|
||||
input_requests:
|
||||
A list of input requests, each represented as a tuple.
|
||||
request_rate:
|
||||
The rate at which requests are generated (requests/s).
|
||||
burstiness (optional):
|
||||
The burstiness factor of the request generation.
|
||||
Only takes effect when request_rate is not inf.
|
||||
Default value is 1, which follows a Poisson process.
|
||||
Otherwise, the request intervals follow a gamma distribution.
|
||||
A lower burstiness value (0 < burstiness < 1) results
|
||||
in more bursty requests, while a higher burstiness value
|
||||
(burstiness > 1) results in a more uniform arrival of requests.
|
||||
"""
|
||||
input_requests = iter(input_requests)
|
||||
|
||||
# Calculate scale parameter theta to maintain the desired request_rate.
|
||||
assert burstiness > 0, (
|
||||
f"A positive burstiness factor is expected, but given {burstiness}.")
|
||||
theta = 1.0 / (request_rate * burstiness)
|
||||
|
||||
for request in input_requests:
|
||||
yield request
|
||||
|
||||
@ -241,8 +331,9 @@ async def get_request(
|
||||
# If the request rate is infinity, then we don't need to wait.
|
||||
continue
|
||||
|
||||
# Sample the request interval from the exponential distribution.
|
||||
interval = np.random.exponential(1.0 / request_rate)
|
||||
# Sample the request interval from the gamma distribution.
|
||||
# If burstiness is 1, it follows exponential distribution.
|
||||
interval = np.random.gamma(shape=burstiness, scale=theta)
|
||||
# The next request will be sent after the interval.
|
||||
await asyncio.sleep(interval)
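The docstring above can be checked numerically: with `shape=burstiness` and `scale=1/(request_rate*burstiness)`, the mean gap stays at `1/request_rate` for any burstiness, and `burstiness=1` collapses to the previous exponential (Poisson) behaviour. A quick sketch:

```python
import numpy as np

request_rate = 10.0  # requests per second
for burstiness in (0.5, 1.0, 2.0):
    theta = 1.0 / (request_rate * burstiness)
    gaps = np.random.gamma(shape=burstiness, scale=theta, size=100_000)
    # The mean interval always targets 1/request_rate; smaller burstiness
    # gives a larger coefficient of variation, i.e. spikier arrivals.
    print(f"burstiness={burstiness}: mean={gaps.mean():.3f}s "
          f"(target {1/request_rate:.3f}s), cv={gaps.std()/gaps.mean():.2f}")
# burstiness=1.0 reproduces the exponential case: cv is approximately 1.
```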
|
||||
|
||||
@ -254,12 +345,15 @@ def calculate_metrics(
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
selected_percentile_metrics: List[str],
|
||||
selected_percentiles: List[float],
|
||||
gootput_config_dict: Dict[str, float],
|
||||
) -> Tuple[BenchmarkMetrics, List[int]]:
|
||||
actual_output_lens: List[int] = []
|
||||
total_input = 0
|
||||
completed = 0
|
||||
good_completed = 0
|
||||
itls: List[float] = []
|
||||
tpots: List[float] = []
|
||||
all_tpots: List[float] = []
|
||||
ttfts: List[float] = []
|
||||
e2els: List[float] = []
|
||||
for i in range(len(outputs)):
|
||||
@ -273,9 +367,13 @@ def calculate_metrics(
|
||||
add_special_tokens=False).input_ids)
|
||||
actual_output_lens.append(output_len)
|
||||
total_input += input_requests[i][1]
|
||||
tpot = 0
|
||||
if output_len > 1:
|
||||
tpots.append(
|
||||
(outputs[i].latency - outputs[i].ttft) / (output_len - 1))
|
||||
tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
|
||||
1)
|
||||
tpots.append(tpot)
|
||||
# Note: if output_len <= 1, we regard tpot as 0 for goodput
|
||||
all_tpots.append(tpot)
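A worked example of the per-token latency (TPOT) formula above, for clarity:

```python
# With 2.0 s total latency, 0.5 s TTFT and 31 output tokens:
# TPOT = (2.0 - 0.5) / (31 - 1) = 0.05 s per decoded token.
latency, ttft, output_len = 2.0, 0.5, 31
tpot = (latency - ttft) / (output_len - 1) if output_len > 1 else 0
print(tpot)  # 0.05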
|
||||
itls += outputs[i].itl
|
||||
ttfts.append(outputs[i].ttft)
|
||||
e2els.append(outputs[i].latency)
|
||||
@ -283,6 +381,28 @@ def calculate_metrics(
|
||||
else:
|
||||
actual_output_lens.append(0)
|
||||
|
||||
if gootput_config_dict:
|
||||
valid_metrics = []
|
||||
slo_values = []
|
||||
|
||||
if "ttft" in gootput_config_dict:
|
||||
valid_metrics.append(ttfts)
|
||||
slo_values.append(gootput_config_dict["ttft"] /
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
if "tpot" in gootput_config_dict:
|
||||
valid_metrics.append(all_tpots)
|
||||
slo_values.append(gootput_config_dict["tpot"] /
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
if "e2el" in gootput_config_dict:
|
||||
valid_metrics.append(e2els)
|
||||
slo_values.append(gootput_config_dict["e2el"] /
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
|
||||
for req_metric in zip(*valid_metrics):
|
||||
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
|
||||
if is_good_req:
|
||||
good_completed += 1
|
||||
|
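A minimal sketch of the goodput test above, assuming hypothetical SLOs of 300 ms TTFT and 50 ms TPOT; SLO values are given in milliseconds (as on the CLI) while the observed metrics are in seconds, hence the conversion constant:

MILLISECONDS_TO_SECONDS_CONVERSION = 1000
slo_ms = {"ttft": 300.0, "tpot": 50.0}      # hypothetical service level objectives
observed_s = {"ttft": 0.21, "tpot": 0.047}  # one request's measured metrics
is_good_req = all(
    slo_ms[name] / MILLISECONDS_TO_SECONDS_CONVERSION >= observed_s[name]
    for name in slo_ms)
# is_good_req is True, so this request would count toward goodput.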
||||
if completed == 0:
|
||||
warnings.warn(
|
||||
"All requests failed. This is likely due to a misconfiguration "
|
||||
@ -293,6 +413,7 @@ def calculate_metrics(
|
||||
total_input=total_input,
|
||||
total_output=sum(actual_output_lens),
|
||||
request_throughput=completed / dur_s,
|
||||
request_goodput=good_completed / dur_s,
|
||||
output_throughput=sum(actual_output_lens) / dur_s,
|
||||
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
|
||||
mean_ttft_ms=np.mean(ttfts or 0) *
|
||||
@ -311,9 +432,9 @@ def calculate_metrics(
|
||||
median_itl_ms=np.median(itls or 0) * 1000,
|
||||
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
|
||||
for p in selected_percentiles],
|
||||
mean_e2el_ms=np.median(e2els or 0) * 1000,
|
||||
mean_e2el_ms=np.mean(e2els or 0) * 1000,
|
||||
std_e2el_ms=np.std(e2els or 0) * 1000,
|
||||
median_e2el_ms=np.mean(e2els or 0) * 1000,
|
||||
median_e2el_ms=np.median(e2els or 0) * 1000,
|
||||
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
|
||||
for p in selected_percentiles],
|
||||
)
|
||||
@ -330,12 +451,15 @@ async def benchmark(
|
||||
input_requests: List[Tuple[str, int, int]],
|
||||
logprobs: Optional[int],
|
||||
best_of: int,
|
||||
use_beam_search: bool,
|
||||
request_rate: float,
|
||||
burstiness: float,
|
||||
disable_tqdm: bool,
|
||||
profile: bool,
|
||||
selected_percentile_metrics: List[str],
|
||||
selected_percentiles: List[str],
|
||||
ignore_eos: bool,
|
||||
gootput_config_dict: Dict[str, float],
|
||||
max_concurrency: Optional[int],
|
||||
):
|
||||
if backend in ASYNC_REQUEST_FUNCS:
|
||||
request_func = ASYNC_REQUEST_FUNCS[backend]
|
||||
@ -343,7 +467,12 @@ async def benchmark(
|
||||
raise ValueError(f"Unknown backend: {backend}")
|
||||
|
||||
print("Starting initial single prompt test run...")
|
||||
test_prompt, test_prompt_len, test_output_len = input_requests[0]
|
||||
test_prompt, test_prompt_len, test_output_len, test_mm_content = (
|
||||
input_requests[0])
|
||||
if backend != "openai-chat" and test_mm_content is not None:
|
||||
# multi-modal benchmark is only available on OpenAI Chat backend.
|
||||
raise ValueError(
|
||||
"Multi-modal content is only supported on 'openai-chat' backend.")
|
||||
test_input = RequestFuncInput(
|
||||
model=model_id,
|
||||
prompt=test_prompt,
|
||||
@ -352,7 +481,8 @@ async def benchmark(
|
||||
output_len=test_output_len,
|
||||
logprobs=logprobs,
|
||||
best_of=best_of,
|
||||
use_beam_search=use_beam_search,
|
||||
multi_modal_content=test_mm_content,
|
||||
ignore_eos=ignore_eos,
|
||||
)
|
||||
test_output = await request_func(request_func_input=test_input)
|
||||
if not test_output.success:
|
||||
@ -364,42 +494,62 @@ async def benchmark(
|
||||
|
||||
if profile:
|
||||
print("Starting profiler...")
|
||||
profile_input = RequestFuncInput(
|
||||
model=model_id,
|
||||
prompt=test_prompt,
|
||||
api_url=base_url + "/start_profile",
|
||||
prompt_len=test_prompt_len,
|
||||
output_len=test_output_len,
|
||||
logprobs=logprobs,
|
||||
best_of=best_of,
|
||||
use_beam_search=use_beam_search,
|
||||
)
|
||||
profile_input = RequestFuncInput(model=model_id,
|
||||
prompt=test_prompt,
|
||||
api_url=base_url + "/start_profile",
|
||||
prompt_len=test_prompt_len,
|
||||
output_len=test_output_len,
|
||||
logprobs=logprobs,
|
||||
best_of=best_of,
|
||||
multi_modal_content=test_mm_content,
|
||||
ignore_eos=ignore_eos)
|
||||
profile_output = await request_func(request_func_input=profile_input)
|
||||
if profile_output.success:
|
||||
print("Profiler started")
|
||||
|
||||
if burstiness == 1.0:
|
||||
distribution = "Poisson process"
|
||||
else:
|
||||
distribution = "Gamma distribution"
|
||||
|
||||
print(f"Traffic request rate: {request_rate}")
|
||||
print(f"Burstiness factor: {burstiness} ({distribution})")
|
||||
print(f"Maximum request concurrency: {max_concurrency}")
|
||||
|
||||
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
|
||||
|
||||
# This can be used once the minimum Python version is 3.10 or higher,
|
||||
# and it will simplify the code in limited_request_func.
|
||||
# semaphore = (asyncio.Semaphore(max_concurrency)
|
||||
# if max_concurrency else contextlib.nullcontext())
|
||||
semaphore = (asyncio.Semaphore(max_concurrency)
|
||||
if max_concurrency else None)
|
||||
|
||||
async def limited_request_func(request_func_input, pbar):
|
||||
if semaphore is None:
|
||||
return await request_func(request_func_input=request_func_input,
|
||||
pbar=pbar)
|
||||
async with semaphore:
|
||||
return await request_func(request_func_input=request_func_input,
|
||||
pbar=pbar)
|
||||
|
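For reference, the commented-out variant mentioned above would collapse the helper into a single branch once Python 3.10 (where contextlib.nullcontext gained async support) is the minimum version. This is an illustrative sketch, not part of the diff; request_func and max_concurrency come from the surrounding benchmark code.

import asyncio
import contextlib

semaphore = (asyncio.Semaphore(max_concurrency)
             if max_concurrency else contextlib.nullcontext())

async def limited_request_func(request_func_input, pbar):
    # nullcontext() is a no-op (async) context manager, so the unlimited case
    # needs no separate code path.
    async with semaphore:
        return await request_func(request_func_input=request_func_input,
                                  pbar=pbar)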
||||
benchmark_start_time = time.perf_counter()
|
||||
tasks: List[asyncio.Task] = []
|
||||
async for request in get_request(input_requests, request_rate):
|
||||
prompt, prompt_len, output_len = request
|
||||
request_func_input = RequestFuncInput(
|
||||
model=model_id,
|
||||
prompt=prompt,
|
||||
api_url=api_url,
|
||||
prompt_len=prompt_len,
|
||||
output_len=output_len,
|
||||
logprobs=logprobs,
|
||||
best_of=best_of,
|
||||
use_beam_search=use_beam_search,
|
||||
)
|
||||
async for request in get_request(input_requests, request_rate, burstiness):
|
||||
prompt, prompt_len, output_len, mm_content = request
|
||||
request_func_input = RequestFuncInput(model=model_id,
|
||||
prompt=prompt,
|
||||
api_url=api_url,
|
||||
prompt_len=prompt_len,
|
||||
output_len=output_len,
|
||||
logprobs=logprobs,
|
||||
best_of=best_of,
|
||||
multi_modal_content=mm_content,
|
||||
ignore_eos=ignore_eos)
|
||||
tasks.append(
|
||||
asyncio.create_task(
|
||||
request_func(request_func_input=request_func_input,
|
||||
pbar=pbar)))
|
||||
limited_request_func(request_func_input=request_func_input,
|
||||
pbar=pbar)))
|
||||
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
|
||||
|
||||
if profile:
|
||||
@ -412,7 +562,6 @@ async def benchmark(
|
||||
output_len=test_output_len,
|
||||
logprobs=logprobs,
|
||||
best_of=best_of,
|
||||
use_beam_search=use_beam_search,
|
||||
)
|
||||
profile_output = await request_func(request_func_input=profile_input)
|
||||
if profile_output.success:
|
||||
@ -430,6 +579,7 @@ async def benchmark(
|
||||
tokenizer=tokenizer,
|
||||
selected_percentile_metrics=selected_percentile_metrics,
|
||||
selected_percentiles=selected_percentiles,
|
||||
gootput_config_dict=gootput_config_dict,
|
||||
)
|
||||
|
||||
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
|
||||
@ -441,6 +591,9 @@ async def benchmark(
|
||||
metrics.total_output))
|
||||
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
|
||||
metrics.request_throughput))
|
||||
if gootput_config_dict:
|
||||
print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
|
||||
metrics.request_goodput))
|
||||
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
|
||||
metrics.output_throughput))
|
||||
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
|
||||
@ -452,6 +605,8 @@ async def benchmark(
|
||||
"total_input_tokens": metrics.total_input,
|
||||
"total_output_tokens": metrics.total_output,
|
||||
"request_throughput": metrics.request_throughput,
|
||||
"request_goodput:":
|
||||
metrics.request_goodput if gootput_config_dict else None,
|
||||
"output_throughput": metrics.output_throughput,
|
||||
"total_token_throughput": metrics.total_token_throughput,
|
||||
"input_lens": [output.prompt_len for output in outputs],
|
||||
@ -470,7 +625,7 @@ async def benchmark(
|
||||
# E.g., "Time to First Token"
|
||||
metric_header: str,
|
||||
):
|
||||
# This function print and add statistics of the specified
|
||||
# This function prints and adds statistics of the specified
|
||||
# metric.
|
||||
if metric_attribute_name not in selected_percentile_metrics:
|
||||
return
|
||||
@ -505,6 +660,41 @@ async def benchmark(
|
||||
return result
|
||||
|
||||
|
||||
def check_goodput_args(args):
|
||||
# Check and parse goodput arguments
|
||||
gootput_config_dict = {}
|
||||
VALID_NAMES = ["ttft", "tpot", "e2el"]
|
||||
if args.goodput:
|
||||
gootput_config_dict = parse_goodput(args.goodput)
|
||||
for slo_name, slo_val in gootput_config_dict.items():
|
||||
if slo_name not in VALID_NAMES:
|
||||
raise ValueError(
|
||||
f"Invalid metric name found, {slo_name}: {slo_val}. "
|
||||
"The service level objective name should be one of "
|
||||
f"{str(VALID_NAMES)}. ")
|
||||
if slo_val < 0:
|
||||
raise ValueError(
|
||||
f"Invalid value found, {slo_name}: {slo_val}. "
|
||||
"The service level objective value should be "
|
||||
"non-negative.")
|
||||
return gootput_config_dict
|
||||
|
||||
|
||||
def parse_goodput(slo_pairs):
|
||||
gootput_config_dict = {}
|
||||
try:
|
||||
for slo_pair in slo_pairs:
|
||||
slo_name, slo_val = slo_pair.split(":")
|
||||
gootput_config_dict[slo_name] = float(slo_val)
|
||||
except ValueError as err:
|
||||
raise argparse.ArgumentTypeError(
|
||||
"Invalid format found for service level objectives. "
|
||||
"Specify service level objectives for goodput as \"KEY:VALUE\" "
|
||||
"pairs, where the key is a metric name, and the value is a "
|
||||
"number in milliseconds.") from err
|
||||
return gootput_config_dict
|
||||
|
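As a usage illustration with hypothetical thresholds, passing --goodput ttft:300 tpot:50 reaches these helpers as a list of strings and is parsed into per-metric SLOs in milliseconds:

slo_pairs = ["ttft:300", "tpot:50"]   # what argparse collects for --goodput
config = parse_goodput(slo_pairs)
assert config == {"ttft": 300.0, "tpot": 50.0}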
||||
|
||||
def main(args: argparse.Namespace):
|
||||
print(args)
|
||||
random.seed(args.seed)
|
||||
@ -556,9 +746,9 @@ def main(args: argparse.Namespace):
|
||||
prefix_len=args.sonnet_prefix_len,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
input_requests = [(prompt, prompt_len, output_len)
|
||||
input_requests = [(prompt, prompt_len, output_len, None)
|
||||
for prompt, prompt_formatted, prompt_len,
|
||||
output_len in input_requests]
|
||||
output_len, _ in input_requests]
|
||||
else:
|
||||
assert (
|
||||
tokenizer.chat_template or tokenizer.default_chat_template
|
||||
@ -571,9 +761,20 @@ def main(args: argparse.Namespace):
|
||||
prefix_len=args.sonnet_prefix_len,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
input_requests = [(prompt_formatted, prompt_len, output_len)
|
||||
input_requests = [(prompt_formatted, prompt_len, output_len, None)
|
||||
for prompt, prompt_formatted, prompt_len,
|
||||
output_len in input_requests]
|
||||
output_len, _ in input_requests]
|
||||
|
||||
elif args.dataset_name == "hf":
|
||||
input_requests = sample_hf_requests(
|
||||
dataset_path=args.dataset_path,
|
||||
dataset_subset=args.hf_subset,
|
||||
dataset_split=args.hf_split,
|
||||
num_requests=args.num_prompts,
|
||||
tokenizer=tokenizer,
|
||||
random_seed=args.seed,
|
||||
fixed_output_len=args.hf_output_len,
|
||||
)
|
||||
|
||||
elif args.dataset_name == "random":
|
||||
input_requests = sample_random_requests(
|
||||
@ -588,6 +789,8 @@ def main(args: argparse.Namespace):
|
||||
else:
|
||||
raise ValueError(f"Unknown dataset: {args.dataset_name}")
|
||||
|
||||
gootput_config_dict = check_goodput_args(args)
|
||||
|
||||
benchmark_result = asyncio.run(
|
||||
benchmark(
|
||||
backend=backend,
|
||||
@ -598,14 +801,17 @@ def main(args: argparse.Namespace):
|
||||
input_requests=input_requests,
|
||||
logprobs=args.logprobs,
|
||||
best_of=args.best_of,
|
||||
use_beam_search=args.use_beam_search,
|
||||
request_rate=args.request_rate,
|
||||
burstiness=args.burstiness,
|
||||
disable_tqdm=args.disable_tqdm,
|
||||
profile=args.profile,
|
||||
selected_percentile_metrics=args.percentile_metrics.split(","),
|
||||
selected_percentiles=[
|
||||
float(p) for p in args.metric_percentiles.split(",")
|
||||
],
|
||||
ignore_eos=args.ignore_eos,
|
||||
gootput_config_dict=gootput_config_dict,
|
||||
max_concurrency=args.max_concurrency,
|
||||
))
|
||||
|
||||
# Save config and results to json
|
||||
@ -619,7 +825,6 @@ def main(args: argparse.Namespace):
|
||||
result_json["model_id"] = model_id
|
||||
result_json["tokenizer_id"] = tokenizer_id
|
||||
result_json["best_of"] = args.best_of
|
||||
result_json["use_beam_search"] = args.use_beam_search
|
||||
result_json["num_prompts"] = args.num_prompts
|
||||
|
||||
# Metadata
|
||||
@ -636,18 +841,22 @@ def main(args: argparse.Namespace):
|
||||
# Traffic
|
||||
result_json["request_rate"] = (
|
||||
args.request_rate if args.request_rate < float("inf") else "inf")
|
||||
result_json["burstiness"] = args.burstiness
|
||||
result_json["max_concurrency"] = args.max_concurrency
|
||||
|
||||
# Merge with benchmark result
|
||||
result_json = {**result_json, **benchmark_result}
|
||||
|
||||
# Save to file
|
||||
base_model_id = model_id.split("/")[-1]
|
||||
file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa
|
||||
max_concurrency_str = (f"-concurrency{args.max_concurrency}"
|
||||
if args.max_concurrency is not None else "")
|
||||
file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" #noqa
|
||||
if args.result_filename:
|
||||
file_name = args.result_filename
|
||||
if args.result_dir:
|
||||
file_name = os.path.join(args.result_dir, file_name)
|
||||
with open(file_name, "w") as outfile:
|
||||
with open(file_name, "w", encoding='utf-8') as outfile:
|
||||
json.dump(result_json, outfile)
|
||||
|
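With the new concurrency suffix, an invocation would produce a result file named like the following; all values below are invented for illustration:

backend, request_rate, max_concurrency = "vllm", 4.0, 64
base_model_id, current_dt = "opt-125m", "20241101-120000"
max_concurrency_str = f"-concurrency{max_concurrency}"
file_name = f"{backend}-{request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"
# -> "vllm-4.0qps-concurrency64-opt-125m-20241101-120000.json"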
||||
|
||||
@ -685,13 +894,27 @@ if __name__ == "__main__":
|
||||
"--dataset-name",
|
||||
type=str,
|
||||
default="sharegpt",
|
||||
choices=["sharegpt", "sonnet", "random"],
|
||||
choices=["sharegpt", "sonnet", "random", "hf"],
|
||||
help="Name of the dataset to benchmark on.",
|
||||
)
|
||||
parser.add_argument("--dataset-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the dataset.")
|
||||
help="Path to the sharegpt/sonnet dataset. "
|
||||
"Or the huggingface dataset ID if using HF dataset.")
|
||||
parser.add_argument(
|
||||
"--max-concurrency",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Maximum number of concurrent requests. This can be used "
|
||||
"to help simulate an environment where a higher level component "
|
||||
"is enforcing a maximum number of concurrent requests. While the "
|
||||
"--request-rate argument controls the rate at which requests are "
|
||||
"initiated, this argument will control how many are actually allowed "
|
||||
"to execute at a time. This means that when used in combination, the "
|
||||
"actual request rate may be lower than specified with --request-rate, "
|
||||
"if the server is not processing requests fast enough to keep up.")
|
||||
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
type=str,
|
||||
@ -718,26 +941,6 @@ if __name__ == "__main__":
|
||||
default=1000,
|
||||
help="Number of prompts to process.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--sharegpt-output-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Output length for each request. Overrides the output length "
|
||||
"from the ShareGPT dataset.")
|
||||
parser.add_argument(
|
||||
"--sonnet-input-len",
|
||||
type=int,
|
||||
default=550,
|
||||
help=
|
||||
"Number of input tokens per request, used only for sonnet dataset.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--sonnet-output-len",
|
||||
type=int,
|
||||
default=150,
|
||||
help=
|
||||
"Number of output tokens per request, used only for sonnet dataset.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logprobs",
|
||||
type=int,
|
||||
@ -748,50 +951,26 @@ if __name__ == "__main__":
|
||||
"logprob is returned for each token; or (2) if beam search "
|
||||
"is enabled 1 logprob per token is computed"),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--sonnet-prefix-len",
|
||||
type=int,
|
||||
default=200,
|
||||
help=
|
||||
"Number of prefix tokens per request, used only for sonnet dataset.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--random-input-len",
|
||||
type=int,
|
||||
default=1024,
|
||||
help=
|
||||
"Number of input tokens per request, used only for random sampling.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--random-output-len",
|
||||
type=int,
|
||||
default=128,
|
||||
help=
|
||||
"Number of output tokens per request, used only for random sampling.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--random-range-ratio",
|
||||
type=float,
|
||||
default=1.0,
|
||||
help="Range of sampled ratio of input/output length, "
|
||||
"used only for random sampling.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--random-prefix-len",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Number of fixed prefix tokens before random "
|
||||
" context. The length range of context in a random "
|
||||
" request is [random-prefix-len, "
|
||||
" random-prefix-len + random-prefix-len * random-range-ratio).")
|
||||
parser.add_argument(
|
||||
"--request-rate",
|
||||
type=float,
|
||||
default=float("inf"),
|
||||
help="Number of requests per second. If this is inf, "
|
||||
"then all the requests are sent at time 0. "
|
||||
"Otherwise, we use Poisson process to synthesize "
|
||||
"the request arrival times.",
|
||||
"Otherwise, we use Poisson process or gamma distribution "
|
||||
"to synthesize the request arrival times.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--burstiness",
|
||||
type=float,
|
||||
default=1.0,
|
||||
help="Burstiness factor of the request generation. "
|
||||
"Only take effect when request_rate is not inf. "
|
||||
"Default value is 1, which follows Poisson process. "
|
||||
"Otherwise, the request intervals follow a gamma distribution. "
|
||||
"A lower burstiness value (0 < burstiness < 1) results in more "
|
||||
"bursty requests. A higher burstiness value (burstiness > 1) "
|
||||
"results in a more uniform arrival of requests.",
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument(
|
||||
@ -839,6 +1018,11 @@ if __name__ == "__main__":
|
||||
"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
|
||||
" format.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ignore-eos",
|
||||
action="store_true",
|
||||
help="Set ignore_eos flag when sending the benchmark request."
|
||||
"Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
|
||||
parser.add_argument(
|
||||
"--percentile-metrics",
|
||||
type=str,
|
||||
@ -856,6 +1040,97 @@ if __name__ == "__main__":
|
||||
"Default value is \"99\". "
|
||||
"Use \"--percentile-metrics\" to select metrics.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--goodput",
|
||||
nargs="+",
|
||||
required=False,
|
||||
help="Specify service level objectives for goodput as \"KEY:VALUE\" "
|
||||
"pairs, where the key is a metric name, and the value is in "
|
||||
"milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
|
||||
"separated by spaces. Allowed request level metric names are "
|
||||
"\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
|
||||
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
|
||||
"and the blog: https://hao-ai-lab.github.io/blogs/distserve")
|
||||
|
||||
# group for dataset specific arguments
|
||||
sonnet_group = parser.add_argument_group("sonnet dataset options")
|
||||
sonnet_group.add_argument(
|
||||
"--sonnet-input-len",
|
||||
type=int,
|
||||
default=550,
|
||||
help=
|
||||
"Number of input tokens per request, used only for sonnet dataset.",
|
||||
)
|
||||
sonnet_group.add_argument(
|
||||
"--sonnet-output-len",
|
||||
type=int,
|
||||
default=150,
|
||||
help=
|
||||
"Number of output tokens per request, used only for sonnet dataset.",
|
||||
)
|
||||
sonnet_group.add_argument(
|
||||
"--sonnet-prefix-len",
|
||||
type=int,
|
||||
default=200,
|
||||
help=
|
||||
"Number of prefix tokens per request, used only for sonnet dataset.",
|
||||
)
|
||||
|
||||
sharegpt_group = parser.add_argument_group("sharegpt dataset options")
|
||||
sharegpt_group.add_argument(
|
||||
"--sharegpt-output-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Output length for each request. Overrides the output length "
|
||||
"from the ShareGPT dataset.")
|
||||
|
||||
random_group = parser.add_argument_group("random dataset options")
|
||||
random_group.add_argument(
|
||||
"--random-input-len",
|
||||
type=int,
|
||||
default=1024,
|
||||
help=
|
||||
"Number of input tokens per request, used only for random sampling.",
|
||||
)
|
||||
random_group.add_argument(
|
||||
"--random-output-len",
|
||||
type=int,
|
||||
default=128,
|
||||
help=
|
||||
"Number of output tokens per request, used only for random sampling.",
|
||||
)
|
||||
random_group.add_argument(
|
||||
"--random-range-ratio",
|
||||
type=float,
|
||||
default=1.0,
|
||||
help="Range of sampled ratio of input/output length, "
|
||||
"used only for random sampling.",
|
||||
)
|
||||
random_group.add_argument(
|
||||
"--random-prefix-len",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Number of fixed prefix tokens before random "
|
||||
" context. The length range of context in a random "
|
||||
" request is [random-prefix-len, "
|
||||
" random-prefix-len + random-prefix-len * random-range-ratio).")
|
||||
|
||||
hf_group = parser.add_argument_group("hf dataset options")
|
||||
hf_group.add_argument("--hf-subset",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Subset of the HF dataset.")
|
||||
hf_group.add_argument("--hf-split",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Split of the HF dataset.")
|
||||
hf_group.add_argument(
|
||||
"--hf-output-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Output length for each request. Overrides the output lengths "
|
||||
"from the sampled HF dataset.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
@ -1,29 +1,71 @@
|
||||
"""Benchmark offline inference throughput."""
|
||||
import argparse
|
||||
import dataclasses
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
from typing import List, Optional, Tuple
|
||||
from typing import List, Optional
|
||||
|
||||
import torch
|
||||
import uvloop
|
||||
from PIL import Image
|
||||
from tqdm import tqdm
|
||||
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
||||
PreTrainedTokenizerBase)
|
||||
|
||||
from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
|
||||
from vllm.entrypoints.openai.api_server import (
|
||||
build_async_engine_client_from_engine_args)
|
||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||
from vllm.inputs import TextPrompt
|
||||
from vllm.multimodal import MultiModalDataDict
|
||||
from vllm.sampling_params import BeamSearchParams
|
||||
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
|
||||
|
||||
|
||||
def sample_requests(
|
||||
dataset_path: str,
|
||||
num_requests: int,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
fixed_output_len: Optional[int],
|
||||
) -> List[Tuple[str, int, int]]:
|
||||
@dataclasses.dataclass
|
||||
class SampleRequest:
|
||||
"""A class representing a single inference request for benchmarking.
|
||||
|
||||
Attributes:
|
||||
prompt: The input text prompt for the model.
|
||||
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
|
||||
images).
|
||||
prompt_len: The length of the prompt in tokens.
|
||||
expected_output_len: The expected length of the output in tokens.
|
||||
"""
|
||||
prompt: str
|
||||
prompt_len: int
|
||||
expected_output_len: int
|
||||
multi_modal_data: Optional[MultiModalDataDict] = None
|
||||
|
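An illustrative instantiation of the dataclass above (prompt text and token counts are made up):

request = SampleRequest(
    prompt="Summarize the plot of Hamlet in one sentence.",
    prompt_len=12,              # token count of the prompt (hypothetical)
    expected_output_len=64,     # tokens the benchmark expects back
)                               # multi_modal_data defaults to None (text-only)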
||||
|
||||
def _get_prompt_for_image_model(question: str, *, model: str) -> str:
|
||||
"""Prepend and append special tokens around the question to form a prompt.
|
||||
|
||||
Args:
|
||||
question: The input question text to wrap with special tokens
|
||||
model: The name of the model being used, to determine which special
|
||||
tokens to add
|
||||
|
||||
Returns:
|
||||
The formatted prompt string with appropriate special tokens for the
|
||||
model
|
||||
|
||||
Raises:
|
||||
ValueError: If an unsupported model name is provided
|
||||
"""
|
||||
model = model.lower()
|
||||
if "pixtral" in model:
|
||||
return f"<s>[INST]{question}\n[IMG][/INST]"
|
||||
raise ValueError(f"Unsupported model {model}")
|
||||
|
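For example, with a Pixtral checkpoint the helper wraps the question as follows; the model name is illustrative, and matching is a case-insensitive substring check on "pixtral":

prompt = _get_prompt_for_image_model(
    question="What is shown in this image?",
    model="mistralai/Pixtral-12B-2409",
)
assert prompt == "<s>[INST]What is shown in this image?\n[IMG][/INST]"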
||||
|
||||
def sample_requests(tokenizer: PreTrainedTokenizerBase,
|
||||
args: argparse.Namespace) -> List[SampleRequest]:
|
||||
dataset_path: str = args.dataset
|
||||
num_requests: int = args.num_prompts
|
||||
fixed_output_len: Optional[int] = args.output_len
|
||||
model: str = args.model
|
||||
if fixed_output_len is not None and fixed_output_len < 4:
|
||||
raise ValueError("output_len too small")
|
||||
|
||||
@ -32,23 +74,36 @@ def sample_requests(
|
||||
dataset = json.load(f)
|
||||
# Filter out the conversations with less than 2 turns.
|
||||
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
|
||||
# Only keep the first two turns of each conversation.
|
||||
dataset = [(data["conversations"][0]["value"],
|
||||
data["conversations"][1]["value"]) for data in dataset]
|
||||
|
||||
# Shuffle the dataset.
|
||||
random.shuffle(dataset)
|
||||
|
||||
# Filter out sequences that are too long or too short
|
||||
filtered_dataset: List[Tuple[str, int, int]] = []
|
||||
for i in range(len(dataset)):
|
||||
filtered_dataset: List[SampleRequest] = []
|
||||
for data in dataset:
|
||||
if len(filtered_dataset) == num_requests:
|
||||
break
|
||||
|
||||
# Only keep the first two turns of each conversation.
|
||||
prompt = data["conversations"][0]["value"]
|
||||
completion = data["conversations"][1]["value"]
|
||||
|
||||
multi_modal_data: Optional[MultiModalDataDict] = None
|
||||
if "image" in data:
|
||||
multi_modal_data = multi_modal_data or {}
|
||||
image_path = data["image"]
|
||||
# TODO(vllm-project/vllm/issues/9778): Support multiple images.
|
||||
assert isinstance(image_path,
|
||||
str), "Only support single image input"
|
||||
try:
|
||||
multi_modal_data["image"] = Image.open(image_path).convert(
|
||||
"RGB")
|
||||
except FileNotFoundError:
|
||||
# Ignore datapoint where asset is missing
|
||||
continue
|
||||
prompt = _get_prompt_for_image_model(question=prompt, model=model)
|
||||
|
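For reference, a ShareGPT-style datapoint that would exercise the image branch above could look like this; the field layout is inferred from the code, and the image path is made up:

data = {
    "conversations": [
        {"value": "What is in this photo?"},    # first turn -> prompt
        {"value": "A cat sitting on a sofa."},  # second turn -> completion
    ],
    "image": "/data/images/cat_0001.jpg",       # single image path (hypothetical)
}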
||||
# Tokenize the prompts and completions.
|
||||
prompt = dataset[i][0]
|
||||
prompt_token_ids = tokenizer(prompt).input_ids
|
||||
completion = dataset[i][1]
|
||||
completion_token_ids = tokenizer(completion).input_ids
|
||||
prompt_len = len(prompt_token_ids)
|
||||
output_len = len(completion_token_ids
|
||||
@ -59,158 +114,88 @@ def sample_requests(
|
||||
if prompt_len > 1024 or prompt_len + output_len > 2048:
|
||||
# Prune too long sequences.
|
||||
continue
|
||||
filtered_dataset.append((prompt, prompt_len, output_len))
|
||||
filtered_dataset.append(
|
||||
SampleRequest(prompt=prompt,
|
||||
prompt_len=prompt_len,
|
||||
expected_output_len=output_len,
|
||||
multi_modal_data=multi_modal_data))
|
||||
|
||||
return filtered_dataset
|
||||
|
||||
|
||||
def run_vllm(
|
||||
requests: List[Tuple[str, int, int]],
|
||||
model: str,
|
||||
tokenizer: str,
|
||||
quantization: Optional[str],
|
||||
tensor_parallel_size: int,
|
||||
seed: int,
|
||||
requests: List[SampleRequest],
|
||||
n: int,
|
||||
use_beam_search: bool,
|
||||
trust_remote_code: bool,
|
||||
dtype: str,
|
||||
max_model_len: Optional[int],
|
||||
enforce_eager: bool,
|
||||
kv_cache_dtype: str,
|
||||
quantization_param_path: Optional[str],
|
||||
device: str,
|
||||
enable_prefix_caching: bool,
|
||||
enable_chunked_prefill: bool,
|
||||
max_num_batched_tokens: int,
|
||||
distributed_executor_backend: Optional[str],
|
||||
gpu_memory_utilization: float = 0.9,
|
||||
num_scheduler_steps: int = 1,
|
||||
use_v2_block_manager: bool = False,
|
||||
download_dir: Optional[str] = None,
|
||||
load_format: str = EngineArgs.load_format,
|
||||
disable_async_output_proc: bool = False,
|
||||
engine_args: EngineArgs,
|
||||
) -> float:
|
||||
from vllm import LLM, SamplingParams
|
||||
llm = LLM(
|
||||
model=model,
|
||||
tokenizer=tokenizer,
|
||||
quantization=quantization,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
seed=seed,
|
||||
trust_remote_code=trust_remote_code,
|
||||
dtype=dtype,
|
||||
max_model_len=max_model_len,
|
||||
gpu_memory_utilization=gpu_memory_utilization,
|
||||
enforce_eager=enforce_eager,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
quantization_param_path=quantization_param_path,
|
||||
device=device,
|
||||
enable_prefix_caching=enable_prefix_caching,
|
||||
download_dir=download_dir,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
load_format=load_format,
|
||||
num_scheduler_steps=num_scheduler_steps,
|
||||
use_v2_block_manager=use_v2_block_manager,
|
||||
disable_async_output_proc=disable_async_output_proc,
|
||||
)
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
|
||||
# Add the requests to the engine.
|
||||
prompts: List[str] = []
|
||||
prompts: List[TextPrompt] = []
|
||||
sampling_params: List[SamplingParams] = []
|
||||
for prompt, _, output_len in requests:
|
||||
prompts.append(prompt)
|
||||
for request in requests:
|
||||
prompts.append(
|
||||
TextPrompt(prompt=request.prompt,
|
||||
multi_modal_data=request.multi_modal_data))
|
||||
sampling_params.append(
|
||||
SamplingParams(
|
||||
n=n,
|
||||
temperature=0.0 if use_beam_search else 1.0,
|
||||
temperature=1.0,
|
||||
top_p=1.0,
|
||||
use_beam_search=use_beam_search,
|
||||
ignore_eos=True,
|
||||
max_tokens=output_len,
|
||||
max_tokens=request.expected_output_len,
|
||||
))
|
||||
|
||||
start = time.perf_counter()
|
||||
llm.generate(prompts, sampling_params, use_tqdm=True)
|
||||
end = time.perf_counter()
|
||||
use_beam_search = False
|
||||
|
||||
if not use_beam_search:
|
||||
start = time.perf_counter()
|
||||
llm.generate(prompts, sampling_params, use_tqdm=True)
|
||||
end = time.perf_counter()
|
||||
else:
|
||||
prompts = [request.prompt for request in requests]
|
||||
# output_len should be the same for all requests.
|
||||
output_len = requests[0][2]
|
||||
for request in requests:
|
||||
assert request.expected_output_len == output_len
|
||||
start = time.perf_counter()
|
||||
llm.beam_search(
|
||||
prompts,
|
||||
BeamSearchParams(
|
||||
beam_width=n,
|
||||
max_tokens=output_len,
|
||||
ignore_eos=True,
|
||||
))
|
||||
end = time.perf_counter()
|
||||
return end - start
|
||||
|
||||
|
||||
async def run_vllm_async(
|
||||
requests: List[Tuple[str, int, int]],
|
||||
model: str,
|
||||
tokenizer: str,
|
||||
quantization: Optional[str],
|
||||
tensor_parallel_size: int,
|
||||
seed: int,
|
||||
requests: List[SampleRequest],
|
||||
n: int,
|
||||
use_beam_search: bool,
|
||||
trust_remote_code: bool,
|
||||
dtype: str,
|
||||
max_model_len: Optional[int],
|
||||
enforce_eager: bool,
|
||||
kv_cache_dtype: str,
|
||||
quantization_param_path: Optional[str],
|
||||
device: str,
|
||||
enable_prefix_caching: bool,
|
||||
enable_chunked_prefill: bool,
|
||||
max_num_batched_tokens: int,
|
||||
distributed_executor_backend: Optional[str],
|
||||
gpu_memory_utilization: float = 0.9,
|
||||
num_scheduler_steps: int = 1,
|
||||
use_v2_block_manager: bool = False,
|
||||
download_dir: Optional[str] = None,
|
||||
load_format: str = EngineArgs.load_format,
|
||||
disable_async_output_proc: bool = False,
|
||||
engine_args: AsyncEngineArgs,
|
||||
disable_frontend_multiprocessing: bool = False,
|
||||
) -> float:
|
||||
from vllm import SamplingParams
|
||||
engine_args = AsyncEngineArgs(
|
||||
model=model,
|
||||
tokenizer=tokenizer,
|
||||
quantization=quantization,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
seed=seed,
|
||||
trust_remote_code=trust_remote_code,
|
||||
dtype=dtype,
|
||||
max_model_len=max_model_len,
|
||||
gpu_memory_utilization=gpu_memory_utilization,
|
||||
enforce_eager=enforce_eager,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
quantization_param_path=quantization_param_path,
|
||||
device=device,
|
||||
enable_prefix_caching=enable_prefix_caching,
|
||||
download_dir=download_dir,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
load_format=load_format,
|
||||
num_scheduler_steps=num_scheduler_steps,
|
||||
use_v2_block_manager=use_v2_block_manager,
|
||||
disable_async_output_proc=disable_async_output_proc,
|
||||
worker_use_ray=False,
|
||||
engine_use_ray=False,
|
||||
disable_log_requests=True,
|
||||
)
|
||||
|
||||
async with build_async_engine_client_from_engine_args(
|
||||
engine_args, disable_frontend_multiprocessing) as llm:
|
||||
|
||||
# Add the requests to the engine.
|
||||
prompts: List[str] = []
|
||||
prompts: List[TextPrompt] = []
|
||||
sampling_params: List[SamplingParams] = []
|
||||
for prompt, _, output_len in requests:
|
||||
prompts.append(prompt)
|
||||
for request in requests:
|
||||
prompts.append(
|
||||
TextPrompt(prompt=request.prompt,
|
||||
multi_modal_data=request.multi_modal_data))
|
||||
sampling_params.append(
|
||||
SamplingParams(
|
||||
n=n,
|
||||
temperature=0.0 if use_beam_search else 1.0,
|
||||
temperature=1.0,
|
||||
top_p=1.0,
|
||||
use_beam_search=use_beam_search,
|
||||
ignore_eos=True,
|
||||
max_tokens=output_len,
|
||||
max_tokens=request.expected_output_len,
|
||||
))
|
||||
|
||||
generators = []
|
||||
@ -226,15 +211,13 @@ async def run_vllm_async(
|
||||
|
||||
|
||||
def run_hf(
|
||||
requests: List[Tuple[str, int, int]],
|
||||
requests: List[SampleRequest],
|
||||
model: str,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
n: int,
|
||||
use_beam_search: bool,
|
||||
max_batch_size: int,
|
||||
trust_remote_code: bool,
|
||||
) -> float:
|
||||
assert not use_beam_search
|
||||
llm = AutoModelForCausalLM.from_pretrained(
|
||||
model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
|
||||
if llm.config.model_type == "llama":
|
||||
@ -266,7 +249,7 @@ def run_hf(
|
||||
padding=True).input_ids
|
||||
llm_outputs = llm.generate(
|
||||
input_ids=input_ids.cuda(),
|
||||
do_sample=not use_beam_search,
|
||||
do_sample=True,
|
||||
num_return_sequences=n,
|
||||
temperature=1.0,
|
||||
top_p=1.0,
|
||||
@ -286,14 +269,14 @@ def run_hf(
|
||||
|
||||
|
||||
def run_mii(
|
||||
requests: List[Tuple[str, int, int]],
|
||||
requests: List[SampleRequest],
|
||||
model: str,
|
||||
tensor_parallel_size: int,
|
||||
output_len: int,
|
||||
) -> float:
|
||||
from mii import client, serve
|
||||
llm = serve(model, tensor_parallel=tensor_parallel_size)
|
||||
prompts = [prompt for prompt, _, _ in requests]
|
||||
prompts = [request.prompt for request in requests]
|
||||
|
||||
start = time.perf_counter()
|
||||
llm.generate(prompts, max_new_tokens=output_len)
|
||||
@ -312,46 +295,60 @@ def main(args: argparse.Namespace):
|
||||
args.tokenizer, trust_remote_code=args.trust_remote_code)
|
||||
if args.dataset is None:
|
||||
# Synthesize a prompt with the given input length.
|
||||
prompt = "hi" * (args.input_len - 1)
|
||||
requests = [(prompt, args.input_len, args.output_len)
|
||||
for _ in range(args.num_prompts)]
|
||||
else:
|
||||
requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
|
||||
args.output_len)
|
||||
|
||||
if args.backend == "vllm":
|
||||
run_args = [
|
||||
requests, args.model, args.tokenizer, args.quantization,
|
||||
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
|
||||
args.trust_remote_code, args.dtype, args.max_model_len,
|
||||
args.enforce_eager, args.kv_cache_dtype,
|
||||
args.quantization_param_path, args.device,
|
||||
args.enable_prefix_caching, args.enable_chunked_prefill,
|
||||
args.max_num_batched_tokens, args.distributed_executor_backend,
|
||||
args.gpu_memory_utilization, args.num_scheduler_steps,
|
||||
args.use_v2_block_manager, args.download_dir, args.load_format,
|
||||
args.disable_async_output_proc
|
||||
]
|
||||
|
||||
if args.async_engine:
|
||||
run_args.append(args.disable_frontend_multiprocessing)
|
||||
elapsed_time = uvloop.run(run_vllm_async(*run_args))
|
||||
# As tokenizer may add additional tokens like BOS, we need to try
|
||||
# different lengths to get the desired input length.
|
||||
for i in range(-10, 10):
|
||||
prompt = "hi " * (args.input_len + i)
|
||||
tokenized_prompt = tokenizer(prompt).input_ids
|
||||
if len(tokenized_prompt) == args.input_len:
|
||||
break
|
||||
else:
|
||||
elapsed_time = run_vllm(*run_args)
|
||||
raise ValueError(
|
||||
f"Failed to synthesize a prompt with {args.input_len} tokens.")
|
||||
requests = [
|
||||
SampleRequest(prompt=prompt,
|
||||
prompt_len=args.input_len,
|
||||
expected_output_len=args.output_len)
|
||||
for _ in range(args.num_prompts)
|
||||
]
|
||||
else:
|
||||
requests = sample_requests(tokenizer, args)
|
||||
|
||||
is_multi_modal = any(request.multi_modal_data is not None
|
||||
for request in requests)
|
||||
if args.backend == "vllm":
|
||||
if args.async_engine:
|
||||
elapsed_time = uvloop.run(
|
||||
run_vllm_async(
|
||||
requests,
|
||||
args.n,
|
||||
AsyncEngineArgs.from_cli_args(args),
|
||||
args.disable_frontend_multiprocessing,
|
||||
))
|
||||
else:
|
||||
elapsed_time = run_vllm(requests, args.n,
|
||||
EngineArgs.from_cli_args(args))
|
||||
elif args.backend == "hf":
|
||||
assert args.tensor_parallel_size == 1
|
||||
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
|
||||
args.use_beam_search, args.hf_max_batch_size,
|
||||
args.trust_remote_code)
|
||||
args.hf_max_batch_size, args.trust_remote_code)
|
||||
elif args.backend == "mii":
|
||||
elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
|
||||
args.output_len)
|
||||
else:
|
||||
raise ValueError(f"Unknown backend: {args.backend}")
|
||||
total_num_tokens = sum(prompt_len + output_len
|
||||
for _, prompt_len, output_len in requests)
|
||||
total_num_tokens = sum(request.prompt_len + request.expected_output_len
|
||||
for request in requests)
|
||||
total_output_tokens = sum(request.expected_output_len
|
||||
for request in requests)
|
||||
if is_multi_modal:
|
||||
print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
|
||||
"following metrics are not accurate because image tokens are not"
|
||||
" counted. See vllm-project/vllm/issues/9778 for details.")
|
||||
# TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
|
||||
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
|
||||
f"{total_num_tokens / elapsed_time:.2f} tokens/s")
|
||||
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
|
||||
f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
|
||||
|
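With invented numbers, the two new throughput figures relate as follows:

elapsed_time = 10.0                              # seconds (illustrative)
total_num_tokens, total_output_tokens = 12_000, 2_000
print(f"{total_num_tokens / elapsed_time:.2f} total tokens/s")      # 1200.00
print(f"{total_output_tokens / elapsed_time:.2f} output tokens/s")  # 200.00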
||||
# Output JSON results if specified
|
||||
if args.output_json:
|
||||
@ -375,7 +372,9 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--dataset",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the dataset.")
|
||||
help="Path to the dataset. The dataset is expected to "
|
||||
"be a json in form of List[Dict[..., conversations: "
|
||||
"List[Dict[..., value: <prompt_or_response>]]]]")
|
||||
parser.add_argument("--input-len",
|
||||
type=int,
|
||||
default=None,
|
||||
@ -385,142 +384,23 @@ if __name__ == "__main__":
|
||||
default=None,
|
||||
help="Output length for each request. Overrides the "
|
||||
"output length from the dataset.")
|
||||
parser.add_argument("--model", type=str, default="facebook/opt-125m")
|
||||
parser.add_argument("--tokenizer", type=str, default=None)
|
||||
parser.add_argument('--quantization',
|
||||
'-q',
|
||||
choices=[*QUANTIZATION_METHODS, None],
|
||||
default=None)
|
||||
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
|
||||
parser.add_argument("--n",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of generated sequences per prompt.")
|
||||
parser.add_argument("--use-beam-search", action="store_true")
|
||||
parser.add_argument("--num-prompts",
|
||||
type=int,
|
||||
default=1000,
|
||||
help="Number of prompts to process.")
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--hf-max-batch-size",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Maximum batch size for HF backend.")
|
||||
parser.add_argument('--trust-remote-code',
|
||||
action='store_true',
|
||||
help='trust remote code from huggingface')
|
||||
parser.add_argument(
|
||||
'--max-model-len',
|
||||
type=int,
|
||||
default=None,
|
||||
help='Maximum length of a sequence (including prompt and output). '
|
||||
'If None, will be derived from the model.')
|
||||
parser.add_argument(
|
||||
'--dtype',
|
||||
type=str,
|
||||
default='auto',
|
||||
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
|
||||
help='data type for model weights and activations. '
|
||||
'The "auto" option will use FP16 precision '
|
||||
'for FP32 and FP16 models, and BF16 precision '
|
||||
'for BF16 models.')
|
||||
parser.add_argument('--gpu-memory-utilization',
|
||||
type=float,
|
||||
default=0.9,
|
||||
help='the fraction of GPU memory to be used for '
|
||||
'the model executor, which can range from 0 to 1.'
|
||||
'If unspecified, will use the default value of 0.9.')
|
||||
parser.add_argument("--enforce-eager",
|
||||
action="store_true",
|
||||
help="enforce eager execution")
|
||||
parser.add_argument(
|
||||
'--kv-cache-dtype',
|
||||
type=str,
|
||||
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
|
||||
default="auto",
|
||||
help='Data type for kv cache storage. If "auto", will use model '
|
||||
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
|
||||
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
|
||||
parser.add_argument(
|
||||
'--quantization-param-path',
|
||||
type=str,
|
||||
default=None,
|
||||
help='Path to the JSON file containing the KV cache scaling factors. '
|
||||
'This should generally be supplied, when KV cache dtype is FP8. '
|
||||
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
|
||||
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
|
||||
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
|
||||
'instead supported for common inference criteria.')
|
||||
parser.add_argument("--device",
|
||||
type=str,
|
||||
default="auto",
|
||||
choices=DEVICE_OPTIONS,
|
||||
help='device type for vLLM execution')
|
||||
parser.add_argument(
|
||||
"--num-scheduler-steps",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Maximum number of forward steps per scheduler call.")
|
||||
parser.add_argument("--use-v2-block-manager",
|
||||
action='store_true',
|
||||
help="Enable block manager v2.")
|
||||
parser.add_argument(
|
||||
"--enable-prefix-caching",
|
||||
action='store_true',
|
||||
help="Enable automatic prefix caching for vLLM backend.")
|
||||
parser.add_argument("--enable-chunked-prefill",
|
||||
action='store_true',
|
||||
help="enable chunked prefill for vLLM backend.")
|
||||
parser.add_argument('--max-num-batched-tokens',
|
||||
type=int,
|
||||
default=None,
|
||||
help='maximum number of batched tokens per '
|
||||
'iteration')
|
||||
parser.add_argument('--download-dir',
|
||||
type=str,
|
||||
default=None,
|
||||
help='directory to download and load the weights, '
|
||||
'default to the default cache dir of huggingface')
|
||||
parser.add_argument(
|
||||
'--output-json',
|
||||
type=str,
|
||||
default=None,
|
||||
help='Path to save the throughput results in JSON format.')
|
||||
parser.add_argument(
|
||||
'--distributed-executor-backend',
|
||||
choices=['ray', 'mp'],
|
||||
default=None,
|
||||
help='Backend to use for distributed serving. When more than 1 GPU '
|
||||
'is used, will be automatically set to "ray" if installed '
|
||||
'or "mp" (multiprocessing) otherwise.')
|
||||
parser.add_argument(
|
||||
'--load-format',
|
||||
type=str,
|
||||
default=EngineArgs.load_format,
|
||||
choices=[
|
||||
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
|
||||
'bitsandbytes'
|
||||
],
|
||||
help='The format of the model weights to load.\n\n'
|
||||
'* "auto" will try to load the weights in the safetensors format '
|
||||
'and fall back to the pytorch bin format if safetensors format '
|
||||
'is not available.\n'
|
||||
'* "pt" will load the weights in the pytorch bin format.\n'
|
||||
'* "safetensors" will load the weights in the safetensors format.\n'
|
||||
'* "npcache" will load the weights in pytorch format and store '
|
||||
'a numpy cache to speed up the loading.\n'
|
||||
'* "dummy" will initialize the weights with random values, '
|
||||
'which is mainly for profiling.\n'
|
||||
'* "tensorizer" will load the weights using tensorizer from '
|
||||
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
|
||||
'section for more information.\n'
|
||||
'* "bitsandbytes" will load the weights using bitsandbytes '
|
||||
'quantization.\n')
|
||||
parser.add_argument(
|
||||
"--disable-async-output-proc",
|
||||
action='store_true',
|
||||
default=False,
|
||||
help="Disable async output processor for vLLM backend.")
|
||||
parser.add_argument("--async-engine",
|
||||
action='store_true',
|
||||
default=False,
|
||||
@ -529,6 +409,7 @@ if __name__ == "__main__":
|
||||
action='store_true',
|
||||
default=False,
|
||||
help="Disable decoupled async engine frontend.")
|
||||
parser = AsyncEngineArgs.add_cli_args(parser)
|
||||
args = parser.parse_args()
|
||||
if args.tokenizer is None:
|
||||
args.tokenizer = args.model
|
||||
@ -551,8 +432,6 @@ if __name__ == "__main__":
|
||||
raise ValueError("dtype must be auto for MII backend.")
|
||||
if args.n != 1:
|
||||
raise ValueError("n must be 1 for MII backend.")
|
||||
if args.use_beam_search:
|
||||
raise ValueError("Beam search is not supported for MII backend.")
|
||||
if args.quantization is not None:
|
||||
raise ValueError("Quantization is only for vLLM backend.")
|
||||
if args.hf_max_batch_size is not None:
|
||||
|
||||
@ -1,9 +1,9 @@
|
||||
import random
|
||||
import time
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
|
||||
|
||||
|
||||
@ -16,10 +16,7 @@ def main(num_tokens: int,
|
||||
do_profile: bool = False,
|
||||
num_warmup_iters: int = 5,
|
||||
num_iters: int = 100) -> None:
|
||||
random.seed(seed)
|
||||
torch.random.manual_seed(seed)
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.manual_seed(seed)
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device("cuda")
|
||||
|
||||
layer = RMSNorm(hidden_size).to(dtype=dtype)
|
||||
|
||||
@ -4,8 +4,10 @@ import itertools
|
||||
import math
|
||||
import pickle as pkl
|
||||
import time
|
||||
from typing import Callable, Iterable, List, Tuple
|
||||
from itertools import product
|
||||
from typing import Callable, Iterable, List, Optional, Tuple
|
||||
|
||||
import pandas as pd
|
||||
import torch
|
||||
import torch.utils.benchmark as TBenchmark
|
||||
from torch.utils.benchmark import Measurement as TMeasurement
|
||||
@ -84,6 +86,10 @@ def loop_over_weights(
|
||||
fn(a, w_ref, w_q, w_s)
|
||||
|
||||
|
||||
_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
|
||||
_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
|
||||
|
||||
|
||||
def bench(atype: torch.dtype,
|
||||
wtype: ScalarType,
|
||||
group_size: int,
|
||||
@ -94,6 +100,8 @@ def bench(atype: torch.dtype,
|
||||
sub_label: str,
|
||||
benchmark_marlinv1: bool = True,
|
||||
sweep_schedules: bool = True) -> Iterable[TMeasurement]:
|
||||
global _SWEEP_SCHEDULES_RESULTS
|
||||
|
||||
a, weights = make_bench_tensors(atype, wtype, group_size, m, n, k)
|
||||
sub_label += f", L={len(weights)}"
|
||||
|
||||
@ -163,6 +171,11 @@ def bench(atype: torch.dtype,
|
||||
best_schedule = None
|
||||
schedules = ops.machete_supported_schedules(wtype)
|
||||
for schedule in reversed(schedules):
|
||||
schedule_M = int(schedule.split("_")[0].split("x")[1])
|
||||
|
||||
# Prune known bad schedules
|
||||
if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4:
|
||||
continue
|
||||
|
||||
def run(a, _, w_q, w_s, schedule=schedule):
|
||||
ops.machete_gemm(a,
|
||||
@ -175,6 +188,20 @@ def bench(atype: torch.dtype,
|
||||
res = bench_fn(label, sub_label, "machete_best",
|
||||
lambda: loop_over_weights(a, weights_machete, run))
|
||||
|
||||
results_row = {
|
||||
"M": m,
|
||||
"K": k,
|
||||
"N": n,
|
||||
"group_size": group_size,
|
||||
"schedule": schedule,
|
||||
"median": res.median,
|
||||
}
|
||||
if _SWEEP_SCHEDULES_RESULTS is None:
|
||||
_SWEEP_SCHEDULES_RESULTS = pd.DataFrame(
|
||||
columns=results_row.keys())
|
||||
_SWEEP_SCHEDULES_RESULTS.\
|
||||
loc[len(_SWEEP_SCHEDULES_RESULTS)] = results_row
|
||||
|
||||
print(f" {res.median:5.5} ", schedule)
|
||||
if not best or res.median < best.median:
|
||||
best = res
|
||||
@ -235,18 +262,22 @@ def run_square_bench(args):
|
||||
dim_sizes = list(
|
||||
range(args.dim_start, args.dim_end + 1, args.dim_increment))
|
||||
MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
|
||||
|
||||
data = run(args.dtype, args.sweep_schedules, MKNs)
|
||||
|
||||
make_output(data, MKNs, f"square_bench-{args.dtype}")
|
||||
|
||||
|
||||
def run_range_bench(args):
|
||||
dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
|
||||
n = len(dim_sizes)
|
||||
Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
|
||||
Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
|
||||
Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
|
||||
MKNs = list(zip(Ms, Ks, Ns))
|
||||
m_start, k_start, n_start = (int(x) for x in args.dim_start.split(","))
|
||||
m_end, k_end, n_end = (int(x) for x in args.dim_end.split(","))
|
||||
m_increment, k_increment, n_increment = \
|
||||
(int(x) for x in args.dim_increment.split(","))
|
||||
Ms = list(range(m_start, m_end + 1, m_increment))
|
||||
Ks = list(range(k_start, k_end + 1, k_increment))
|
||||
Ns = list(range(n_start, n_end + 1, n_increment))
|
||||
MKNs = list(product(Ms, Ks, Ns))
|
||||
|
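With the new comma-separated flags, a hypothetical invocation such as --dim-start 128,128,128 --dim-end 256,256,256 --dim-increment 128,128,128 expands into the full Cartesian grid of (M, K, N) shapes:

from itertools import product

Ms = Ks = Ns = [128, 256]            # each built with range(start, end + 1, increment)
MKNs = list(product(Ms, Ks, Ns))     # 2 * 2 * 2 = 8 benchmark shapes
assert (128, 256, 128) in MKNs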
||||
data = run(args.dtype, args.sweep_schedules, MKNs)
|
||||
|
||||
make_output(data, MKNs, f"range_bench-{args.dtype}")
|
||||
@ -333,6 +364,9 @@ Benchmark Machete GEMM.
|
||||
action="store_true",
|
||||
help="Run a sweep over all supported schedules",
|
||||
)
|
||||
parser.add_argument("--sweep-csv-out",
|
||||
help="CSV to store sweep results",
|
||||
default="sch_sweep_results.csv")
|
||||
subparsers = parser.add_subparsers(dest="cmd", required=True)
|
||||
|
||||
square_parser = subparsers.add_parser("square_bench")
|
||||
@ -342,12 +376,21 @@ Benchmark Machete GEMM.
|
||||
square_parser.set_defaults(func=run_square_bench)
|
||||
|
||||
range_parser = subparsers.add_parser("range_bench")
|
||||
range_parser.add_argument("--dim-start", type=int, required=True)
|
||||
range_parser.add_argument("--dim-end", type=int, required=True)
|
||||
range_parser.add_argument("--dim-increment", type=int, required=True)
|
||||
range_parser.add_argument("--m-constant", type=int, default=None)
|
||||
range_parser.add_argument("--n-constant", type=int, default=None)
|
||||
range_parser.add_argument("--k-constant", type=int, default=None)
|
||||
range_parser.add_argument(
|
||||
"--dim-start",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Start value for M,K,N as common separated list")
|
||||
range_parser.add_argument(
|
||||
"--dim-end",
|
||||
type=str,
|
||||
required=True,
|
||||
help="End value (inclusive) for M,K,N as common separated list")
|
||||
range_parser.add_argument(
|
||||
"--dim-increment",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Increment value for M,K,N as common separated list")
|
||||
range_parser.set_defaults(func=run_range_bench)
|
||||
|
||||
model_parser = subparsers.add_parser("model_bench")
|
||||
@ -369,4 +412,9 @@ Benchmark Machete GEMM.
|
||||
model_parser.set_defaults(func=run_model_bench)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
_SWEEP_SCHEDULES_RESULTS_CSV = args.sweep_csv_out
|
||||
args.func(args)
|
||||
|
||||
if _SWEEP_SCHEDULES_RESULTS is not None:
|
||||
_SWEEP_SCHEDULES_RESULTS.to_csv(_SWEEP_SCHEDULES_RESULTS_CSV)
|
||||
|
||||
@ -10,6 +10,7 @@ from ray.experimental.tqdm_ray import tqdm
|
||||
from transformers import AutoConfig
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
@ -88,22 +89,23 @@ def benchmark_config(
|
||||
input_gating.copy_(gating_output[i])
|
||||
|
||||
def run():
|
||||
fused_moe(
|
||||
x,
|
||||
w1,
|
||||
w2,
|
||||
input_gating,
|
||||
topk,
|
||||
renormalize=True,
|
||||
inplace=True,
|
||||
override_config=config,
|
||||
use_fp8_w8a8=use_fp8_w8a8,
|
||||
use_int8_w8a16=use_int8_w8a16,
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe import override_config
|
||||
with override_config(config):
|
||||
fused_moe(
|
||||
x,
|
||||
w1,
|
||||
w2,
|
||||
input_gating,
|
||||
topk,
|
||||
renormalize=True,
|
||||
inplace=True,
|
||||
use_fp8_w8a8=use_fp8_w8a8,
|
||||
use_int8_w8a16=use_int8_w8a16,
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
)
|
||||
|
||||
# JIT compilation & warmup
|
||||
run()
|
||||
@ -166,7 +168,7 @@ class BenchmarkWorker:
|
||||
|
||||
def __init__(self, seed: int) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
torch.cuda.manual_seed_all(seed)
|
||||
current_platform.seed_everything(seed)
|
||||
self.seed = seed
|
||||
|
||||
def benchmark(
|
||||
@ -180,7 +182,7 @@ class BenchmarkWorker:
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
) -> Tuple[Dict[str, int], float]:
|
||||
torch.cuda.manual_seed_all(self.seed)
|
||||
current_platform.seed_everything(self.seed)
|
||||
dtype_str = get_config_dtype_str(dtype,
|
||||
use_int8_w8a16=use_int8_w8a16,
|
||||
use_fp8_w8a8=use_fp8_w8a8)
|
||||
|
||||
@ -5,6 +5,7 @@ from typing import List, Optional
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
|
||||
create_kv_caches_with_random)
|
||||
|
||||
@ -28,10 +29,7 @@ def main(
|
||||
device: str = "cuda",
|
||||
kv_cache_dtype: Optional[str] = None,
|
||||
) -> None:
|
||||
random.seed(seed)
|
||||
torch.random.manual_seed(seed)
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.manual_seed(seed)
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
scale = float(1.0 / (head_size**0.5))
|
||||
query = torch.empty(num_seqs,
|
||||
|
||||
@ -1,9 +1,9 @@
|
||||
import random
|
||||
import time
|
||||
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
|
||||
|
||||
|
||||
@ -17,10 +17,7 @@ def main(num_tokens: int,
|
||||
do_profile: bool = False,
|
||||
num_warmup_iters: int = 5,
|
||||
num_iters: int = 100) -> None:
|
||||
random.seed(seed)
|
||||
torch.random.manual_seed(seed)
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.manual_seed(seed)
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device("cuda")
|
||||
|
||||
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
||||
|
||||
@ -6,6 +6,7 @@ import torch

from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
                                                          get_rope)
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser

@ -22,9 +23,7 @@ def benchmark_rope_kernels_multi_lora(
    max_position: int = 8192,
    base: int = 10000,
) -> None:
    torch.random.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    current_platform.seed_everything(seed)
    torch.set_default_device(device)
    if rotary_dim is None:
        rotary_dim = head_size

@ -33,7 +32,7 @@ def benchmark_rope_kernels_multi_lora(
    # batched RoPE can take multiple scaling factors
    batched_rope = get_rope(head_size, rotary_dim, max_position, base,
                            is_neox_style, {
                                "type": "linear",
                                "rope_type": "linear",
                                "factor": tuple(scaling_factors)
                            })
    # non-batched RoPE takes only one scaling factor, we create multiple

@ -43,7 +42,7 @@ def benchmark_rope_kernels_multi_lora(
        non_batched_ropes.append(
            get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
                     {
                         "type": "linear",
                         "rope_type": "linear",
                         "factor": (scaling_factor, )
                     }))
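These RoPE hunks rename the scaling-config key from "type" to "rope_type" in the dict passed as the sixth argument of get_rope. A hedged example of a call using the new key; the numeric values here are illustrative only, since the benchmark derives them from its CLI arguments:

    from vllm.model_executor.layers.rotary_embedding import get_rope

    # head_size, rotary_dim, max_position, base, is_neox_style, scaling dict
    rope = get_rope(128, 128, 8192, 10000, True, {
        "rope_type": "linear",   # key renamed from "type"
        "factor": (4.0, ),
    })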
@ -45,8 +45,7 @@ if __name__ == "__main__":
    rows = int(math.ceil(len(results) / 2))
    fig, axs = plt.subplots(rows, 2, figsize=(12, 5 * rows))
    axs = axs.flatten()
    axs_idx = 0
    for shape, data in results.items():
    for axs_idx, (shape, data) in enumerate(results.items()):
        plt.sca(axs[axs_idx])
        df = pd.DataFrame(data)
        sns.lineplot(data=df,

@ -59,6 +58,5 @@ if __name__ == "__main__":
                     palette="Dark2")
        plt.title(f"Shape: {shape}")
        plt.ylabel("time (median, s)")
        axs_idx += 1
    plt.tight_layout()
    plt.savefig("graph_machete_bench.pdf")
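The plotting change swaps manual axs_idx bookkeeping for enumerate, which yields the subplot index alongside each (shape, data) pair. A standalone sketch of the same idiom with made-up data:

    results = {"shape_a": [1.0, 2.0], "shape_b": [3.0, 4.0]}  # illustrative

    # enumerate() provides the running index, so no separate counter needs to
    # be initialized before the loop and incremented inside it.
    for axs_idx, (shape, data) in enumerate(results.items()):
        print(axs_idx, shape, data)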
benchmarks/kernels/requirements.txt (new file, 1 line)
@ -0,0 +1 @@
pandas
@ -4,13 +4,13 @@ PORT=8000
MODEL=$1
TOKENS=$2

docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \
        -v $PWD/data:/data \
docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \
        -v "$PWD/data:/data" \
        ghcr.io/huggingface/text-generation-inference:2.2.0 \
        --model-id $MODEL \
        --model-id "$MODEL" \
        --sharded false \
        --max-input-length 1024 \
        --max-total-tokens 2048 \
        --max-best-of 5 \
        --max-concurrent-requests 5000 \
        --max-batch-total-tokens $TOKENS
        --max-batch-total-tokens "$TOKENS"
@ -16,7 +16,6 @@ def main(args):
        enforce_eager=True,
        enable_prefix_caching=True,
        tensor_parallel_size=args.tensor_parallel_size,
        use_v2_block_manager=args.use_v2_block_manager,
    )

    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)

@ -56,8 +55,5 @@ if __name__ == "__main__":
    parser.add_argument('--enable-prefix-caching',
                        action='store_true',
                        help='enable prefix caching')
    parser.add_argument('--use-v2-block-manager',
                        action='store_true',
                        help='Use BlockSpaceMangerV2')
    args = parser.parse_args()
    main(args)
@ -1,5 +1,8 @@
include(FetchContent)

set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_CXX_STANDARD 17)

#
# Define environment variables for special configurations

@ -15,6 +18,7 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc")
#
list(APPEND CXX_COMPILE_FLAGS
    "-fopenmp"
    "-mf16c"
    "-DVLLM_CPU_EXTENSION")

execute_process(COMMAND cat /proc/cpuinfo

@ -82,9 +86,39 @@ else()
    message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
endif()

#
# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 platforms)
#
if (AVX512_FOUND AND NOT AVX512_DISABLED)
    FetchContent_Declare(
        oneDNN
        GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
        GIT_TAG v3.6
        GIT_PROGRESS TRUE
        GIT_SHALLOW TRUE
    )

    set(ONEDNN_LIBRARY_TYPE "STATIC")
    set(ONEDNN_BUILD_DOC "OFF")
    set(ONEDNN_BUILD_EXAMPLES "OFF")
    set(ONEDNN_BUILD_TESTS "OFF")
    set(ONEDNN_ENABLE_WORKLOAD "INFERENCE")
    set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
    set(ONEDNN_BUILD_GRAPH "OFF")
    set(ONEDNN_ENABLE_JIT_PROFILING "OFF")
    set(ONEDNN_ENABLE_ITT_TASKS "OFF")
    set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
    set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
    set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)

    FetchContent_MakeAvailable(oneDNN)

    list(APPEND LIBS dnnl)
endif()

message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")

list(APPEND LIBS dnnl numa)
list(APPEND LIBS numa)

#
# _C extension

@ -120,4 +154,3 @@ define_gpu_extension_target(
)

message(STATUS "Enabling C extension.")
add_dependencies(default _C)
@ -133,10 +133,181 @@ macro(string_to_ver OUT_VER IN_STR)
  string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
endmacro()

#
# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS` and store them in
# `CUDA_ARCH_FLAGS`.
#
# Example:
#   CMAKE_CUDA_FLAGS="-Wall -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75"
#   clear_cuda_arches(CUDA_ARCH_FLAGS)
#   CUDA_ARCH_FLAGS="-gencode arch=compute_70,code=sm_70;-gencode arch=compute_75,code=sm_75"
#   CMAKE_CUDA_FLAGS="-Wall"
#
macro(clear_cuda_arches CUDA_ARCH_FLAGS)
  # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
  string(REGEX MATCHALL "-gencode arch=[^ ]+" CUDA_ARCH_FLAGS
    ${CMAKE_CUDA_FLAGS})

  # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
  # and passed back via the `CUDA_ARCHITECTURES` property.
  string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
    ${CMAKE_CUDA_FLAGS})
endmacro()
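clear_cuda_arches amounts to two regex passes over one flags string: collect every "-gencode arch=..." token, then strip those tokens from what remains. A rough Python equivalent of the same two steps, for illustration only:

    import re

    # Example input in the shape torch writes into CMAKE_CUDA_FLAGS.
    cmake_cuda_flags = ("-Wall -gencode arch=compute_70,code=sm_70 "
                        "-gencode arch=compute_75,code=sm_75")

    # MATCHALL: collect every "-gencode arch=..." token.
    cuda_arch_flags = re.findall(r"-gencode arch=[^ ]+", cmake_cuda_flags)

    # REPLACE: remove those tokens from the remaining compiler flags.
    cmake_cuda_flags = re.sub(r"-gencode arch=[^ ]+ *", "", cmake_cuda_flags).strip()

    print(cuda_arch_flags)   # ['-gencode arch=compute_70,code=sm_70', '-gencode arch=compute_75,code=sm_75']
    print(cmake_cuda_flags)  # '-Wall'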
#
# Extract unique CUDA architectures from a list of compute capability codes in
# the form `<major><minor>[<letter>]`, convert them to the form
# `<major>.<minor>`, dedupe them, sort them in ascending order, and store them
# in `OUT_ARCHES`.
#
# Example:
#   CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a"
#   extract_unique_cuda_archs_ascending(OUT_ARCHES CUDA_ARCH_FLAGS)
#   OUT_ARCHES="7.5;...;9.0"
function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS)
  set(_CUDA_ARCHES)
  foreach(_ARCH ${CUDA_ARCH_FLAGS})
    string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH})
    if (_COMPUTE)
      set(_COMPUTE ${CMAKE_MATCH_1})
    endif()

    string_to_ver(_COMPUTE_VER ${_COMPUTE})
    list(APPEND _CUDA_ARCHES ${_COMPUTE_VER})
  endforeach()

  list(REMOVE_DUPLICATES _CUDA_ARCHES)
  list(SORT _CUDA_ARCHES COMPARE NATURAL ORDER ASCENDING)
  set(${OUT_ARCHES} ${_CUDA_ARCHES} PARENT_SCOPE)
endfunction()
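The same extraction can be restated in a few lines of Python, which makes the "compute_90a" to "9.0a" normalization easier to see. This is a sketch, not the build system's actual code:

    import re

    def to_ver(code: str) -> str:
        # "90a" -> "9.0a", "75" -> "7.5" (rough analogue of string_to_ver).
        m = re.match(r"([0-9]+)([0-9])(a?)$", code)
        return f"{m.group(1)}.{m.group(2)}{m.group(3)}"

    def extract_unique_cuda_archs_ascending(cuda_arch_flags):
        arches = []
        for flag in cuda_arch_flags:
            m = re.search(r"arch=compute_([0-9]+a?)", flag)
            if m:
                ver = to_ver(m.group(1))
                if ver not in arches:          # dedupe
                    arches.append(ver)
        # natural ascending sort on the numeric part
        return sorted(arches,
                      key=lambda v: tuple(int(x) for x in v.rstrip("a").split(".")))

    print(extract_unique_cuda_archs_ascending(
        ["-gencode arch=compute_90a,code=sm_90a",
         "-gencode arch=compute_75,code=sm_75"]))  # ['7.5', '9.0a']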
#
# For a specific file set the `-gencode` flag in compile options conditionally
# for the CUDA language.
#
# Example:
#   set_gencode_flag_for_srcs(
#     SRCS "foo.cu"
#     ARCH "compute_75"
#     CODE "sm_75")
#   adds: "-gencode arch=compute_75,code=sm_75" to the compile options for
#   `foo.cu` (only for the CUDA language).
#
macro(set_gencode_flag_for_srcs)
  set(options)
  set(oneValueArgs ARCH CODE)
  set(multiValueArgs SRCS)
  cmake_parse_arguments(arg "${options}" "${oneValueArgs}"
                        "${multiValueArgs}" ${ARGN} )
  set(_FLAG -gencode arch=${arg_ARCH},code=${arg_CODE})
  set_property(
    SOURCE ${arg_SRCS}
    APPEND PROPERTY
    COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:${_FLAG}>"
  )

  message(DEBUG "Setting gencode flag for ${arg_SRCS}: ${_FLAG}")
endmacro(set_gencode_flag_for_srcs)

#
# For a list of source files set the `-gencode` flags in the files' specific
# compile options (specifically for the CUDA language).
#
# Arguments are:
#   SRCS: list of source files
#   CUDA_ARCHS: list of CUDA architectures in the form `<major>.<minor>[letter]`
#   BUILD_PTX_FOR_ARCH: if set to true, then the PTX code will be built
#     for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS
#     that is larger than BUILD_PTX_FOR_ARCH.
#
macro(set_gencode_flags_for_srcs)
  set(options)
  set(oneValueArgs BUILD_PTX_FOR_ARCH)
  set(multiValueArgs SRCS CUDA_ARCHS)
  cmake_parse_arguments(arg "${options}" "${oneValueArgs}"
                        "${multiValueArgs}" ${ARGN} )

  foreach(_ARCH ${arg_CUDA_ARCHS})
    string(REPLACE "." "" _ARCH "${_ARCH}")
    set_gencode_flag_for_srcs(
      SRCS ${arg_SRCS}
      ARCH "compute_${_ARCH}"
      CODE "sm_${_ARCH}")
  endforeach()

  if (${arg_BUILD_PTX_FOR_ARCH})
    list(SORT arg_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
    list(GET arg_CUDA_ARCHS -1 _HIGHEST_ARCH)
    if (_HIGHEST_ARCH VERSION_GREATER_EQUAL ${arg_BUILD_PTX_FOR_ARCH})
      string(REPLACE "." "" _PTX_ARCH "${arg_BUILD_PTX_FOR_ARCH}")
      set_gencode_flag_for_srcs(
        SRCS ${arg_SRCS}
        ARCH "compute_${_PTX_ARCH}"
        CODE "compute_${_PTX_ARCH}")
    endif()
  endif()
endmacro()
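set_gencode_flags_for_srcs therefore emits one real-code gencode entry per listed architecture and, optionally, one PTX entry for a baseline arch so newer, unlisted GPUs can still JIT the kernels. A small Python sketch of the flag list it effectively produces; the function and variable names here are illustrative:

    def gencode_flags(cuda_archs, build_ptx_for_arch=None):
        as_tuple = lambda v: tuple(int(x) for x in v.rstrip("a").split("."))
        flags = []
        # One "arch=compute_XY,code=sm_XY" (real code) entry per listed arch.
        for arch in cuda_archs:
            xy = arch.replace(".", "")
            flags.append(f"-gencode arch=compute_{xy},code=sm_{xy}")
        # Optionally add PTX for a baseline arch, but only when some listed
        # arch is at least that new (mirrors the BUILD_PTX_FOR_ARCH check).
        if build_ptx_for_arch is not None:
            highest = max(cuda_archs, key=as_tuple)
            if as_tuple(highest) >= as_tuple(build_ptx_for_arch):
                xy = build_ptx_for_arch.replace(".", "")
                flags.append(f"-gencode arch=compute_{xy},code=compute_{xy}")
        return flags

    print(gencode_flags(["7.5", "8.0", "9.0a"], build_ptx_for_arch="8.0"))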
#
# For the given `SRC_CUDA_ARCHS` list of gencode versions in the form
# `<major>.<minor>[letter]` compute the "loose intersection" with the
# `TGT_CUDA_ARCHS` list of gencodes.
# The loose intersection is defined as:
#   { max{ x \in src | x <= y } | y \in tgt, { x \in src | x <= y } != {} }
# where `<=` is the version comparison operator.
# In other words, for each version in `TGT_CUDA_ARCHS` find the highest version
# in `SRC_CUDA_ARCHS` that is less than or equal to it.
# There is special handling for 9.0a: if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is
# in `TGT_CUDA_ARCHS`, then 9.0a is removed from `SRC_CUDA_ARCHS` and added
# directly to the result.
# The result is stored in `OUT_CUDA_ARCHS`.
#
# Example:
#   SRC_CUDA_ARCHS="7.5;8.0;8.6;9.0;9.0a"
#   TGT_CUDA_ARCHS="8.0;8.9;9.0"
#   cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
#   OUT_CUDA_ARCHS="8.0;8.6;9.0;9.0a"
#
function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
  list(REMOVE_DUPLICATES SRC_CUDA_ARCHS)

  # if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should
  # remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS
  set(_CUDA_ARCHS)
  if ("9.0a" IN_LIST SRC_CUDA_ARCHS)
    list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a")
    if ("9.0" IN_LIST TGT_CUDA_ARCHS)
      set(_CUDA_ARCHS "9.0a")
    endif()
  endif()

  list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)

  # for each ARCH in CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that is
  # less than or equal to ARCH
  foreach(_ARCH ${CUDA_ARCHS})
    set(_TMP_ARCH)
    foreach(_SRC_ARCH ${SRC_CUDA_ARCHS})
      if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH)
        set(_TMP_ARCH ${_SRC_ARCH})
      else()
        break()
      endif()
    endforeach()
    if (_TMP_ARCH)
      list(APPEND _CUDA_ARCHS ${_TMP_ARCH})
    endif()
  endforeach()

  list(REMOVE_DUPLICATES _CUDA_ARCHS)
  set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
endfunction()
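Restated in Python, the loose intersection walks the target list and, for each target version, keeps the highest source version that does not exceed it, with the 9.0a special case bolted on. A sketch using the example from the comment above:

    def loose_intersection(src_archs, tgt_archs):
        as_tuple = lambda v: tuple(int(x) for x in v.rstrip("a").split("."))

        # Special case: keep 9.0a only when the target list asks for 9.0.
        src = sorted((a for a in src_archs if a != "9.0a"), key=as_tuple)
        out = {"9.0a"} if ("9.0a" in src_archs and "9.0" in tgt_archs) else set()

        # For each target arch, take the highest source arch that is <= it.
        for tgt in tgt_archs:
            candidates = [s for s in src if as_tuple(s) <= as_tuple(tgt)]
            if candidates:
                out.add(candidates[-1])
        return sorted(out, key=lambda v: (as_tuple(v), v))

    print(loose_intersection(["7.5", "8.0", "8.6", "9.0", "9.0a"],
                             ["8.0", "8.9", "9.0"]))
    # ['8.0', '8.6', '9.0', '9.0a'] -- matches the documented example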
#
# Override the GPU architectures detected by cmake/torch and filter them by
# `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
# `GPU_ARCHES`.
# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set
# the architectures on a per file basis.
#
# Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`.
#

@ -174,109 +345,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
        "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is"
        " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
    endif()

  elseif(${GPU_LANG} STREQUAL "CUDA")
    #
    # Setup/process CUDA arch flags.
    #
    # The torch cmake setup hardcodes the detected architecture flags in
    # `CMAKE_CUDA_FLAGS`. Since `CMAKE_CUDA_FLAGS` is a "global" variable, it
    # can't be modified on a per-target basis.
    # So, all the `-gencode` flags need to be extracted and removed from
    # `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method.
    # Since it's not possible to use `target_compile_options` for adding target
    # specific `-gencode` arguments, the target's `CUDA_ARCHITECTURES` property
    # must be used instead. This requires repackaging the architecture flags
    # into a format that cmake expects for `CUDA_ARCHITECTURES`.
    #
    # This is a bit fragile in that it depends on torch using `-gencode` as opposed
    # to one of the other nvcc options to specify architectures.
    #
    # Note: torch uses the `TORCH_CUDA_ARCH_LIST` environment variable to override
    # detected architectures.
    #
    message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")

    # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
    string(REGEX MATCHALL "-gencode arch=[^ ]+" _CUDA_ARCH_FLAGS
      ${CMAKE_CUDA_FLAGS})

    # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
    # and passed back via the `CUDA_ARCHITECTURES` property.
    string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
      ${CMAKE_CUDA_FLAGS})

    # If this error is triggered, it might mean that torch has changed how it sets
    # up nvcc architecture code generation flags.
    if (NOT _CUDA_ARCH_FLAGS)
      message(FATAL_ERROR
        "Could not find any architecture related code generation flags in "
        "CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})")
    endif()

    message(DEBUG "final CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
    message(DEBUG "arch flags: ${_CUDA_ARCH_FLAGS}")

    # Initialize the architecture lists to empty.
    set(${GPU_ARCHES})

    # Process each `gencode` flag.
    foreach(_ARCH ${_CUDA_ARCH_FLAGS})
      # For each flag, extract the version number and whether it refers to PTX
      # or native code.
      # Note: if a regex matches then `CMAKE_MATCH_1` holds the binding
      # for that match.

      string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH})
      if (_COMPUTE)
        set(_COMPUTE ${CMAKE_MATCH_1})
      endif()

      string(REGEX MATCH "code=sm_\([0-9]+a?\)" _SM ${_ARCH})
      if (_SM)
        set(_SM ${CMAKE_MATCH_1})
      endif()

      string(REGEX MATCH "code=compute_\([0-9]+a?\)" _CODE ${_ARCH})
      if (_CODE)
        set(_CODE ${CMAKE_MATCH_1})
      endif()

      # Make sure the virtual architecture can be matched.
      if (NOT _COMPUTE)
        message(FATAL_ERROR
          "Could not determine virtual architecture from: ${_ARCH}.")
      endif()

      # One of sm_ or compute_ must exist.
      if ((NOT _SM) AND (NOT _CODE))
        message(FATAL_ERROR
          "Could not determine a codegen architecture from: ${_ARCH}.")
      endif()

      if (_SM)
        # The -real suffix tells CMake to generate only ELF code for the kernels.
        # We want this, otherwise the added PTX (the default) would increase binary size.
        set(_VIRT "-real")
        set(_CODE_ARCH ${_SM})
      else()
        # The -virtual suffix tells CMake to generate PTX code for the kernels.
        set(_VIRT "-virtual")
        set(_CODE_ARCH ${_CODE})
      endif()

      # Check if the current version is in the supported arch list.
      string_to_ver(_CODE_VER ${_CODE_ARCH})
      if (NOT _CODE_VER IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
        message(STATUS "discarding unsupported CUDA arch ${_VER}.")
        continue()
      endif()

      # Add it to the arch list.
      list(APPEND ${GPU_ARCHES} "${_CODE_ARCH}${_VIRT}")
    endforeach()
  endif()
  message(STATUS "${GPU_LANG} target arches: ${${GPU_ARCHES}}")
endmacro()

#
@ -350,18 +419,15 @@ function (define_gpu_extension_target GPU_MOD_NAME)
  target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
    ${GPU_INCLUDE_DIRECTORIES})

  # TODO: is torch_python_LIBRARY needed?
  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${torch_python_LIBRARY}
    ${GPU_LIBRARIES})
  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})

  # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
  # dependencies that are not necessary and may not be installed.
  if (GPU_LANGUAGE STREQUAL "CUDA")
    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB}
      ${CUDA_LIBRARIES})
    target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver)
  else()
    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
  endif()

  install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION})
  install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME})
endfunction()
@ -1,17 +1,19 @@
# ruff: noqa
# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py

# Unlike the rest of the PyTorch this file must be python2 compliant.
# This script outputs relevant system environment info
# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
import datetime
import locale
import os
import re
import subprocess
import sys
# Unlike the rest of the PyTorch this file must be python2 compliant.
# This script outputs relevant system environment info
# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
from collections import namedtuple

from vllm.envs import environment_variables

try:
    import torch
    TORCH_AVAILABLE = True

@ -52,6 +54,7 @@ SystemEnv = namedtuple(
    'vllm_version',  # vllm specific field
    'vllm_build_flags',  # vllm specific field
    'gpu_topo',  # vllm specific field
    'env_vars',
])

DEFAULT_CONDA_PATTERNS = {
@ -267,13 +270,16 @@ def get_neuron_sdk_version(run_lambda):


def get_vllm_version():
    try:
        import vllm
        return vllm.__version__ + "@" + vllm.__commit__
    except Exception:
        # old version of vllm does not have __commit__
        return 'N/A'
    from vllm import __version__, __version_tuple__

    if __version__ == "dev":
        return "N/A (dev)"

    if len(__version_tuple__) == 4:  # dev build
        git_sha = __version_tuple__[-1][1:]  # type: ignore
        return f"{__version__} (git sha: {git_sha}"

    return __version__
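The rewritten get_vllm_version reads a setuptools-scm style __version_tuple__ instead of the old vllm.__commit__ attribute; for dev builds the tuple's last element appears to carry the abbreviated git sha with a one-character prefix. A hedged illustration of just the parsing step, with made-up version metadata:

    # Hypothetical values; the real ones come from vllm's generated version module.
    __version__ = "0.6.4.dev123"
    __version_tuple__ = (0, 6, 4, "gabc1234")

    if len(__version_tuple__) == 4:              # dev builds carry a trailing sha field
        git_sha = __version_tuple__[-1][1:]      # drop the leading "g"
        print(f"{__version__} (git sha: {git_sha})")  # 0.6.4.dev123 (git sha: abc1234)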
def summarize_vllm_build_flags():
    # This could be a static method if the flags are constant, or dynamic if
    # you need to check environment variables, etc.

@ -285,9 +291,14 @@ def summarize_vllm_build_flags():


def get_gpu_topo(run_lambda):
    output = None

    if get_platform() == 'linux':
        return run_and_read_all(run_lambda, 'nvidia-smi topo -m')
    return None
        output = run_and_read_all(run_lambda, 'nvidia-smi topo -m')
        if output is None:
            output = run_and_read_all(run_lambda, 'rocm-smi --showtopo')

    return output


# example outputs of CPU infos
@ -504,6 +515,22 @@ def is_xnnpack_available():
    else:
        return "N/A"


def get_env_vars():
    env_vars = ''
    secret_terms = ('secret', 'token', 'api', 'access', 'password')
    report_prefix = ("TORCH", "NCCL", "PYTORCH",
                     "CUDA", "CUBLAS", "CUDNN",
                     "OMP_", "MKL_",
                     "NVIDIA")
    for k, v in os.environ.items():
        if any(term in k.lower() for term in secret_terms):
            continue
        if k in environment_variables:
            env_vars = env_vars + "{}={}".format(k, v) + "\n"
        if k.startswith(report_prefix):
            env_vars = env_vars + "{}={}".format(k, v) + "\n"

    return env_vars


def get_env_info():
    run_lambda = run

@ -575,6 +602,7 @@ def get_env_info():
        vllm_version=vllm_version,
        vllm_build_flags=vllm_build_flags,
        gpu_topo=gpu_topo,
        env_vars=get_env_vars(),
    )


@ -623,6 +651,8 @@ vLLM Build Flags:
{vllm_build_flags}
GPU Topology:
{gpu_topo}

{env_vars}
""".strip()
Some files were not shown because too many files have changed in this diff.