Compare commits
574 Commits
tms/distri
...
lwilkinson
| Author | SHA1 | Date | |
|---|---|---|---|
| de92ab523b | |||
| 9f04a6cf57 | |||
| a0a11bc0b5 | |||
| 143b09e6be | |||
| 090f485aa1 | |||
| 6d76bd034a | |||
| 9e16220e4e | |||
| 5215c80a49 | |||
| dd2a94fd9d | |||
| e526b1c091 | |||
| 44ead56ad5 | |||
| 28e7c30b01 | |||
| 2cf200c5b8 | |||
| 5bbfd95bdb | |||
| 6b0c303ab4 | |||
| 4819bb8715 | |||
| 0edaf752d7 | |||
| 6e8d8c4afb | |||
| 8d524ce79f | |||
| 9f9c38c392 | |||
| a65f46be5e | |||
| 57393715e8 | |||
| ee2eb6ecd8 | |||
| 23322431c8 | |||
| 3654847db5 | |||
| eefbf4a68b | |||
| 88faa466d7 | |||
| 881e1af43a | |||
| d84b97a3e3 | |||
| d331759488 | |||
| 9659bc7f27 | |||
| 3277e8f9e1 | |||
| 8d705996df | |||
| 38c8bce8b6 | |||
| ac45c44d98 | |||
| d6664664b4 | |||
| b879ecd6e2 | |||
| 3f8e952179 | |||
| 326a1b001d | |||
| 2d7b09b998 | |||
| 97608dc276 | |||
| 3146519add | |||
| 8026a335a1 | |||
| a59cd9d9f7 | |||
| 5c54d9759d | |||
| 0a6d305e0f | |||
| f81c1bb055 | |||
| fb0e0d46fc | |||
| 26b5f7bd2a | |||
| dfbc1f8880 | |||
| 87c94bc879 | |||
| 28b18cc741 | |||
| 4931486988 | |||
| 0f81b310db | |||
| e6680f9e25 | |||
| 27a145e893 | |||
| da31f6ad3d | |||
| 98df153abf | |||
| e0f63e4a35 | |||
| b4e081cb15 | |||
| 79731a79f0 | |||
| 53d7c39271 | |||
| 61dcc280fa | |||
| 0f46a780d4 | |||
| e1a7fe4af5 | |||
| 82de9b9d46 | |||
| ad57f23f6a | |||
| 3700642013 | |||
| 0bd409cf01 | |||
| e360316ab9 | |||
| c3e0e9337e | |||
| e283eff060 | |||
| ba17d955a0 | |||
| 6e672daf62 | |||
| 2dff2e21d9 | |||
| 71470bc4af | |||
| 9e0726e5bf | |||
| 53c21e492e | |||
| 0780bb5783 | |||
| 58bb902186 | |||
| 7349d5268b | |||
| 9484641616 | |||
| 207b750e19 | |||
| 5daffe7cf6 | |||
| 2836dd73f1 | |||
| d2aab336ad | |||
| 9532a6d563 | |||
| 3e36fcbee6 | |||
| 055bd3978e | |||
| 0f7919fca0 | |||
| 61445453df | |||
| ec02e536df | |||
| 9cb497bfa3 | |||
| ca9e2be3ed | |||
| 601f856d56 | |||
| 287f527f54 | |||
| 582d301f98 | |||
| f12d9256b3 | |||
| b9b753e7a7 | |||
| 56bd537dde | |||
| 8f0d516715 | |||
| f4135232b9 | |||
| 4904e53c32 | |||
| 004203e953 | |||
| 5c765aec65 | |||
| ad510309ee | |||
| 366f6b3a4d | |||
| 6e599eebe8 | |||
| 88edf5994c | |||
| ff08e51940 | |||
| 8f4a1c9a04 | |||
| 36ede45989 | |||
| 0e40b26073 | |||
| 0271c2ff2f | |||
| e91d3c9cda | |||
| bf668b5bf5 | |||
| da3e0bd6e5 | |||
| fcfd1eb9c5 | |||
| d979dd6beb | |||
| b876860c62 | |||
| 13986365a9 | |||
| 5c8fe389d6 | |||
| 5bbaf492a6 | |||
| 533db0935d | |||
| fc91da5499 | |||
| 547795232d | |||
| 30ef30ed5a | |||
| 02f82fe438 | |||
| 2ca5f82c2a | |||
| 6f8d261882 | |||
| 4cd7fe6cea | |||
| 16f3250527 | |||
| e3bc17ceea | |||
| 05cbbe20c5 | |||
| 65f311ce59 | |||
| 1b0a155534 | |||
| 44bc46da60 | |||
| b7b23da4d2 | |||
| fdde18229e | |||
| b917da442b | |||
| fb58e3a651 | |||
| 76080cff79 | |||
| ba5c5e5404 | |||
| 555e7225bc | |||
| 0e36abf993 | |||
| 452b2a3180 | |||
| 0d0cc9e150 | |||
| 9266d98048 | |||
| 176bbce1db | |||
| a1873db23d | |||
| a33ea28b1b | |||
| 7b49cb1c6b | |||
| f03e9cf2bb | |||
| 37f86d9048 | |||
| 58b11b24a6 | |||
| ad341c5194 | |||
| 759b87ef3e | |||
| f693b067a2 | |||
| 04e38500ee | |||
| ab714131e4 | |||
| 755fa8b657 | |||
| 2470419119 | |||
| 61a6905ab0 | |||
| 37efc63b64 | |||
| a4528f0cac | |||
| a2480251ec | |||
| 7234fe2685 | |||
| f1e2c095ec | |||
| 12a223ef9b | |||
| e18f085103 | |||
| afa2607596 | |||
| 48b763d6b5 | |||
| 947e982ede | |||
| c6c9122d50 | |||
| 8aa1485fcf | |||
| 89ac266b26 | |||
| c6f36cfa26 | |||
| b18b417fbf | |||
| 9ba1c88a93 | |||
| e0e58f9729 | |||
| b361f14e39 | |||
| 01c753ed98 | |||
| 94b71ae106 | |||
| 7d44c691b0 | |||
| e17a4d3bf9 | |||
| ec261b0291 | |||
| 04fe61aa3d | |||
| 25708d317a | |||
| 0e18a5d058 | |||
| 34a20c49b3 | |||
| 31084b3b1f | |||
| bccc43c033 | |||
| 1395dd9c28 | |||
| 9ace2eaf35 | |||
| 656c24f1b5 | |||
| 63fe3a700f | |||
| 0ae970ed15 | |||
| 65e8466c37 | |||
| 1b769dccf3 | |||
| 2cc571199b | |||
| a4ed731546 | |||
| d128d0d554 | |||
| a6c050286a | |||
| 139a7f07bd | |||
| 150d9e6337 | |||
| 139a97ec56 | |||
| 18cc33dd60 | |||
| 7656cf4cf3 | |||
| 3ea57a56d9 | |||
| 75856bc2cb | |||
| 304dcdf575 | |||
| 88e46c7c8d | |||
| d8937de4c8 | |||
| e626d286f5 | |||
| c7ffe93d9c | |||
| 15a72ac478 | |||
| 04ff4be310 | |||
| 93269bb43e | |||
| 82acf2184d | |||
| 86ae693f20 | |||
| 8f605ee309 | |||
| a9b2a1d704 | |||
| 57c22e57f9 | |||
| bda9d0535f | |||
| 3d847a3125 | |||
| 5f8c9a425e | |||
| 1cbf951ba2 | |||
| a8936e5193 | |||
| 01a395e9e7 | |||
| 971948b846 | |||
| eed2f463b2 | |||
| 20950b29fb | |||
| 3339cba3ff | |||
| 0b8caf9095 | |||
| ccf27cc4d4 | |||
| c657369841 | |||
| 6c66f28fa5 | |||
| de509ae8eb | |||
| e7c4f9ee86 | |||
| 9094d11c5d | |||
| 56e544f24b | |||
| 97d6c30cc9 | |||
| a40a8506df | |||
| c215f5c877 | |||
| 1cd6eaba54 | |||
| f27fdfc3ed | |||
| de10ff0b7c | |||
| 9d197280fa | |||
| e98def439c | |||
| 05c1126f29 | |||
| 875af38e01 | |||
| 7728dd77bb | |||
| 2f6e6b33fb | |||
| a55c95096b | |||
| 97349fe2bc | |||
| 62965de5fe | |||
| 7ae75fa6d0 | |||
| f1b286b2fb | |||
| c7742d6113 | |||
| cea96a0156 | |||
| 2eddd437ba | |||
| 75d29cf4e1 | |||
| 41d3082c41 | |||
| 7cfea0df39 | |||
| 1c41175b2a | |||
| b9ad5e4588 | |||
| ee70ce0e4e | |||
| 1ba3ae80bf | |||
| 5ac3168ee3 | |||
| 396ee94180 | |||
| e189b50f53 | |||
| 136d750f5f | |||
| b3caeb82e7 | |||
| eab2f3980c | |||
| 9fe98d4250 | |||
| 29c6fbe58c | |||
| c72f049cb4 | |||
| f3a683b7c9 | |||
| 46d81d6951 | |||
| 5c3f2628d5 | |||
| 7311f74468 | |||
| 8ed01e32f7 | |||
| e38e96a3c0 | |||
| 40d86ee412 | |||
| 85d051f026 | |||
| 5140f54b89 | |||
| 947edd099e | |||
| fde60ee775 | |||
| b38bc652ac | |||
| adaf2c6d4f | |||
| 42343f1f89 | |||
| 965bc71b04 | |||
| 807a328bb6 | |||
| e0be2c4d09 | |||
| 9c8b2c2a8a | |||
| 2212cd6cfb | |||
| ce3a9b1378 | |||
| 2ce90e5b01 | |||
| 633f6e804b | |||
| b57296bb9a | |||
| 34ddcf9ff4 | |||
| fe56180c7f | |||
| 07d80d7b0e | |||
| 2dd72d23d9 | |||
| a6c7fb8cff | |||
| a7272c23d0 | |||
| 6066284914 | |||
| 1e9ea8e69d | |||
| d9f9a3fd96 | |||
| 1b25f1fe75 | |||
| e8cb0d0495 | |||
| 684174115d | |||
| cdb79ee63d | |||
| 5a19a6c670 | |||
| 2ded067fd2 | |||
| 13abd0eaf9 | |||
| 61b8cea3b4 | |||
| 526078a96c | |||
| 6da0078523 | |||
| 73e3949d07 | |||
| 6eca337ce0 | |||
| 85bda9e7d0 | |||
| 610852a423 | |||
| f0f4de8f26 | |||
| fc5f756db4 | |||
| e74bfc70e4 | |||
| 90eeea8f85 | |||
| dde295a934 | |||
| 6d8d0a24c0 | |||
| 11ef7a611e | |||
| dc2f159f8a | |||
| d5b981f8b1 | |||
| eec6942014 | |||
| fd48d99ffd | |||
| f8c15c4efb | |||
| aa08a954f9 | |||
| 13e4ee1dc3 | |||
| 772ce5af97 | |||
| 63d92abb7c | |||
| 11599b0e1f | |||
| f3137cdd81 | |||
| 82ec66f514 | |||
| 78c13e30e1 | |||
| 5c9b807b34 | |||
| 14bf19e39f | |||
| 4ac7713e32 | |||
| 8560a5b258 | |||
| 316b1bf706 | |||
| 7c734ee09b | |||
| f59ec35b7f | |||
| 2671334d45 | |||
| 2cc5016a19 | |||
| 6929f8b437 | |||
| 32ec9e2f2a | |||
| accac82928 | |||
| 23637dcdef | |||
| 6364af92f8 | |||
| 7aaa2bd5a8 | |||
| 2f5c14de6a | |||
| f002e9a870 | |||
| a1f3610fc6 | |||
| 4ecedd1806 | |||
| 107111a859 | |||
| 2dec7c1a5d | |||
| 08d2bd78da | |||
| 4f76a05f4f | |||
| f154bb9ff0 | |||
| 3ec7170ff1 | |||
| c401c64b4c | |||
| b77c7d327f | |||
| 35bc8bd5fb | |||
| 4594fc3b28 | |||
| ae268b6326 | |||
| 35366ae57c | |||
| 2226d5bd85 | |||
| 44554a0068 | |||
| 226b452a20 | |||
| f38ee34a0a | |||
| b194557a6c | |||
| 774d0c014b | |||
| 2c8db17cfd | |||
| 4fb56914c5 | |||
| 6d83b5ef3f | |||
| 0df4d9b06b | |||
| ed25054577 | |||
| 10904e6d75 | |||
| a32237665d | |||
| bc8a8ce5ec | |||
| 32142b3c62 | |||
| 82b8027be6 | |||
| 3779eb8c81 | |||
| 9e23ad9655 | |||
| e69a92a1ce | |||
| 8425f785ad | |||
| c17231e827 | |||
| 6e5b5ca580 | |||
| 488d8a986a | |||
| af376ca19d | |||
| e7b2042681 | |||
| 90f1e55421 | |||
| 5e70dcd6e6 | |||
| 25d585ab7b | |||
| 8d0a01a5f2 | |||
| 0ec82edda5 | |||
| 005ae9be6c | |||
| 29d1ffc5b4 | |||
| 304dce7ec0 | |||
| 6ece16c4fe | |||
| a0e827e07c | |||
| a15a50fc17 | |||
| 6dda13c86b | |||
| 6b46c4b653 | |||
| d97841078b | |||
| e6b90a2805 | |||
| be54a951a3 | |||
| 042af0c8d3 | |||
| 378d33c392 | |||
| 940af1f03a | |||
| 92615d7fe8 | |||
| 8188196a1c | |||
| 7ba34b1241 | |||
| 9499e26e2a | |||
| 51ba839555 | |||
| d1fb65bde3 | |||
| 3a1d8940ae | |||
| 2b504eb770 | |||
| 10eb24cc91 | |||
| 2e8cbb58f3 | |||
| 752c6ade2e | |||
| 881e3cbe3b | |||
| 9f414a12ad | |||
| 6a971ed692 | |||
| da6579bf41 | |||
| c81259d33a | |||
| e3a0e43d7f | |||
| b3d82108e7 | |||
| 6d0734c562 | |||
| 7d94577138 | |||
| 59f935300c | |||
| 18e519ec86 | |||
| 1eaff27815 | |||
| cf8cc32674 | |||
| 3a2cb2649d | |||
| 3e04107d97 | |||
| 37bd8d6e4c | |||
| 468e2400fe | |||
| dcc6cfb991 | |||
| dd572c0ab3 | |||
| 9ffe905a41 | |||
| 9a9fda1423 | |||
| 466e878f2a | |||
| 217937221b | |||
| 5782581acf | |||
| 0f199f197b | |||
| 29a5ac1d04 | |||
| b53450e368 | |||
| 9b5913ed10 | |||
| 376e7eb838 | |||
| 90330563c6 | |||
| 462c6b0b50 | |||
| bfa828f399 | |||
| dc1b6af362 | |||
| 716b03277e | |||
| 1a0e7110dd | |||
| 82ae694de6 | |||
| 10ca263058 | |||
| 908e9f8f54 | |||
| 06cc133a63 | |||
| 3a41a3dcff | |||
| bb0645c644 | |||
| 510e839429 | |||
| f7b6e600b8 | |||
| 0056be26f6 | |||
| 7cc5a549ad | |||
| 83caef8bac | |||
| 2f3461ad23 | |||
| 7e2ff2620e | |||
| 1d75a029a9 | |||
| 17a7ceef27 | |||
| 6e2a3c0841 | |||
| 631be12edb | |||
| a9d47e8652 | |||
| fc562e22e2 | |||
| 1ca65412b8 | |||
| 3112714bdc | |||
| 0c03d154b5 | |||
| 9b7edc0343 | |||
| be2e1632fd | |||
| ce3ef95c11 | |||
| 18f7bfb501 | |||
| 3d833aa759 | |||
| 0e499c4f4d | |||
| 0767d9863f | |||
| c0efbbb5de | |||
| f7a3ee0ea1 | |||
| 57d404bbb8 | |||
| d833982e48 | |||
| 4672c72f44 | |||
| af68574e3d | |||
| 78228a67ce | |||
| 54deb61b87 | |||
| 0e2b4bd546 | |||
| e2ba707d64 | |||
| 44a2b3494e | |||
| 144b148de2 | |||
| 97dbafaad6 | |||
| 96c0c4ea66 | |||
| 930efd02ab | |||
| a4def24c2c | |||
| ff2dd13145 | |||
| 0889f66297 | |||
| 1d112d90a5 | |||
| b74c731342 | |||
| d682f5e1bd | |||
| ef3c01c975 | |||
| 642bf2dd8b | |||
| e6e3407b8d | |||
| 60499f63af | |||
| 05ddc34913 | |||
| a00dabcb33 | |||
| a8675b7d98 | |||
| 8a75b3a1e5 | |||
| f8848bb201 | |||
| 2e3484c237 | |||
| e080e068ed | |||
| 5f4a501b9a | |||
| 539c0c3add | |||
| 18e7d6c7b8 | |||
| 2731e8cbcb | |||
| 919eef995b | |||
| e34e4411b9 | |||
| d46397661f | |||
| 243eac58a4 | |||
| 8332924320 | |||
| d4b502a73a | |||
| 44a595f6d6 | |||
| 92e0cc79a8 | |||
| 8ea80fca4a | |||
| 21d9529a79 | |||
| d6eca0c130 | |||
| 6645882e95 | |||
| 065816d25f | |||
| 90e46ee5e3 | |||
| 8f592524cb | |||
| 0323e29153 | |||
| 252bf0809e | |||
| 62da375465 | |||
| 5b0249b86e | |||
| 895a6c2a08 | |||
| 5cc573e791 | |||
| f0b66d6929 | |||
| a743a35948 | |||
| 7b31e8a8ff | |||
| 2f3920638c | |||
| 020d9b05bc | |||
| 37bdf9f324 | |||
| e4419df256 | |||
| 952f3c5c1e | |||
| 9edd08231b | |||
| 2dc3b8b0a2 | |||
| 18bf91e6a8 | |||
| 00f526f55b | |||
| a8439e2fd4 | |||
| 2a7f25fbe2 | |||
| 9c60a6299d | |||
| 2259b47951 | |||
| 04f11d97a0 | |||
| ffb740ae95 | |||
| 020269c4c5 | |||
| 9ccfd094ff | |||
| f93bdd3151 | |||
| df8f889f37 | |||
| 37c9babaa0 | |||
| 8293182c8c |
@ -28,6 +28,7 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performanc
|
|||||||
## Trigger the benchmark
|
## Trigger the benchmark
|
||||||
|
|
||||||
Performance benchmark will be triggered when:
|
Performance benchmark will be triggered when:
|
||||||
|
|
||||||
- A PR being merged into vllm.
|
- A PR being merged into vllm.
|
||||||
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
|
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
|
||||||
|
|
||||||
@ -38,6 +39,7 @@ bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
|||||||
```
|
```
|
||||||
|
|
||||||
Runtime environment variables:
|
Runtime environment variables:
|
||||||
|
|
||||||
- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
|
- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
|
||||||
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
|
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
|
||||||
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
|
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
|
||||||
@ -46,12 +48,14 @@ Runtime environment variables:
|
|||||||
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
|
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
|
||||||
|
|
||||||
Nightly benchmark will be triggered when:
|
Nightly benchmark will be triggered when:
|
||||||
|
|
||||||
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
|
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
|
||||||
|
|
||||||
## Performance benchmark details
|
## Performance benchmark details
|
||||||
|
|
||||||
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
|
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
|
||||||
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
|
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
|
||||||
|
>
|
||||||
### Latency test
|
### Latency test
|
||||||
|
|
||||||
Here is an example of one test inside `latency-tests.json`:
|
Here is an example of one test inside `latency-tests.json`:
|
||||||
@ -74,7 +78,7 @@ Here is an example of one test inside `latency-tests.json`:
|
|||||||
In this example:
|
In this example:
|
||||||
|
|
||||||
- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
|
- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
|
||||||
- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
|
- The `parameters` attribute control the command line arguments to be used for `vllm bench latency`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
|
||||||
|
|
||||||
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
|
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
|
||||||
|
|
||||||
@ -82,13 +86,13 @@ WARNING: The benchmarking script will save json results by itself, so please do
|
|||||||
|
|
||||||
### Throughput test
|
### Throughput test
|
||||||
|
|
||||||
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
|
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `vllm bench throughput`.
|
||||||
|
|
||||||
The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
|
The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
|
||||||
|
|
||||||
### Serving test
|
### Serving test
|
||||||
|
|
||||||
We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
|
We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
[
|
[
|
||||||
@ -118,8 +122,8 @@ Inside this example:
|
|||||||
|
|
||||||
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
|
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
|
||||||
- The `server-parameters` includes the command line arguments for vLLM server.
|
- The `server-parameters` includes the command line arguments for vLLM server.
|
||||||
- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
|
- The `client-parameters` includes the command line arguments for `vllm bench serve`.
|
||||||
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
|
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `vllm bench serve`
|
||||||
|
|
||||||
The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
|
The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
|
||||||
|
|
||||||
@ -149,6 +153,7 @@ Here is an example using the script to compare result_a and result_b without det
|
|||||||
|
|
||||||
Here is an example using the script to compare result_a and result_b with detail test name.
|
Here is an example using the script to compare result_a and result_b with detail test name.
|
||||||
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
|
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
|
||||||
|
|
||||||
| | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio |
|
| | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio |
|
||||||
|---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------|
|
|---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------|
|
||||||
| 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 |
|
| 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 |
|
||||||
|
|||||||
@ -1,3 +1,4 @@
|
|||||||
|
# Nightly benchmark annotation
|
||||||
|
|
||||||
## Description
|
## Description
|
||||||
|
|
||||||
@ -13,15 +14,15 @@ Please download the visualization scripts in the post
|
|||||||
|
|
||||||
- Find the docker we use in `benchmarking pipeline`
|
- Find the docker we use in `benchmarking pipeline`
|
||||||
- Deploy the docker, and inside the docker:
|
- Deploy the docker, and inside the docker:
|
||||||
- Download `nightly-benchmarks.zip`.
|
- Download `nightly-benchmarks.zip`.
|
||||||
- In the same folder, run the following code:
|
- In the same folder, run the following code:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
export HF_TOKEN=<your HF token>
|
export HF_TOKEN=<your HF token>
|
||||||
apt update
|
apt update
|
||||||
apt install -y git
|
apt install -y git
|
||||||
unzip nightly-benchmarks.zip
|
unzip nightly-benchmarks.zip
|
||||||
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
|
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
And the results will be inside `./benchmarks/results`.
|
And the results will be inside `./benchmarks/results`.
|
||||||
|
|||||||
@ -13,25 +13,25 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/
|
|||||||
## Setup
|
## Setup
|
||||||
|
|
||||||
- Docker images:
|
- Docker images:
|
||||||
- vLLM: `vllm/vllm-openai:v0.6.2`
|
- vLLM: `vllm/vllm-openai:v0.6.2`
|
||||||
- SGLang: `lmsysorg/sglang:v0.3.2-cu121`
|
- SGLang: `lmsysorg/sglang:v0.3.2-cu121`
|
||||||
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
|
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
|
||||||
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
|
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
|
||||||
- *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
|
- *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
|
||||||
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
|
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
|
||||||
- Hardware
|
- Hardware
|
||||||
- 8x Nvidia A100 GPUs
|
- 8x Nvidia A100 GPUs
|
||||||
- Workload:
|
- Workload:
|
||||||
- Dataset
|
- Dataset
|
||||||
- ShareGPT dataset
|
- ShareGPT dataset
|
||||||
- Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
|
- Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
|
||||||
- Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
|
- Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
|
||||||
- Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
|
- Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
|
||||||
- Models: llama-3 8B, llama-3 70B.
|
- Models: llama-3 8B, llama-3 70B.
|
||||||
- We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
|
- We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
|
||||||
- Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
|
- Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
|
||||||
- Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
|
- Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
|
||||||
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
|
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
|
||||||
|
|
||||||
## Known issues
|
## Known issues
|
||||||
|
|
||||||
|
|||||||
@ -1,3 +1,4 @@
|
|||||||
|
# Performance benchmarks descriptions
|
||||||
|
|
||||||
## Latency tests
|
## Latency tests
|
||||||
|
|
||||||
|
|||||||
@ -44,6 +44,7 @@ serving_column_mapping = {
|
|||||||
"test_name": "Test name",
|
"test_name": "Test name",
|
||||||
"gpu_type": "GPU",
|
"gpu_type": "GPU",
|
||||||
"completed": "# of req.",
|
"completed": "# of req.",
|
||||||
|
"max_concurrency": "# of max concurrency.",
|
||||||
"request_throughput": "Tput (req/s)",
|
"request_throughput": "Tput (req/s)",
|
||||||
"total_token_throughput": "Total Token Tput (tok/s)",
|
"total_token_throughput": "Total Token Tput (tok/s)",
|
||||||
"output_throughput": "Output Tput (tok/s)",
|
"output_throughput": "Output Tput (tok/s)",
|
||||||
@ -100,7 +101,7 @@ if __name__ == "__main__":
|
|||||||
raw_result = json.loads(f.read())
|
raw_result = json.loads(f.read())
|
||||||
|
|
||||||
if "serving" in str(test_file):
|
if "serving" in str(test_file):
|
||||||
# this result is generated via `benchmark_serving.py`
|
# this result is generated via `vllm bench serve` command
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
# attach the benchmarking command to raw_result
|
||||||
try:
|
try:
|
||||||
@ -120,7 +121,7 @@ if __name__ == "__main__":
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
elif "latency" in f.name:
|
elif "latency" in f.name:
|
||||||
# this result is generated via `benchmark_latency.py`
|
# this result is generated via `vllm bench latency` command
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
# attach the benchmarking command to raw_result
|
||||||
try:
|
try:
|
||||||
@ -148,7 +149,7 @@ if __name__ == "__main__":
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
elif "throughput" in f.name:
|
elif "throughput" in f.name:
|
||||||
# this result is generated via `benchmark_throughput.py`
|
# this result is generated via `vllm bench throughput` command
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
# attach the benchmarking command to raw_result
|
||||||
try:
|
try:
|
||||||
|
|||||||
@ -73,7 +73,7 @@ get_current_llm_serving_engine() {
|
|||||||
echo "Container: vllm"
|
echo "Container: vllm"
|
||||||
# move to a completely irrelevant directory, to avoid import vllm from current folder
|
# move to a completely irrelevant directory, to avoid import vllm from current folder
|
||||||
export CURRENT_LLM_SERVING_ENGINE=vllm
|
export CURRENT_LLM_SERVING_ENGINE=vllm
|
||||||
|
|
||||||
return
|
return
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
@ -95,12 +95,14 @@ json2args() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
kill_gpu_processes() {
|
kill_gpu_processes() {
|
||||||
pkill -f python
|
pkill -f '[p]ython'
|
||||||
pkill -f python3
|
pkill -f '[p]ython3'
|
||||||
pkill -f tritonserver
|
pkill -f '[t]ritonserver'
|
||||||
pkill -f pt_main_thread
|
pkill -f '[p]t_main_thread'
|
||||||
pkill -f text-generation
|
pkill -f '[t]ext-generation'
|
||||||
pkill -f lmdeploy
|
pkill -f '[l]mdeploy'
|
||||||
|
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
|
||||||
|
pkill -f '[V]LLM'
|
||||||
|
|
||||||
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||||
sleep 1
|
sleep 1
|
||||||
@ -125,7 +127,7 @@ ensure_installed() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
run_serving_tests() {
|
run_serving_tests() {
|
||||||
# run serving tests using `benchmark_serving.py`
|
# run serving tests using `vllm bench serve` command
|
||||||
# $1: a json file specifying serving test cases
|
# $1: a json file specifying serving test cases
|
||||||
|
|
||||||
local serving_test_file
|
local serving_test_file
|
||||||
@ -225,7 +227,7 @@ run_serving_tests() {
|
|||||||
|
|
||||||
if [[ "$dataset_name" = "sharegpt" ]]; then
|
if [[ "$dataset_name" = "sharegpt" ]]; then
|
||||||
|
|
||||||
client_command="python3 benchmark_serving.py \
|
client_command="vllm bench serve \
|
||||||
--backend $backend \
|
--backend $backend \
|
||||||
--tokenizer /tokenizer_cache \
|
--tokenizer /tokenizer_cache \
|
||||||
--model $model \
|
--model $model \
|
||||||
@ -246,7 +248,7 @@ run_serving_tests() {
|
|||||||
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
|
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
|
||||||
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
|
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
|
||||||
|
|
||||||
client_command="python3 benchmark_serving.py \
|
client_command="vllm bench serve \
|
||||||
--backend $backend \
|
--backend $backend \
|
||||||
--tokenizer /tokenizer_cache \
|
--tokenizer /tokenizer_cache \
|
||||||
--model $model \
|
--model $model \
|
||||||
@ -265,13 +267,13 @@ run_serving_tests() {
|
|||||||
$client_args"
|
$client_args"
|
||||||
|
|
||||||
else
|
else
|
||||||
|
|
||||||
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
|
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
|
||||||
exit 1
|
exit 1
|
||||||
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
echo "Running test case $test_name with qps $qps"
|
echo "Running test case $test_name with qps $qps"
|
||||||
echo "Client command: $client_command"
|
echo "Client command: $client_command"
|
||||||
@ -302,7 +304,7 @@ run_serving_tests() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
run_genai_perf_tests() {
|
run_genai_perf_tests() {
|
||||||
# run genai-perf tests
|
# run genai-perf tests
|
||||||
|
|
||||||
# $1: a json file specifying genai-perf test cases
|
# $1: a json file specifying genai-perf test cases
|
||||||
local genai_perf_test_file
|
local genai_perf_test_file
|
||||||
@ -311,14 +313,14 @@ run_genai_perf_tests() {
|
|||||||
# Iterate over genai-perf tests
|
# Iterate over genai-perf tests
|
||||||
jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
|
jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
|
||||||
# get the test name, and append the GPU type back to it.
|
# get the test name, and append the GPU type back to it.
|
||||||
test_name=$(echo "$params" | jq -r '.test_name')
|
test_name=$(echo "$params" | jq -r '.test_name')
|
||||||
|
|
||||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||||
echo "Skip test case $test_name."
|
echo "Skip test case $test_name."
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# prepend the current serving engine to the test name
|
# prepend the current serving engine to the test name
|
||||||
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
|
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
|
||||||
|
|
||||||
@ -369,10 +371,10 @@ run_genai_perf_tests() {
|
|||||||
qps=$num_prompts
|
qps=$num_prompts
|
||||||
echo "now qps is $qps"
|
echo "now qps is $qps"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
new_test_name=$test_name"_qps_"$qps
|
new_test_name=$test_name"_qps_"$qps
|
||||||
backend=$CURRENT_LLM_SERVING_ENGINE
|
backend=$CURRENT_LLM_SERVING_ENGINE
|
||||||
|
|
||||||
if [[ "$backend" == *"vllm"* ]]; then
|
if [[ "$backend" == *"vllm"* ]]; then
|
||||||
backend="vllm"
|
backend="vllm"
|
||||||
fi
|
fi
|
||||||
@ -413,7 +415,7 @@ prepare_dataset() {
|
|||||||
do
|
do
|
||||||
cat sonnet.txt >> sonnet_4x.txt
|
cat sonnet.txt >> sonnet_4x.txt
|
||||||
done
|
done
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
main() {
|
main() {
|
||||||
|
|||||||
@ -33,7 +33,7 @@ check_gpus() {
|
|||||||
|
|
||||||
check_cpus() {
|
check_cpus() {
|
||||||
# check the number of CPUs and NUMA Node and GPU type.
|
# check the number of CPUs and NUMA Node and GPU type.
|
||||||
declare -g numa_count=$(python3 -c "from numa import info;numa_size = info.get_num_configured_nodes(); print(numa_size)")
|
declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
|
||||||
if [[ $numa_count -gt 0 ]]; then
|
if [[ $numa_count -gt 0 ]]; then
|
||||||
echo "NUMA found."
|
echo "NUMA found."
|
||||||
echo $numa_count
|
echo $numa_count
|
||||||
@ -126,7 +126,8 @@ kill_gpu_processes() {
|
|||||||
ps -aux
|
ps -aux
|
||||||
lsof -t -i:8000 | xargs -r kill -9
|
lsof -t -i:8000 | xargs -r kill -9
|
||||||
pgrep python3 | xargs -r kill -9
|
pgrep python3 | xargs -r kill -9
|
||||||
|
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
|
||||||
|
pgrep VLLM | xargs -r kill -9
|
||||||
|
|
||||||
# wait until GPU memory usage smaller than 1GB
|
# wait until GPU memory usage smaller than 1GB
|
||||||
if command -v nvidia-smi; then
|
if command -v nvidia-smi; then
|
||||||
@ -164,7 +165,7 @@ upload_to_buildkite() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
run_latency_tests() {
|
run_latency_tests() {
|
||||||
# run latency tests using `benchmark_latency.py`
|
# run latency tests using `vllm bench latency` command
|
||||||
# $1: a json file specifying latency test cases
|
# $1: a json file specifying latency test cases
|
||||||
|
|
||||||
local latency_test_file
|
local latency_test_file
|
||||||
@ -205,7 +206,7 @@ run_latency_tests() {
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
latency_command=" $latency_envs python3 benchmark_latency.py \
|
latency_command=" $latency_envs vllm bench latency \
|
||||||
--output-json $RESULTS_FOLDER/${test_name}.json \
|
--output-json $RESULTS_FOLDER/${test_name}.json \
|
||||||
$latency_args"
|
$latency_args"
|
||||||
|
|
||||||
@ -231,7 +232,7 @@ run_latency_tests() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
run_throughput_tests() {
|
run_throughput_tests() {
|
||||||
# run throughput tests using `benchmark_throughput.py`
|
# run throughput tests using `vllm bench throughput`
|
||||||
# $1: a json file specifying throughput test cases
|
# $1: a json file specifying throughput test cases
|
||||||
|
|
||||||
local throughput_test_file
|
local throughput_test_file
|
||||||
@ -272,7 +273,7 @@ run_throughput_tests() {
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
throughput_command=" $throughput_envs python3 benchmark_throughput.py \
|
throughput_command=" $throughput_envs vllm bench throughput \
|
||||||
--output-json $RESULTS_FOLDER/${test_name}.json \
|
--output-json $RESULTS_FOLDER/${test_name}.json \
|
||||||
$throughput_args"
|
$throughput_args"
|
||||||
|
|
||||||
@ -297,7 +298,7 @@ run_throughput_tests() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
run_serving_tests() {
|
run_serving_tests() {
|
||||||
# run serving tests using `benchmark_serving.py`
|
# run serving tests using `vllm bench serve` command
|
||||||
# $1: a json file specifying serving test cases
|
# $1: a json file specifying serving test cases
|
||||||
|
|
||||||
local serving_test_file
|
local serving_test_file
|
||||||
@ -393,7 +394,7 @@ run_serving_tests() {
|
|||||||
|
|
||||||
# pass the tensor parallel size to the client so that it can be displayed
|
# pass the tensor parallel size to the client so that it can be displayed
|
||||||
# on the benchmark dashboard
|
# on the benchmark dashboard
|
||||||
client_command="python3 benchmark_serving.py \
|
client_command="vllm bench serve \
|
||||||
--save-result \
|
--save-result \
|
||||||
--result-dir $RESULTS_FOLDER \
|
--result-dir $RESULTS_FOLDER \
|
||||||
--result-filename ${new_test_name}.json \
|
--result-filename ${new_test_name}.json \
|
||||||
@ -447,7 +448,7 @@ main() {
|
|||||||
(which jq) || (apt-get update && apt-get -y install jq)
|
(which jq) || (apt-get update && apt-get -y install jq)
|
||||||
(which lsof) || (apt-get update && apt-get install -y lsof)
|
(which lsof) || (apt-get update && apt-get install -y lsof)
|
||||||
|
|
||||||
# get the current IP address, required by benchmark_serving.py
|
# get the current IP address, required by `vllm bench serve` command
|
||||||
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
|
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
|
||||||
# turn off the reporting of the status of each request, to clean up the terminal output
|
# turn off the reporting of the status of each request, to clean up the terminal output
|
||||||
export VLLM_LOGGING_LEVEL="WARNING"
|
export VLLM_LOGGING_LEVEL="WARNING"
|
||||||
|
|||||||
209
.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
Normal file
@ -0,0 +1,209 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"max_concurrency": 60,
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"max_concurrency": 60,
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp4_sharegpt",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"max_concurrency": 60,
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_random_128_128",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"max_concurrency": 1000,
|
||||||
|
"num_prompts": 1000
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_random_128_128",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"max_concurrency": 1000,
|
||||||
|
"num_prompts": 1000
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp4_random_128_128",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"max_concurrency": 1000,
|
||||||
|
"num_prompts": 1000
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
211
.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
Normal file
@ -0,0 +1,211 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_pp1_sharegpt",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"pipeline_parallel_size": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"max_concurrency": 60,
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_pp3_sharegpt",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"pipeline_parallel_size": 3,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"max_concurrency": 60,
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2pp3_sharegpt",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"pipeline_parallel_size": 3,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"max_concurrency": 60,
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_pp1_random_128_128",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"pipeline_parallel_size": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"max_concurrency": 1000,
|
||||||
|
"num_prompts": 1000
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_pp3_random_128_128",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"pipeline_parallel_size": 3,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"max_concurrency": 1000,
|
||||||
|
"num_prompts": 1000
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2pp3_random_128_128",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"pipeline_parallel_size": 3,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"max_concurrency": 1000,
|
||||||
|
"num_prompts": 1000
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
@ -6,6 +6,7 @@
|
|||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
},
|
},
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
@ -18,6 +19,8 @@
|
|||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"disable_log_requests": "",
|
"disable_log_requests": "",
|
||||||
"enforce_eager": "",
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
"client_parameters": {
|
||||||
@ -36,6 +39,7 @@
|
|||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
},
|
},
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
@ -48,6 +52,8 @@
|
|||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"disable_log_requests": "",
|
"disable_log_requests": "",
|
||||||
"enforce_eager": "",
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
"client_parameters": {
|
||||||
@ -66,6 +72,7 @@
|
|||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
},
|
},
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
@ -78,6 +85,8 @@
|
|||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"disable_log_requests": "",
|
"disable_log_requests": "",
|
||||||
"enforce_eager": "",
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
"client_parameters": {
|
||||||
@ -96,6 +105,7 @@
|
|||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
},
|
},
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
@ -109,6 +119,8 @@
|
|||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"disable_log_requests": "",
|
"disable_log_requests": "",
|
||||||
"enforce_eager": "",
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
"client_parameters": {
|
||||||
@ -129,6 +141,7 @@
|
|||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
},
|
},
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
@ -142,6 +155,8 @@
|
|||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"disable_log_requests": "",
|
"disable_log_requests": "",
|
||||||
"enforce_eager": "",
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
"client_parameters": {
|
||||||
|
|||||||
@ -108,7 +108,6 @@ fi
|
|||||||
if [[ $commands == *" kernels/attention"* ]]; then
|
if [[ $commands == *" kernels/attention"* ]]; then
|
||||||
commands="${commands} \
|
commands="${commands} \
|
||||||
--ignore=kernels/attention/test_attention_selector.py \
|
--ignore=kernels/attention/test_attention_selector.py \
|
||||||
--ignore=kernels/attention/test_blocksparse_attention.py \
|
|
||||||
--ignore=kernels/attention/test_encoder_decoder_attn.py \
|
--ignore=kernels/attention/test_encoder_decoder_attn.py \
|
||||||
--ignore=kernels/attention/test_flash_attn.py \
|
--ignore=kernels/attention/test_flash_attn.py \
|
||||||
--ignore=kernels/attention/test_flashinfer.py \
|
--ignore=kernels/attention/test_flashinfer.py \
|
||||||
|
|||||||
@ -6,15 +6,16 @@ set -ex
|
|||||||
|
|
||||||
# allow to bind to different cores
|
# allow to bind to different cores
|
||||||
CORE_RANGE=${CORE_RANGE:-48-95}
|
CORE_RANGE=${CORE_RANGE:-48-95}
|
||||||
|
# used for TP/PP E2E test
|
||||||
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
|
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
|
||||||
NUMA_NODE=${NUMA_NODE:-1}
|
NUMA_NODE=${NUMA_NODE:-1}
|
||||||
|
|
||||||
export CMAKE_BUILD_PARALLEL_LEVEL=32
|
export CMAKE_BUILD_PARALLEL_LEVEL=32
|
||||||
|
|
||||||
# Setup cleanup
|
# Setup cleanup
|
||||||
remove_docker_container() {
|
remove_docker_container() {
|
||||||
set -e;
|
set -e;
|
||||||
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
|
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
|
||||||
}
|
}
|
||||||
trap remove_docker_container EXIT
|
trap remove_docker_container EXIT
|
||||||
remove_docker_container
|
remove_docker_container
|
||||||
@ -24,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
|
|||||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
||||||
|
|
||||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
||||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
||||||
|
|
||||||
function cpu_tests() {
|
function cpu_tests() {
|
||||||
set -e
|
set -e
|
||||||
@ -68,7 +69,7 @@ function cpu_tests() {
|
|||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
set -e
|
||||||
pytest -s -v \
|
pytest -s -v \
|
||||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
|
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
|
||||||
|
|
||||||
# Note: disable it until supports V1
|
# Note: disable it until supports V1
|
||||||
# Run AWQ test
|
# Run AWQ test
|
||||||
@ -77,24 +78,23 @@ function cpu_tests() {
|
|||||||
# VLLM_USE_V1=0 pytest -s -v \
|
# VLLM_USE_V1=0 pytest -s -v \
|
||||||
# tests/quantization/test_ipex_quant.py"
|
# tests/quantization/test_ipex_quant.py"
|
||||||
|
|
||||||
# online serving
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
|
|
||||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
|
||||||
VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
|
|
||||||
--backend vllm \
|
|
||||||
--dataset-name random \
|
|
||||||
--model facebook/opt-125m \
|
|
||||||
--num-prompts 20 \
|
|
||||||
--endpoint /v1/completions \
|
|
||||||
--tokenizer facebook/opt-125m"
|
|
||||||
|
|
||||||
# Run multi-lora tests
|
# Run multi-lora tests
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
set -e
|
||||||
pytest -s -v \
|
pytest -s -v \
|
||||||
tests/lora/test_qwen2vl.py"
|
tests/lora/test_qwen2vl.py"
|
||||||
|
|
||||||
|
# online serving
|
||||||
|
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
||||||
|
set -e
|
||||||
|
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
|
||||||
|
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
||||||
|
vllm bench serve \
|
||||||
|
--backend vllm \
|
||||||
|
--dataset-name random \
|
||||||
|
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||||
|
--num-prompts 20 \
|
||||||
|
--endpoint /v1/completions'
|
||||||
}
|
}
|
||||||
|
|
||||||
# All CPU tests are expected to finish in less than 40 mins.
|
# All CPU tests are expected to finish in less than 40 mins.
|
||||||
|
|||||||
@ -16,8 +16,7 @@ DOCKER_BUILDKIT=1 docker build . \
|
|||||||
--build-arg max_jobs=66 \
|
--build-arg max_jobs=66 \
|
||||||
--build-arg nvcc_threads=2 \
|
--build-arg nvcc_threads=2 \
|
||||||
--build-arg RUN_WHEEL_CHECK=false \
|
--build-arg RUN_WHEEL_CHECK=false \
|
||||||
--build-arg torch_cuda_arch_list="9.0+PTX" \
|
--build-arg torch_cuda_arch_list="9.0+PTX"
|
||||||
--build-arg vllm_fa_cmake_gpu_arches="90-real"
|
|
||||||
|
|
||||||
# Setup cleanup
|
# Setup cleanup
|
||||||
remove_docker_container() { docker rm -f gh200-test || true; }
|
remove_docker_container() { docker rm -f gh200-test || true; }
|
||||||
|
|||||||
166
.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
Executable file
@ -0,0 +1,166 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -xu
|
||||||
|
|
||||||
|
|
||||||
|
remove_docker_container() {
|
||||||
|
docker rm -f tpu-test || true;
|
||||||
|
docker rm -f vllm-tpu || true;
|
||||||
|
}
|
||||||
|
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
|
||||||
|
# Remove the container that might not be cleaned up in the previous run.
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
|
# Build the docker image.
|
||||||
|
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
||||||
|
|
||||||
|
# Set up cleanup.
|
||||||
|
cleanup_docker() {
|
||||||
|
# Get Docker's root directory
|
||||||
|
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||||
|
if [ -z "$docker_root" ]; then
|
||||||
|
echo "Failed to determine Docker root directory."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "Docker root directory: $docker_root"
|
||||||
|
# Check disk usage of the filesystem where Docker's root directory is located
|
||||||
|
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||||
|
# Define the threshold
|
||||||
|
threshold=70
|
||||||
|
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||||
|
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||||
|
# Remove dangling images (those that are not tagged and not used by any container)
|
||||||
|
docker image prune -f
|
||||||
|
# Remove unused volumes / force the system prune for old images as well.
|
||||||
|
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||||
|
echo "Docker images and volumes cleanup completed."
|
||||||
|
else
|
||||||
|
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
cleanup_docker
|
||||||
|
|
||||||
|
# For HF_TOKEN.
|
||||||
|
source /etc/environment
|
||||||
|
|
||||||
|
docker run --privileged --net host --shm-size=16G -it \
|
||||||
|
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
||||||
|
vllm-tpu /bin/bash -c '
|
||||||
|
set -e # Exit immediately if a command exits with a non-zero status.
|
||||||
|
set -u # Treat unset variables as an error.
|
||||||
|
|
||||||
|
echo "--- Starting script inside Docker container ---"
|
||||||
|
|
||||||
|
# Create results directory
|
||||||
|
RESULTS_DIR=$(mktemp -d)
|
||||||
|
# If mktemp fails, set -e will cause the script to exit.
|
||||||
|
echo "Results will be stored in: $RESULTS_DIR"
|
||||||
|
|
||||||
|
# Install dependencies
|
||||||
|
echo "--- Installing Python dependencies ---"
|
||||||
|
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||||
|
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||||
|
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
|
||||||
|
&& python3 -m pip install --progress-bar off hf-transfer
|
||||||
|
echo "--- Python dependencies installed ---"
|
||||||
|
export VLLM_USE_V1=1
|
||||||
|
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||||
|
export VLLM_XLA_CACHE_PATH=
|
||||||
|
echo "Using VLLM V1"
|
||||||
|
|
||||||
|
echo "--- Hardware Information ---"
|
||||||
|
# tpu-info
|
||||||
|
echo "--- Starting Tests ---"
|
||||||
|
set +e
|
||||||
|
overall_script_exit_code=0
|
||||||
|
|
||||||
|
# --- Test Definitions ---
|
||||||
|
# If a test fails, this function will print logs and will not cause the main script to exit.
|
||||||
|
run_test() {
|
||||||
|
local test_num=$1
|
||||||
|
local test_name=$2
|
||||||
|
local test_command=$3
|
||||||
|
local log_file="$RESULTS_DIR/test_${test_num}.log"
|
||||||
|
local actual_exit_code
|
||||||
|
|
||||||
|
echo "--- TEST_$test_num: Running $test_name ---"
|
||||||
|
|
||||||
|
# Execute the test command.
|
||||||
|
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
|
||||||
|
actual_exit_code=$?
|
||||||
|
|
||||||
|
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
|
||||||
|
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
|
||||||
|
|
||||||
|
if [ "$actual_exit_code" -ne 0 ]; then
|
||||||
|
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
|
||||||
|
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
|
||||||
|
if [ -f "$log_file" ]; then
|
||||||
|
cat "$log_file" >&2
|
||||||
|
else
|
||||||
|
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
|
||||||
|
fi
|
||||||
|
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
|
||||||
|
return "$actual_exit_code" # Return the failure code
|
||||||
|
else
|
||||||
|
echo "TEST_$test_num ($test_name) PASSED."
|
||||||
|
return 0 # Return success
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# Helper function to call run_test and update the overall script exit code
|
||||||
|
run_and_track_test() {
|
||||||
|
local test_num_arg="$1"
|
||||||
|
local test_name_arg="$2"
|
||||||
|
local test_command_arg="$3"
|
||||||
|
|
||||||
|
# Run the test
|
||||||
|
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
|
||||||
|
local test_specific_exit_code=$?
|
||||||
|
|
||||||
|
# If the test failed, set the overall script exit code to 1
|
||||||
|
if [ "$test_specific_exit_code" -ne 0 ]; then
|
||||||
|
# No need for extra echo here, run_test already logged the failure.
|
||||||
|
overall_script_exit_code=1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# --- Actual Test Execution ---
|
||||||
|
run_and_track_test 1 "test_struct_output_generate.py" \
|
||||||
|
"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
|
||||||
|
run_and_track_test 2 "test_moe_pallas.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
|
||||||
|
run_and_track_test 3 "test_lora.py" \
|
||||||
|
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
|
||||||
|
run_and_track_test 4 "test_tpu_qkv_linear.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
|
||||||
|
run_and_track_test 5 "test_spmd_model_weight_loading.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
|
||||||
|
run_and_track_test 6 "test_kv_cache_update_kernel.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
|
||||||
|
|
||||||
|
# After all tests have been attempted, exit with the overall status.
|
||||||
|
if [ "$overall_script_exit_code" -ne 0 ]; then
|
||||||
|
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
|
||||||
|
else
|
||||||
|
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
|
||||||
|
fi
|
||||||
|
exit "$overall_script_exit_code"
|
||||||
|
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
|
||||||
|
|
||||||
|
# Capture the exit code of the docker run command
|
||||||
|
DOCKER_RUN_EXIT_CODE=$?
|
||||||
|
|
||||||
|
# The trap will run for cleanup.
|
||||||
|
# Exit the main script with the Docker run command's exit code.
|
||||||
|
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
|
||||||
|
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
|
||||||
|
exit "$DOCKER_RUN_EXIT_CODE"
|
||||||
|
else
|
||||||
|
echo "Docker run command completed successfully."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
# TODO: This test fails because it uses RANDOM_SEED sampling
|
||||||
|
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||||
@ -62,7 +62,8 @@ echo "Results will be stored in: $RESULTS_DIR"
|
|||||||
echo "--- Installing Python dependencies ---"
|
echo "--- Installing Python dependencies ---"
|
||||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||||
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
|
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
|
||||||
|
&& python3 -m pip install --progress-bar off hf-transfer
|
||||||
echo "--- Python dependencies installed ---"
|
echo "--- Python dependencies installed ---"
|
||||||
export VLLM_USE_V1=1
|
export VLLM_USE_V1=1
|
||||||
export VLLM_XLA_CHECK_RECOMPILATION=1
|
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||||
@ -134,7 +135,7 @@ run_and_track_test 1 "test_compilation.py" \
|
|||||||
run_and_track_test 2 "test_basic.py" \
|
run_and_track_test 2 "test_basic.py" \
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
|
||||||
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
|
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
|
"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
|
||||||
run_and_track_test 4 "test_quantization_accuracy.py" \
|
run_and_track_test 4 "test_quantization_accuracy.py" \
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
|
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
|
||||||
run_and_track_test 5 "examples/offline_inference/tpu.py" \
|
run_and_track_test 5 "examples/offline_inference/tpu.py" \
|
||||||
@ -149,18 +150,6 @@ run_and_track_test 9 "test_multimodal.py" \
|
|||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
|
||||||
run_and_track_test 10 "test_pallas.py" \
|
run_and_track_test 10 "test_pallas.py" \
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
|
||||||
run_and_track_test 11 "test_struct_output_generate.py" \
|
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
|
|
||||||
run_and_track_test 12 "test_moe_pallas.py" \
|
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
|
|
||||||
run_and_track_test 13 "test_lora.py" \
|
|
||||||
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
|
|
||||||
run_and_track_test 14 "test_tpu_qkv_linear.py" \
|
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
|
|
||||||
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
|
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
|
|
||||||
run_and_track_test 16 "test_kv_cache_update_kernel.py" \
|
|
||||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
|
|
||||||
|
|
||||||
# After all tests have been attempted, exit with the overall status.
|
# After all tests have been attempted, exit with the overall status.
|
||||||
if [ "$overall_script_exit_code" -ne 0 ]; then
|
if [ "$overall_script_exit_code" -ne 0 ]; then
|
||||||
|
|||||||
@ -31,4 +31,13 @@ docker run \
|
|||||||
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||||
cd tests
|
cd tests
|
||||||
pytest -v -s v1/core
|
pytest -v -s v1/core
|
||||||
|
pytest -v -s v1/engine
|
||||||
|
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
||||||
|
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
||||||
|
pytest -v -s v1/structured_output
|
||||||
|
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py
|
||||||
|
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py
|
||||||
|
pytest -v -s v1/test_serial_utils.py
|
||||||
|
pytest -v -s v1/test_utils.py
|
||||||
|
pytest -v -s v1/test_metrics_reader.py
|
||||||
'
|
'
|
||||||
|
|||||||
@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
|
|||||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||||
|
|
||||||
# run python-based benchmarks and upload the result to buildkite
|
# run python-based benchmarks and upload the result to buildkite
|
||||||
python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
|
vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
|
||||||
bench_latency_exit_code=$?
|
bench_latency_exit_code=$?
|
||||||
|
|
||||||
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
|
vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
|
||||||
bench_throughput_exit_code=$?
|
bench_throughput_exit_code=$?
|
||||||
|
|
||||||
# run server-based benchmarks and upload the result to buildkite
|
# run server-based benchmarks and upload the result to buildkite
|
||||||
@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
|
|||||||
|
|
||||||
# wait for server to start, timeout after 600 seconds
|
# wait for server to start, timeout after 600 seconds
|
||||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||||
python3 benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--dataset-name sharegpt \
|
--dataset-name sharegpt \
|
||||||
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
|
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||||
|
|||||||
@ -77,7 +77,7 @@ done
|
|||||||
echo "run benchmark test..."
|
echo "run benchmark test..."
|
||||||
echo "logging to $BM_LOG"
|
echo "logging to $BM_LOG"
|
||||||
echo
|
echo
|
||||||
python benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model $MODEL \
|
--model $MODEL \
|
||||||
--dataset-name sonnet \
|
--dataset-name sonnet \
|
||||||
|
|||||||
@ -82,7 +82,7 @@ steps:
|
|||||||
- bash standalone_tests/python_only_compile.sh
|
- bash standalone_tests/python_only_compile.sh
|
||||||
|
|
||||||
- label: Basic Correctness Test # 30min
|
- label: Basic Correctness Test # 30min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
fast_check: true
|
fast_check: true
|
||||||
torch_nightly: true
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@ -99,7 +99,7 @@ steps:
|
|||||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||||
|
|
||||||
- label: Chunked Prefill Test
|
- label: Chunked Prefill Test
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/basic_correctness/test_chunked_prefill
|
- tests/basic_correctness/test_chunked_prefill
|
||||||
@ -108,7 +108,7 @@ steps:
|
|||||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
|
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||||
|
|
||||||
- label: Core Test # 10min
|
- label: Core Test # 10min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
fast_check: true
|
fast_check: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/core
|
- vllm/core
|
||||||
@ -128,11 +128,10 @@ steps:
|
|||||||
- tests/entrypoints/offline_mode
|
- tests/entrypoints/offline_mode
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||||
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
|
||||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
|
|
||||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||||
|
|
||||||
- label: Entrypoints Test (API Server) # 40min
|
- label: Entrypoints Test (API Server) # 40min
|
||||||
@ -159,13 +158,14 @@ steps:
|
|||||||
- tests/distributed/test_utils
|
- tests/distributed/test_utils
|
||||||
- tests/distributed/test_pynccl
|
- tests/distributed/test_pynccl
|
||||||
- tests/distributed/test_events
|
- tests/distributed/test_events
|
||||||
- tests/spec_decode/e2e/test_integration_dist_tp4
|
|
||||||
- tests/compile/test_basic_correctness
|
- tests/compile/test_basic_correctness
|
||||||
- examples/offline_inference/rlhf.py
|
- examples/offline_inference/rlhf.py
|
||||||
- examples/offline_inference/rlhf_colocate.py
|
- examples/offline_inference/rlhf_colocate.py
|
||||||
- tests/examples/offline_inference/data_parallel.py
|
- tests/examples/offline_inference/data_parallel.py
|
||||||
- tests/v1/test_async_llm_dp.py
|
- tests/v1/test_async_llm_dp.py
|
||||||
- tests/v1/test_external_lb_dp.py
|
- tests/v1/test_external_lb_dp.py
|
||||||
|
- tests/v1/test_internal_lb_dp.py
|
||||||
|
- tests/v1/test_hybrid_lb_dp.py
|
||||||
- tests/v1/engine/test_engine_core_client.py
|
- tests/v1/engine/test_engine_core_client.py
|
||||||
commands:
|
commands:
|
||||||
# test with tp=2 and external_dp=2
|
# test with tp=2 and external_dp=2
|
||||||
@ -177,12 +177,13 @@ steps:
|
|||||||
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
||||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
|
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
|
||||||
|
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
|
||||||
|
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
|
||||||
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
||||||
- pytest -v -s distributed/test_utils.py
|
- pytest -v -s distributed/test_utils.py
|
||||||
- pytest -v -s compile/test_basic_correctness.py
|
- pytest -v -s compile/test_basic_correctness.py
|
||||||
- pytest -v -s distributed/test_pynccl.py
|
- pytest -v -s distributed/test_pynccl.py
|
||||||
- pytest -v -s distributed/test_events.py
|
- pytest -v -s distributed/test_events.py
|
||||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
|
||||||
# TODO: create a dedicated test section for multi-GPU example tests
|
# TODO: create a dedicated test section for multi-GPU example tests
|
||||||
# when we have multiple distributed example tests
|
# when we have multiple distributed example tests
|
||||||
- pushd ../examples/offline_inference
|
- pushd ../examples/offline_inference
|
||||||
@ -208,7 +209,7 @@ steps:
|
|||||||
- pytest -v -s distributed/test_eplb_execute.py
|
- pytest -v -s distributed/test_eplb_execute.py
|
||||||
|
|
||||||
- label: Metrics, Tracing Test # 10min
|
- label: Metrics, Tracing Test # 10min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
@ -266,6 +267,7 @@ steps:
|
|||||||
- pytest -v -s v1/structured_output
|
- pytest -v -s v1/structured_output
|
||||||
- pytest -v -s v1/spec_decode
|
- pytest -v -s v1/spec_decode
|
||||||
- pytest -v -s v1/kv_connector/unit
|
- pytest -v -s v1/kv_connector/unit
|
||||||
|
- pytest -v -s v1/metrics
|
||||||
- pytest -v -s v1/test_serial_utils.py
|
- pytest -v -s v1/test_serial_utils.py
|
||||||
- pytest -v -s v1/test_utils.py
|
- pytest -v -s v1/test_utils.py
|
||||||
- pytest -v -s v1/test_oracle.py
|
- pytest -v -s v1/test_oracle.py
|
||||||
@ -274,7 +276,7 @@ steps:
|
|||||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||||
- pytest -v -s v1/e2e
|
- pytest -v -s v1/e2e
|
||||||
# Integration test for streaming correctness (requires special branch).
|
# Integration test for streaming correctness (requires special branch).
|
||||||
- pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
|
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||||
|
|
||||||
- label: Examples Test # 25min
|
- label: Examples Test # 25min
|
||||||
@ -303,7 +305,7 @@ steps:
|
|||||||
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
|
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
|
||||||
|
|
||||||
- label: Prefix Caching Test # 9min
|
- label: Prefix Caching Test # 9min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/prefix_caching
|
- tests/prefix_caching
|
||||||
@ -330,19 +332,8 @@ steps:
|
|||||||
- pytest -v -s samplers
|
- pytest -v -s samplers
|
||||||
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
||||||
|
|
||||||
- label: Speculative decoding tests # 40min
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/spec_decode
|
|
||||||
- tests/spec_decode
|
|
||||||
- vllm/model_executor/models/eagle.py
|
|
||||||
commands:
|
|
||||||
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
|
|
||||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
|
|
||||||
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
|
|
||||||
|
|
||||||
- label: LoRA Test %N # 15min each
|
- label: LoRA Test %N # 15min each
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/lora
|
- vllm/lora
|
||||||
- tests/lora
|
- tests/lora
|
||||||
@ -362,9 +353,10 @@ steps:
|
|||||||
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
||||||
- pytest -v -s compile/test_sequence_parallelism.py
|
- pytest -v -s compile/test_sequence_parallelism.py
|
||||||
- pytest -v -s compile/test_async_tp.py
|
- pytest -v -s compile/test_async_tp.py
|
||||||
|
- pytest -v -s compile/test_fusion_all_reduce.py
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Smoke Test # 9min
|
- label: PyTorch Fullgraph Smoke Test # 9min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
torch_nightly: true
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
@ -377,7 +369,7 @@ steps:
|
|||||||
- pytest -v -s compile/piecewise/test_full_cudagraph.py
|
- pytest -v -s compile/piecewise/test_full_cudagraph.py
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Test # 18min
|
- label: PyTorch Fullgraph Test # 18min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
torch_nightly: true
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
@ -386,7 +378,7 @@ steps:
|
|||||||
- pytest -v -s compile/test_full_graph.py
|
- pytest -v -s compile/test_full_graph.py
|
||||||
|
|
||||||
- label: Kernels Core Operation Test
|
- label: Kernels Core Operation Test
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- tests/kernels/core
|
- tests/kernels/core
|
||||||
@ -394,7 +386,7 @@ steps:
|
|||||||
- pytest -v -s kernels/core
|
- pytest -v -s kernels/core
|
||||||
|
|
||||||
- label: Kernels Attention Test %N
|
- label: Kernels Attention Test %N
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/attention/
|
- csrc/attention/
|
||||||
- vllm/attention
|
- vllm/attention
|
||||||
@ -405,23 +397,24 @@ steps:
|
|||||||
parallelism: 2
|
parallelism: 2
|
||||||
|
|
||||||
- label: Kernels Quantization Test %N
|
- label: Kernels Quantization Test %N
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/quantization/
|
- csrc/quantization/
|
||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
- tests/kernels/quantization
|
- tests/kernels/quantization
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||||
parallelism: 2
|
parallelism: 2
|
||||||
|
|
||||||
- label: Kernels MoE Test
|
- label: Kernels MoE Test %N
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/moe/
|
- csrc/moe/
|
||||||
- tests/kernels/moe
|
- tests/kernels/moe
|
||||||
- vllm/model_executor/layers/fused_moe/
|
- vllm/model_executor/layers/fused_moe/
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/moe
|
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||||
|
parallelism: 2
|
||||||
|
|
||||||
- label: Kernels Mamba Test
|
- label: Kernels Mamba Test
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
@ -445,8 +438,7 @@ steps:
|
|||||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||||
|
|
||||||
- label: Model Executor Test
|
- label: Model Executor Test
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
soft_fail: true
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor
|
- vllm/model_executor
|
||||||
- tests/model_executor
|
- tests/model_executor
|
||||||
@ -456,7 +448,7 @@ steps:
|
|||||||
- pytest -v -s model_executor
|
- pytest -v -s model_executor
|
||||||
|
|
||||||
- label: Benchmarks # 9min
|
- label: Benchmarks # 9min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
working_dir: "/vllm-workspace/.buildkite"
|
working_dir: "/vllm-workspace/.buildkite"
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- benchmarks/
|
- benchmarks/
|
||||||
@ -464,7 +456,7 @@ steps:
|
|||||||
- bash scripts/run-benchmarks.sh
|
- bash scripts/run-benchmarks.sh
|
||||||
|
|
||||||
- label: Benchmarks CLI Test # 10min
|
- label: Benchmarks CLI Test # 10min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/benchmarks/
|
- tests/benchmarks/
|
||||||
@ -623,7 +615,7 @@ steps:
|
|||||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
||||||
|
|
||||||
- label: Quantized Models Test
|
- label: Quantized Models Test
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
- tests/models/quantization
|
- tests/models/quantization
|
||||||
@ -632,7 +624,7 @@ steps:
|
|||||||
|
|
||||||
# This test is used only in PR development phase to test individual models and should never run on main
|
# This test is used only in PR development phase to test individual models and should never run on main
|
||||||
- label: Custom Models Test
|
- label: Custom Models Test
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
optional: true
|
optional: true
|
||||||
commands:
|
commands:
|
||||||
- echo 'Testing custom models...'
|
- echo 'Testing custom models...'
|
||||||
@ -652,11 +644,22 @@ steps:
|
|||||||
- python3 examples/offline_inference/audio_language.py --model-type whisper
|
- python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||||
|
|
||||||
|
- label: Blackwell Test
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
gpu: b200
|
||||||
|
optional: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/
|
||||||
|
- vllm/
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
- python3 examples/offline_inference/basic/chat.py
|
||||||
|
|
||||||
##### 1 GPU test #####
|
##### 1 GPU test #####
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
|
|
||||||
- label: Distributed Comm Ops Test # 7min
|
- label: Distributed Comm Ops Test # 7min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@ -726,10 +729,10 @@ steps:
|
|||||||
- pytest -v -s distributed/test_sequence_parallel.py
|
- pytest -v -s distributed/test_sequence_parallel.py
|
||||||
# this test fails consistently.
|
# this test fails consistently.
|
||||||
# TODO: investigate and fix
|
# TODO: investigate and fix
|
||||||
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
|
||||||
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||||
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
|
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
|
||||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||||
|
- pytest -v -s models/multimodal/generation/test_maverick.py
|
||||||
|
|
||||||
- label: Plugin Tests (2 GPUs) # 40min
|
- label: Plugin Tests (2 GPUs) # 40min
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
@ -753,7 +756,7 @@ steps:
|
|||||||
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
||||||
|
|
||||||
- label: Multi-step Tests (4 GPUs) # 36min
|
- label: Multi-step Tests (4 GPUs) # 36min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@ -774,7 +777,7 @@ steps:
|
|||||||
- pytest -v -s multi_step/test_correctness_llm.py
|
- pytest -v -s multi_step/test_correctness_llm.py
|
||||||
|
|
||||||
- label: Pipeline Parallelism Test # 45min
|
- label: Pipeline Parallelism Test # 45min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@ -788,7 +791,7 @@ steps:
|
|||||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
|
|
||||||
- label: LoRA TP Test (Distributed)
|
- label: LoRA TP Test (Distributed)
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/lora
|
- vllm/lora
|
||||||
|
|||||||
28
.github/CODEOWNERS
vendored
@ -10,7 +10,6 @@
|
|||||||
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
|
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
|
||||||
/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
|
|
||||||
/vllm/multimodal @DarkLight1337 @ywang96
|
/vllm/multimodal @DarkLight1337 @ywang96
|
||||||
/vllm/vllm_flash_attn @LucasWilkinson
|
/vllm/vllm_flash_attn @LucasWilkinson
|
||||||
/vllm/lora @jeejeelee
|
/vllm/lora @jeejeelee
|
||||||
@ -35,15 +34,12 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
|||||||
/tests/distributed/test_pipeline_parallel.py @youkaichao
|
/tests/distributed/test_pipeline_parallel.py @youkaichao
|
||||||
/tests/distributed/test_same_node.py @youkaichao
|
/tests/distributed/test_same_node.py @youkaichao
|
||||||
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
|
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
|
||||||
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
|
|
||||||
/tests/kernels @tlrmchlsmth @WoosukKwon
|
/tests/kernels @tlrmchlsmth @WoosukKwon
|
||||||
/tests/model_executor/test_guided_processors.py @mgoin @russellb
|
|
||||||
/tests/models @DarkLight1337 @ywang96
|
/tests/models @DarkLight1337 @ywang96
|
||||||
/tests/multi_step @alexm-redhat @comaniac
|
/tests/multi_step @alexm-redhat @comaniac
|
||||||
/tests/multimodal @DarkLight1337 @ywang96
|
/tests/multimodal @DarkLight1337 @ywang96
|
||||||
/tests/prefix_caching @comaniac @KuntaiDu
|
/tests/prefix_caching @comaniac @KuntaiDu
|
||||||
/tests/quantization @mgoin @robertgshaw2-redhat
|
/tests/quantization @mgoin @robertgshaw2-redhat
|
||||||
/tests/spec_decode @njhill @LiuXiaoxuanPKU
|
|
||||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
||||||
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
||||||
@ -53,3 +49,27 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
|||||||
# Docs
|
# Docs
|
||||||
/docs @hmellor
|
/docs @hmellor
|
||||||
mkdocs.yaml @hmellor
|
mkdocs.yaml @hmellor
|
||||||
|
|
||||||
|
# CPU
|
||||||
|
/vllm/v1/worker/^cpu @bigPYJ1151
|
||||||
|
/csrc/cpu @bigPYJ1151
|
||||||
|
/vllm/platforms/cpu.py @bigPYJ1151
|
||||||
|
/cmake/cpu_extension.cmake @bigPYJ1151
|
||||||
|
/docker/Dockerfile.cpu @bigPYJ1151
|
||||||
|
|
||||||
|
# Intel GPU
|
||||||
|
/vllm/v1/worker/^xpu @jikunshang
|
||||||
|
/vllm/platforms/xpu.py @jikunshang
|
||||||
|
/docker/Dockerfile.xpu @jikunshang
|
||||||
|
|
||||||
|
# Qwen-specific files
|
||||||
|
/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
|
||||||
|
/vllm/model_executor/models/qwen* @sighingnow
|
||||||
|
|
||||||
|
# Mistral-specific files
|
||||||
|
/vllm/model_executor/models/mistral*.py @patrickvonplaten
|
||||||
|
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
|
||||||
|
/vllm/model_executor/models/voxtral*.py @patrickvonplaten
|
||||||
|
/vllm/model_executor/models/pixtral*.py @patrickvonplaten
|
||||||
|
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
|
||||||
|
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
|
||||||
|
|||||||
2
.github/ISSUE_TEMPLATE/750-RFC.yml
vendored
@ -46,7 +46,7 @@ body:
|
|||||||
- type: markdown
|
- type: markdown
|
||||||
attributes:
|
attributes:
|
||||||
value: >
|
value: >
|
||||||
Thanks for contributing 🎉!
|
Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time. While most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
|
||||||
- type: checkboxes
|
- type: checkboxes
|
||||||
id: askllm
|
id: askllm
|
||||||
attributes:
|
attributes:
|
||||||
|
|||||||
4
.github/PULL_REQUEST_TEMPLATE.md
vendored
@ -1,4 +1,5 @@
|
|||||||
## Essential Elements of an Effective PR Description Checklist
|
# Essential Elements of an Effective PR Description Checklist
|
||||||
|
|
||||||
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
|
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
|
||||||
- [ ] The test plan, such as providing test command.
|
- [ ] The test plan, such as providing test command.
|
||||||
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
|
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
|
||||||
@ -14,5 +15,4 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE B
|
|||||||
|
|
||||||
## (Optional) Documentation Update
|
## (Optional) Documentation Update
|
||||||
|
|
||||||
<!--- pyml disable-next-line no-emphasis-as-heading -->
|
|
||||||
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
|
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
|
||||||
|
|||||||
6
.github/mergify.yml
vendored
@ -149,9 +149,6 @@ pull_request_rules:
|
|||||||
- files=examples/offline_inference/structured_outputs.py
|
- files=examples/offline_inference/structured_outputs.py
|
||||||
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
|
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
|
||||||
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
|
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
|
||||||
- files~=^vllm/model_executor/guided_decoding/
|
|
||||||
- files=tests/model_executor/test_guided_processors.py
|
|
||||||
- files=tests/entrypoints/llm/test_guided_generate.py
|
|
||||||
- files~=^tests/v1/structured_output/
|
- files~=^tests/v1/structured_output/
|
||||||
- files=tests/v1/entrypoints/llm/test_guided_generate.py
|
- files=tests/v1/entrypoints/llm/test_guided_generate.py
|
||||||
- files~=^vllm/v1/structured_output/
|
- files~=^vllm/v1/structured_output/
|
||||||
@ -164,10 +161,7 @@ pull_request_rules:
|
|||||||
description: Automatically apply speculative-decoding label
|
description: Automatically apply speculative-decoding label
|
||||||
conditions:
|
conditions:
|
||||||
- or:
|
- or:
|
||||||
- files~=^vllm/spec_decode/
|
|
||||||
- files~=^vllm/v1/spec_decode/
|
- files~=^vllm/v1/spec_decode/
|
||||||
- files=vllm/model_executor/layers/spec_decode_base_sampler.py
|
|
||||||
- files~=^tests/spec_decode/
|
|
||||||
- files~=^tests/v1/spec_decode/
|
- files~=^tests/v1/spec_decode/
|
||||||
- files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py
|
- files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py
|
||||||
- files~=^vllm/model_executor/models/.*eagle.*\.py
|
- files~=^vllm/model_executor/models/.*eagle.*\.py
|
||||||
|
|||||||
4
.github/workflows/lint-and-deploy.yaml
vendored
@ -2,6 +2,10 @@ name: Lint and Deploy Charts
|
|||||||
|
|
||||||
on: pull_request
|
on: pull_request
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.ref }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
permissions:
|
permissions:
|
||||||
contents: read
|
contents: read
|
||||||
|
|
||||||
|
|||||||
17
.github/workflows/matchers/markdownlint.json
vendored
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
{
|
||||||
|
"problemMatcher": [
|
||||||
|
{
|
||||||
|
"owner": "markdownlint",
|
||||||
|
"pattern": [
|
||||||
|
{
|
||||||
|
"regexp": "^([^:]*):(\\d+):?(\\d+)?\\s([\\w-\\/]*)\\s(.*)$",
|
||||||
|
"file": 1,
|
||||||
|
"line": 2,
|
||||||
|
"column": 3,
|
||||||
|
"code": 4,
|
||||||
|
"message": 5
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
5
.github/workflows/pre-commit.yml
vendored
@ -5,6 +5,10 @@ on:
|
|||||||
push:
|
push:
|
||||||
branches: [main]
|
branches: [main]
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.ref }}
|
||||||
|
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
|
||||||
|
|
||||||
permissions:
|
permissions:
|
||||||
contents: read
|
contents: read
|
||||||
|
|
||||||
@ -17,6 +21,7 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
python-version: "3.12"
|
python-version: "3.12"
|
||||||
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
|
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
|
||||||
|
- run: echo "::add-matcher::.github/workflows/matchers/markdownlint.json"
|
||||||
- run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
|
- run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
|
||||||
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
|
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
|
||||||
with:
|
with:
|
||||||
|
|||||||
1
.github/workflows/scripts/build.sh
vendored
@ -15,7 +15,6 @@ $python_executable -m pip install -r requirements/build.txt -r requirements/cuda
|
|||||||
export MAX_JOBS=1
|
export MAX_JOBS=1
|
||||||
# Make sure release wheels are built for the following architectures
|
# Make sure release wheels are built for the following architectures
|
||||||
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
|
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
|
||||||
export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
|
|
||||||
|
|
||||||
bash tools/check_repo.sh
|
bash tools/check_repo.sh
|
||||||
|
|
||||||
|
|||||||
13
.markdownlint.yaml
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
MD007:
|
||||||
|
indent: 4
|
||||||
|
MD013: false
|
||||||
|
MD024:
|
||||||
|
siblings_only: true
|
||||||
|
MD033: false
|
||||||
|
MD042: false
|
||||||
|
MD045: false
|
||||||
|
MD046: false
|
||||||
|
MD051: false
|
||||||
|
MD052: false
|
||||||
|
MD053: false
|
||||||
|
MD059: false
|
||||||
@ -35,12 +35,12 @@ repos:
|
|||||||
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
|
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
|
||||||
types_or: [c++, cuda]
|
types_or: [c++, cuda]
|
||||||
args: [--style=file, --verbose]
|
args: [--style=file, --verbose]
|
||||||
- repo: https://github.com/jackdewinter/pymarkdown
|
- repo: https://github.com/igorshubovych/markdownlint-cli
|
||||||
rev: v0.9.29
|
rev: v0.45.0
|
||||||
hooks:
|
hooks:
|
||||||
- id: pymarkdown
|
- id: markdownlint
|
||||||
exclude: '.*\.inc\.md'
|
exclude: '.*\.inc\.md'
|
||||||
args: [fix]
|
stages: [manual] # Only run in CI
|
||||||
- repo: https://github.com/rhysd/actionlint
|
- repo: https://github.com/rhysd/actionlint
|
||||||
rev: v1.7.7
|
rev: v1.7.7
|
||||||
hooks:
|
hooks:
|
||||||
|
|||||||
@ -7,6 +7,9 @@ build:
|
|||||||
os: ubuntu-22.04
|
os: ubuntu-22.04
|
||||||
tools:
|
tools:
|
||||||
python: "3.12"
|
python: "3.12"
|
||||||
|
jobs:
|
||||||
|
post_checkout:
|
||||||
|
- git fetch --unshallow || true
|
||||||
|
|
||||||
mkdocs:
|
mkdocs:
|
||||||
configuration: mkdocs.yaml
|
configuration: mkdocs.yaml
|
||||||
|
|||||||
@ -296,7 +296,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
|
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
|
||||||
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
|
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
|
||||||
"csrc/cutlass_extensions/common.cpp"
|
"csrc/cutlass_extensions/common.cpp"
|
||||||
"csrc/attention/mla/cutlass_mla_entry.cu")
|
"csrc/attention/mla/cutlass_mla_entry.cu"
|
||||||
|
"csrc/quantization/fp8/per_token_group_quant.cu")
|
||||||
|
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${VLLM_EXT_SRC}"
|
SRCS "${VLLM_EXT_SRC}"
|
||||||
@ -577,7 +578,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
# if it's possible to compile MoE kernels that use its output.
|
# if it's possible to compile MoE kernels that use its output.
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
||||||
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu")
|
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
||||||
@ -595,6 +596,26 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||||
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||||
|
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
|
||||||
|
set_gencode_flags_for_srcs(
|
||||||
|
SRCS "${SRCS}"
|
||||||
|
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
||||||
|
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||||
|
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
|
||||||
|
message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
|
||||||
|
else()
|
||||||
|
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||||
|
message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
|
||||||
|
"not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
|
||||||
|
"if you intend on running FP8 quantized MoE models on Blackwell.")
|
||||||
|
else()
|
||||||
|
message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
|
||||||
|
"in CUDA target architectures.")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
# moe_data.cu is used by all CUTLASS MoE kernels.
|
# moe_data.cu is used by all CUTLASS MoE kernels.
|
||||||
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
|
||||||
@ -614,7 +635,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
"in CUDA target architectures.")
|
"in CUDA target architectures.")
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||||
set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
|
set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
|
||||||
@ -747,6 +768,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
|
list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
|
set(MOE_PERMUTE_SRC
|
||||||
|
"csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
|
||||||
|
"csrc/moe/moe_permute_unpermute_op.cu")
|
||||||
|
|
||||||
|
list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
|
||||||
|
endif()
|
||||||
|
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${VLLM_MOE_EXT_SRC}"
|
SRCS "${VLLM_MOE_EXT_SRC}"
|
||||||
CUDA_ARCHS "${CUDA_ARCHS}")
|
CUDA_ARCHS "${CUDA_ARCHS}")
|
||||||
@ -815,17 +844,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|
||||||
set(MOE_PERMUTE_SRC
|
|
||||||
"csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
|
|
||||||
"csrc/moe/moe_permute_unpermute_op.cu")
|
|
||||||
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${MARLIN_PERMUTE_SRC}"
|
|
||||||
CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
|
|
||||||
|
|
||||||
list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
|
|
||||||
endif()
|
|
||||||
message(STATUS "Enabling moe extension.")
|
message(STATUS "Enabling moe extension.")
|
||||||
define_gpu_extension_target(
|
define_gpu_extension_target(
|
||||||
_moe_C
|
_moe_C
|
||||||
|
|||||||
@ -1,3 +1,4 @@
|
|||||||
|
<!-- markdownlint-disable MD001 MD041 -->
|
||||||
<p align="center">
|
<p align="center">
|
||||||
<picture>
|
<picture>
|
||||||
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-dark.png">
|
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-dark.png">
|
||||||
@ -16,6 +17,7 @@ Easy, fast, and cheap LLM serving for everyone
|
|||||||
---
|
---
|
||||||
|
|
||||||
*Latest News* 🔥
|
*Latest News* 🔥
|
||||||
|
|
||||||
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
|
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
|
||||||
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
|
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
|
||||||
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
|
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
|
||||||
@ -46,6 +48,7 @@ Easy, fast, and cheap LLM serving for everyone
|
|||||||
</details>
|
</details>
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## About
|
## About
|
||||||
|
|
||||||
vLLM is a fast and easy-to-use library for LLM inference and serving.
|
vLLM is a fast and easy-to-use library for LLM inference and serving.
|
||||||
@ -75,6 +78,7 @@ vLLM is flexible and easy to use with:
|
|||||||
- Multi-LoRA support
|
- Multi-LoRA support
|
||||||
|
|
||||||
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
|
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
|
||||||
|
|
||||||
- Transformer-like LLMs (e.g., Llama)
|
- Transformer-like LLMs (e.g., Llama)
|
||||||
- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
|
- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
|
||||||
- Embedding Models (e.g., E5-Mistral)
|
- Embedding Models (e.g., E5-Mistral)
|
||||||
@ -91,6 +95,7 @@ pip install vllm
|
|||||||
```
|
```
|
||||||
|
|
||||||
Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
|
Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
|
||||||
|
|
||||||
- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html)
|
- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html)
|
||||||
- [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
|
- [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
|
||||||
- [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)
|
- [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)
|
||||||
@ -107,6 +112,7 @@ vLLM is a community project. Our compute resources for development and testing a
|
|||||||
<!-- Note: Please sort them in alphabetical order. -->
|
<!-- Note: Please sort them in alphabetical order. -->
|
||||||
<!-- Note: Please keep these consistent with docs/community/sponsors.md -->
|
<!-- Note: Please keep these consistent with docs/community/sponsors.md -->
|
||||||
Cash Donations:
|
Cash Donations:
|
||||||
|
|
||||||
- a16z
|
- a16z
|
||||||
- Dropbox
|
- Dropbox
|
||||||
- Sequoia Capital
|
- Sequoia Capital
|
||||||
@ -114,6 +120,7 @@ Cash Donations:
|
|||||||
- ZhenFund
|
- ZhenFund
|
||||||
|
|
||||||
Compute Resources:
|
Compute Resources:
|
||||||
|
|
||||||
- AMD
|
- AMD
|
||||||
- Anyscale
|
- Anyscale
|
||||||
- AWS
|
- AWS
|
||||||
|
|||||||
36
RELEASE.md
@ -52,3 +52,39 @@ After branch cut, we approach finalizing the release branch with clear criteria
|
|||||||
* Release branch specific changes (e.g. change version identifiers or CI fixes)
|
* Release branch specific changes (e.g. change version identifiers or CI fixes)
|
||||||
|
|
||||||
Please note: **No feature work allowed for cherry picks**. All PRs that are considered for cherry-picks need to be merged on trunk; the only exceptions are Release branch specific changes.
|
Please note: **No feature work allowed for cherry picks**. All PRs that are considered for cherry-picks need to be merged on trunk; the only exceptions are Release branch specific changes.
|
||||||
|
|
||||||
|
## Manual validations
|
||||||
|
|
||||||
|
### E2E Performance Validation
|
||||||
|
|
||||||
|
Before each release, we perform end-to-end performance validation to ensure no regressions are introduced. This validation uses the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) on PyTorch CI.
|
||||||
|
|
||||||
|
**Current Coverage:**
|
||||||
|
|
||||||
|
* Models: Llama3, Llama4, and Mixtral
|
||||||
|
* Hardware: NVIDIA H100 and AMD MI300x
|
||||||
|
* _Note: Coverage may change based on new model releases and hardware availability_
|
||||||
|
|
||||||
|
**Performance Validation Process:**
|
||||||
|
|
||||||
|
**Step 1: Get Access**
|
||||||
|
Request write access to the [pytorch/pytorch-integration-testing](https://github.com/pytorch/pytorch-integration-testing) repository to run the benchmark workflow.
|
||||||
|
|
||||||
|
**Step 2: Review Benchmark Setup**
|
||||||
|
Familiarize yourself with the benchmark configurations:
|
||||||
|
|
||||||
|
* [CUDA setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/cuda)
|
||||||
|
* [ROCm setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/rocm)
|
||||||
|
|
||||||
|
**Step 3: Run the Benchmark**
|
||||||
|
Navigate to the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) and configure:
|
||||||
|
|
||||||
|
* **vLLM branch**: Set to the release branch (e.g., `releases/v0.9.2`)
|
||||||
|
* **vLLM commit**: Set to the RC commit hash
|
||||||
|
|
||||||
|
**Step 4: Review Results**
|
||||||
|
Once the workflow completes, benchmark results will be available on the [vLLM benchmark dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) under the corresponding branch and commit.
|
||||||
|
|
||||||
|
**Step 5: Performance Comparison**
|
||||||
|
Compare the current results against the previous release to verify no performance regressions have occurred. Here is an
|
||||||
|
example of [v0.9.1 vs v0.9.2](https://hud.pytorch.org/benchmark/llms?startTime=Thu%2C%2017%20Apr%202025%2021%3A43%3A50%20GMT&stopTime=Wed%2C%2016%20Jul%202025%2021%3A43%3A50%20GMT&granularity=week&lBranch=releases/v0.9.1&lCommit=b6553be1bc75f046b00046a4ad7576364d03c835&rBranch=releases/v0.9.2&rCommit=a5dd03c1ebc5e4f56f3c9d3dc0436e9c582c978f&repoName=vllm-project%2Fvllm&benchmarkName=&modelName=All%20Models&backendName=All%20Backends&modeName=All%20Modes&dtypeName=All%20DType&deviceName=All%20Devices&archName=All%20Platforms).
|
||||||
|
|||||||
@ -4,7 +4,7 @@ This README guides you through running benchmark tests with the extensive
|
|||||||
datasets supported on vLLM. It’s a living document, updated as new features and datasets
|
datasets supported on vLLM. It’s a living document, updated as new features and datasets
|
||||||
become available.
|
become available.
|
||||||
|
|
||||||
**Dataset Overview**
|
## Dataset Overview
|
||||||
|
|
||||||
<table style="width:100%; border-collapse: collapse;">
|
<table style="width:100%; border-collapse: collapse;">
|
||||||
<thead>
|
<thead>
|
||||||
@ -81,9 +81,10 @@ become available.
|
|||||||
|
|
||||||
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
|
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
|
||||||
|
|
||||||
---
|
## 🚀 Example - Online Benchmark
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary><b>🚀 Example - Online Benchmark</b></summary>
|
<summary>Show more</summary>
|
||||||
|
|
||||||
<br/>
|
<br/>
|
||||||
|
|
||||||
@ -98,7 +99,7 @@ Then run the benchmarking script
|
|||||||
```bash
|
```bash
|
||||||
# download dataset
|
# download dataset
|
||||||
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
python3 vllm/benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||||
--endpoint /v1/completions \
|
--endpoint /v1/completions \
|
||||||
@ -109,39 +110,39 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
|||||||
|
|
||||||
If successful, you will see the following output
|
If successful, you will see the following output
|
||||||
|
|
||||||
```
|
```text
|
||||||
============ Serving Benchmark Result ============
|
============ Serving Benchmark Result ============
|
||||||
Successful requests: 10
|
Successful requests: 10
|
||||||
Benchmark duration (s): 5.78
|
Benchmark duration (s): 5.78
|
||||||
Total input tokens: 1369
|
Total input tokens: 1369
|
||||||
Total generated tokens: 2212
|
Total generated tokens: 2212
|
||||||
Request throughput (req/s): 1.73
|
Request throughput (req/s): 1.73
|
||||||
Output token throughput (tok/s): 382.89
|
Output token throughput (tok/s): 382.89
|
||||||
Total Token throughput (tok/s): 619.85
|
Total Token throughput (tok/s): 619.85
|
||||||
---------------Time to First Token----------------
|
---------------Time to First Token----------------
|
||||||
Mean TTFT (ms): 71.54
|
Mean TTFT (ms): 71.54
|
||||||
Median TTFT (ms): 73.88
|
Median TTFT (ms): 73.88
|
||||||
P99 TTFT (ms): 79.49
|
P99 TTFT (ms): 79.49
|
||||||
-----Time per Output Token (excl. 1st token)------
|
-----Time per Output Token (excl. 1st token)------
|
||||||
Mean TPOT (ms): 7.91
|
Mean TPOT (ms): 7.91
|
||||||
Median TPOT (ms): 7.96
|
Median TPOT (ms): 7.96
|
||||||
P99 TPOT (ms): 8.03
|
P99 TPOT (ms): 8.03
|
||||||
---------------Inter-token Latency----------------
|
---------------Inter-token Latency----------------
|
||||||
Mean ITL (ms): 7.74
|
Mean ITL (ms): 7.74
|
||||||
Median ITL (ms): 7.70
|
Median ITL (ms): 7.70
|
||||||
P99 ITL (ms): 8.39
|
P99 ITL (ms): 8.39
|
||||||
==================================================
|
==================================================
|
||||||
```
|
```
|
||||||
|
|
||||||
**Custom Dataset**
|
### Custom Dataset
|
||||||
|
|
||||||
If the dataset you want to benchmark is not yet supported in vLLM, you can still benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have a "prompt" field per entry, e.g., data.jsonl
|
If the dataset you want to benchmark is not yet supported in vLLM, you can still benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have a "prompt" field per entry, e.g., data.jsonl
|
||||||
|
|
||||||
```
|
```json
|
||||||
{"prompt": "What is the capital of India?"}
|
{"prompt": "What is the capital of India?"}
|
||||||
{"prompt": "What is the capital of Iran?"}
|
{"prompt": "What is the capital of Iran?"}
|
||||||
{"prompt": "What is the capital of China?"}
|
{"prompt": "What is the capital of China?"}
|
||||||
```
|
```
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# start server
|
# start server
|
||||||
@ -150,7 +151,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# run benchmarking script
|
# run benchmarking script
|
||||||
python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
|
vllm bench serve --port 9001 --save-result --save-detailed \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||||
--endpoint /v1/completions \
|
--endpoint /v1/completions \
|
||||||
@ -166,7 +167,7 @@ python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detaile
|
|||||||
|
|
||||||
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
|
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
|
||||||
|
|
||||||
**VisionArena Benchmark for Vision Language Models**
|
### VisionArena Benchmark for Vision Language Models
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# need a model with vision capability here
|
# need a model with vision capability here
|
||||||
@ -174,7 +175,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
|
|||||||
```
|
```
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 vllm/benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend openai-chat \
|
--backend openai-chat \
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
--endpoint /v1/chat/completions \
|
--endpoint /v1/chat/completions \
|
||||||
@ -184,7 +185,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
|||||||
--num-prompts 1000
|
--num-prompts 1000
|
||||||
```
|
```
|
||||||
|
|
||||||
**InstructCoder Benchmark with Speculative Decoding**
|
### InstructCoder Benchmark with Speculative Decoding
|
||||||
|
|
||||||
``` bash
|
``` bash
|
||||||
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
|
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
|
||||||
@ -194,23 +195,23 @@ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
|
|||||||
```
|
```
|
||||||
|
|
||||||
``` bash
|
``` bash
|
||||||
python3 benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--model meta-llama/Meta-Llama-3-8B-Instruct \
|
--model meta-llama/Meta-Llama-3-8B-Instruct \
|
||||||
--dataset-name hf \
|
--dataset-name hf \
|
||||||
--dataset-path likaixin/InstructCoder \
|
--dataset-path likaixin/InstructCoder \
|
||||||
--num-prompts 2048
|
--num-prompts 2048
|
||||||
```
|
```
|
||||||
|
|
||||||
**Other HuggingFaceDataset Examples**
|
### Other HuggingFaceDataset Examples
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
|
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
|
||||||
```
|
```
|
||||||
|
|
||||||
**`lmms-lab/LLaVA-OneVision-Data`**
|
`lmms-lab/LLaVA-OneVision-Data`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 vllm/benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend openai-chat \
|
--backend openai-chat \
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
--endpoint /v1/chat/completions \
|
--endpoint /v1/chat/completions \
|
||||||
@ -221,10 +222,10 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
|||||||
--num-prompts 10
|
--num-prompts 10
|
||||||
```
|
```
|
||||||
|
|
||||||
**`Aeala/ShareGPT_Vicuna_unfiltered`**
|
`Aeala/ShareGPT_Vicuna_unfiltered`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 vllm/benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend openai-chat \
|
--backend openai-chat \
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
--endpoint /v1/chat/completions \
|
--endpoint /v1/chat/completions \
|
||||||
@ -234,10 +235,10 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
|||||||
--num-prompts 10
|
--num-prompts 10
|
||||||
```
|
```
|
||||||
|
|
||||||
**`AI-MO/aimo-validation-aime`**
|
`AI-MO/aimo-validation-aime`:
|
||||||
|
|
||||||
``` bash
|
``` bash
|
||||||
python3 vllm/benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--model Qwen/QwQ-32B \
|
--model Qwen/QwQ-32B \
|
||||||
--dataset-name hf \
|
--dataset-name hf \
|
||||||
--dataset-path AI-MO/aimo-validation-aime \
|
--dataset-path AI-MO/aimo-validation-aime \
|
||||||
@ -245,23 +246,23 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
|||||||
--seed 42
|
--seed 42
|
||||||
```
|
```
|
||||||
|
|
||||||
**`philschmid/mt-bench`**
|
`philschmid/mt-bench`:
|
||||||
|
|
||||||
``` bash
|
``` bash
|
||||||
python3 vllm/benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--model Qwen/QwQ-32B \
|
--model Qwen/QwQ-32B \
|
||||||
--dataset-name hf \
|
--dataset-name hf \
|
||||||
--dataset-path philschmid/mt-bench \
|
--dataset-path philschmid/mt-bench \
|
||||||
--num-prompts 80
|
--num-prompts 80
|
||||||
```
|
```
|
||||||
|
|
||||||
**Running With Sampling Parameters**
|
### Running With Sampling Parameters
|
||||||
|
|
||||||
When using OpenAI-compatible backends such as `vllm`, optional sampling
|
When using OpenAI-compatible backends such as `vllm`, optional sampling
|
||||||
parameters can be specified. Example client command:
|
parameters can be specified. Example client command:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 vllm/benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||||
--endpoint /v1/completions \
|
--endpoint /v1/completions \
|
||||||
@ -273,30 +274,34 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
|||||||
--num-prompts 10
|
--num-prompts 10
|
||||||
```
|
```
|
||||||
|
|
||||||
**Running With Ramp-Up Request Rate**
|
### Running With Ramp-Up Request Rate
|
||||||
|
|
||||||
The benchmark tool also supports ramping up the request rate over the
|
The benchmark tool also supports ramping up the request rate over the
|
||||||
duration of the benchmark run. This can be useful for stress testing the
|
duration of the benchmark run. This can be useful for stress testing the
|
||||||
server or finding the maximum throughput that it can handle, given some latency budget.
|
server or finding the maximum throughput that it can handle, given some latency budget.
|
||||||
|
|
||||||
Two ramp-up strategies are supported:
|
Two ramp-up strategies are supported:
|
||||||
|
|
||||||
- `linear`: Increases the request rate linearly from a start value to an end value.
|
- `linear`: Increases the request rate linearly from a start value to an end value.
|
||||||
- `exponential`: Increases the request rate exponentially.
|
- `exponential`: Increases the request rate exponentially.
|
||||||
|
|
||||||
The following arguments can be used to control the ramp-up:
|
The following arguments can be used to control the ramp-up:
|
||||||
|
|
||||||
- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
|
- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
|
||||||
- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
|
- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
|
||||||
- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
|
- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
|
## 📈 Example - Offline Throughput Benchmark
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary><b>📈 Example - Offline Throughput Benchmark</b></summary>
|
<summary>Show more</summary>
|
||||||
|
|
||||||
<br/>
|
<br/>
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
vllm bench throughput \
|
||||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||||
--dataset-name sonnet \
|
--dataset-name sonnet \
|
||||||
--dataset-path vllm/benchmarks/sonnet.txt \
|
--dataset-path vllm/benchmarks/sonnet.txt \
|
||||||
@ -305,16 +310,16 @@ python3 vllm/benchmarks/benchmark_throughput.py \
|
|||||||
|
|
||||||
If successful, you will see the following output
|
If successful, you will see the following output
|
||||||
|
|
||||||
```
|
```text
|
||||||
Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
|
Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
|
||||||
Total num prompt tokens: 5014
|
Total num prompt tokens: 5014
|
||||||
Total num output tokens: 1500
|
Total num output tokens: 1500
|
||||||
```
|
```
|
||||||
|
|
||||||
**VisionArena Benchmark for Vision Language Models**
|
### VisionArena Benchmark for Vision Language Models
|
||||||
|
|
||||||
``` bash
|
```bash
|
||||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
vllm bench throughput \
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
--backend vllm-chat \
|
--backend vllm-chat \
|
||||||
--dataset-name hf \
|
--dataset-name hf \
|
||||||
@ -325,18 +330,18 @@ python3 vllm/benchmarks/benchmark_throughput.py \
|
|||||||
|
|
||||||
The `num prompt tokens` now includes image token counts
|
The `num prompt tokens` now includes image token counts
|
||||||
|
|
||||||
```
|
```text
|
||||||
Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
|
Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
|
||||||
Total num prompt tokens: 14527
|
Total num prompt tokens: 14527
|
||||||
Total num output tokens: 1280
|
Total num output tokens: 1280
|
||||||
```
|
```
|
||||||
|
|
||||||
**InstructCoder Benchmark with Speculative Decoding**
|
### InstructCoder Benchmark with Speculative Decoding
|
||||||
|
|
||||||
``` bash
|
``` bash
|
||||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||||
VLLM_USE_V1=1 \
|
VLLM_USE_V1=1 \
|
||||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
vllm bench throughput \
|
||||||
--dataset-name=hf \
|
--dataset-name=hf \
|
||||||
--dataset-path=likaixin/InstructCoder \
|
--dataset-path=likaixin/InstructCoder \
|
||||||
--model=meta-llama/Meta-Llama-3-8B-Instruct \
|
--model=meta-llama/Meta-Llama-3-8B-Instruct \
|
||||||
@ -349,18 +354,18 @@ python3 vllm/benchmarks/benchmark_throughput.py \
|
|||||||
"prompt_lookup_min": 2}'
|
"prompt_lookup_min": 2}'
|
||||||
```
|
```
|
||||||
|
|
||||||
```
|
```text
|
||||||
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
|
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
|
||||||
Total num prompt tokens: 261136
|
Total num prompt tokens: 261136
|
||||||
Total num output tokens: 204800
|
Total num output tokens: 204800
|
||||||
```
|
```
|
||||||
|
|
||||||
**Other HuggingFaceDataset Examples**
|
### Other HuggingFaceDataset Examples
|
||||||
|
|
||||||
**`lmms-lab/LLaVA-OneVision-Data`**
|
`lmms-lab/LLaVA-OneVision-Data`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
vllm bench throughput \
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
--backend vllm-chat \
|
--backend vllm-chat \
|
||||||
--dataset-name hf \
|
--dataset-name hf \
|
||||||
@ -370,10 +375,10 @@ python3 vllm/benchmarks/benchmark_throughput.py \
|
|||||||
--num-prompts 10
|
--num-prompts 10
|
||||||
```
|
```
|
||||||
|
|
||||||
**`Aeala/ShareGPT_Vicuna_unfiltered`**
|
`Aeala/ShareGPT_Vicuna_unfiltered`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
vllm bench throughput \
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
--backend vllm-chat \
|
--backend vllm-chat \
|
||||||
--dataset-name hf \
|
--dataset-name hf \
|
||||||
@ -382,10 +387,10 @@ python3 vllm/benchmarks/benchmark_throughput.py \
|
|||||||
--num-prompts 10
|
--num-prompts 10
|
||||||
```
|
```
|
||||||
|
|
||||||
**`AI-MO/aimo-validation-aime`**
|
`AI-MO/aimo-validation-aime`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 benchmarks/benchmark_throughput.py \
|
vllm bench throughput \
|
||||||
--model Qwen/QwQ-32B \
|
--model Qwen/QwQ-32B \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--dataset-name hf \
|
--dataset-name hf \
|
||||||
@ -394,12 +399,12 @@ python3 benchmarks/benchmark_throughput.py \
|
|||||||
--num-prompts 10
|
--num-prompts 10
|
||||||
```
|
```
|
||||||
|
|
||||||
**Benchmark with LoRA Adapters**
|
Benchmark with LoRA adapters:
|
||||||
|
|
||||||
``` bash
|
``` bash
|
||||||
# download dataset
|
# download dataset
|
||||||
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
vllm bench throughput \
|
||||||
--model meta-llama/Llama-2-7b-hf \
|
--model meta-llama/Llama-2-7b-hf \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||||
@ -413,20 +418,22 @@ python3 vllm/benchmarks/benchmark_throughput.py \
|
|||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
|
## 🛠️ Example - Structured Output Benchmark
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary><b>🛠️ Example - Structured Output Benchmark</b></summary>
|
<summary>Show more</summary>
|
||||||
|
|
||||||
<br/>
|
<br/>
|
||||||
|
|
||||||
Benchmark the performance of structured output generation (JSON, grammar, regex).
|
Benchmark the performance of structured output generation (JSON, grammar, regex).
|
||||||
|
|
||||||
**Server Setup**
|
### Server Setup
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
|
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
|
||||||
```
|
```
|
||||||
|
|
||||||
**JSON Schema Benchmark**
|
### JSON Schema Benchmark
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||||
@ -438,7 +445,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
|
|||||||
--num-prompts 1000
|
--num-prompts 1000
|
||||||
```
|
```
|
||||||
|
|
||||||
**Grammar-based Generation Benchmark**
|
### Grammar-based Generation Benchmark
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||||
@ -450,7 +457,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
|
|||||||
--num-prompts 1000
|
--num-prompts 1000
|
||||||
```
|
```
|
||||||
|
|
||||||
**Regex-based Generation Benchmark**
|
### Regex-based Generation Benchmark
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||||
@ -461,7 +468,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
|
|||||||
--num-prompts 1000
|
--num-prompts 1000
|
||||||
```
|
```
|
||||||
|
|
||||||
**Choice-based Generation Benchmark**
|
### Choice-based Generation Benchmark
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||||
@ -472,7 +479,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
|
|||||||
--num-prompts 1000
|
--num-prompts 1000
|
||||||
```
|
```
|
||||||
|
|
||||||
**XGrammar Benchmark Dataset**
|
### XGrammar Benchmark Dataset
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||||
@ -485,14 +492,16 @@ python3 benchmarks/benchmark_serving_structured_output.py \
|
|||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
|
## 📚 Example - Long Document QA Benchmark
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary><b>📚 Example - Long Document QA Benchmark</b></summary>
|
<summary>Show more</summary>
|
||||||
|
|
||||||
<br/>
|
<br/>
|
||||||
|
|
||||||
Benchmark the performance of long document question-answering with prefix caching.
|
Benchmark the performance of long document question-answering with prefix caching.
|
||||||
|
|
||||||
**Basic Long Document QA Test**
|
### Basic Long Document QA Test
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
||||||
@ -504,7 +513,7 @@ python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
|||||||
--repeat-count 5
|
--repeat-count 5
|
||||||
```
|
```
|
||||||
|
|
||||||
**Different Repeat Modes**
|
### Different Repeat Modes
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Random mode (default) - shuffle prompts randomly
|
# Random mode (default) - shuffle prompts randomly
|
||||||
@ -537,14 +546,16 @@ python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
|||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
|
## 🗂️ Example - Prefix Caching Benchmark
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary><b>🗂️ Example - Prefix Caching Benchmark</b></summary>
|
<summary>Show more</summary>
|
||||||
|
|
||||||
<br/>
|
<br/>
|
||||||
|
|
||||||
Benchmark the efficiency of automatic prefix caching.
|
Benchmark the efficiency of automatic prefix caching.
|
||||||
|
|
||||||
**Fixed Prompt with Prefix Caching**
|
### Fixed Prompt with Prefix Caching
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 benchmarks/benchmark_prefix_caching.py \
|
python3 benchmarks/benchmark_prefix_caching.py \
|
||||||
@ -555,7 +566,7 @@ python3 benchmarks/benchmark_prefix_caching.py \
|
|||||||
--input-length-range 128:256
|
--input-length-range 128:256
|
||||||
```
|
```
|
||||||
|
|
||||||
**ShareGPT Dataset with Prefix Caching**
|
### ShareGPT Dataset with Prefix Caching
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# download dataset
|
# download dataset
|
||||||
@ -572,14 +583,16 @@ python3 benchmarks/benchmark_prefix_caching.py \
|
|||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
|
## ⚡ Example - Request Prioritization Benchmark
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary><b>⚡ Example - Request Prioritization Benchmark</b></summary>
|
<summary>Show more</summary>
|
||||||
|
|
||||||
<br/>
|
<br/>
|
||||||
|
|
||||||
Benchmark the performance of request prioritization in vLLM.
|
Benchmark the performance of request prioritization in vLLM.
|
||||||
|
|
||||||
**Basic Prioritization Test**
|
### Basic Prioritization Test
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 benchmarks/benchmark_prioritization.py \
|
python3 benchmarks/benchmark_prioritization.py \
|
||||||
@ -590,7 +603,7 @@ python3 benchmarks/benchmark_prioritization.py \
|
|||||||
--scheduling-policy priority
|
--scheduling-policy priority
|
||||||
```
|
```
|
||||||
|
|
||||||
**Multiple Sequences per Prompt**
|
### Multiple Sequences per Prompt
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 benchmarks/benchmark_prioritization.py \
|
python3 benchmarks/benchmark_prioritization.py \
|
||||||
|
|||||||
145
benchmarks/auto_tune/README.md
Normal file
@ -0,0 +1,145 @@
|
|||||||
|
# Automated vLLM Server Parameter Tuning
|
||||||
|
|
||||||
|
This script automates the process of finding the optimal server parameter combination (`max-num-seqs` and `max-num-batched-tokens`) to maximize throughput for a vLLM server. It also supports additional constraints such as E2E latency and prefix cache hit rate.
|
||||||
|
|
||||||
|
## Table of Contents
|
||||||
|
|
||||||
|
- [Prerequisites](#prerequisites)
|
||||||
|
- [Configuration](#configuration)
|
||||||
|
- [How to Run](#how-to-run)
|
||||||
|
- [Example Use Cases](#example-use-cases)
|
||||||
|
- [Output](#output)
|
||||||
|
- [How It Works](#how-it-works)
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
Before running the script, please ensure the following steps are completed:
|
||||||
|
|
||||||
|
1. **Clone vLLM & Set Up Branch**: Clone the vLLM repository and check out your desired branch.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone https://github.com/vllm-project/vllm.git
|
||||||
|
cd vllm
|
||||||
|
# git checkout <your-branch>
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Install Environment**: Install or update the correct running environment. For TPU usage, activate your `conda` environment and install the corresponding `torch` and `torch_xla` versions.
|
||||||
|
|
||||||
|
3. **Model Configuration**: If you are using a customized model, ensure its configuration files are correctly placed and accessible.
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
You must set the following variables at the top of the script before execution.
|
||||||
|
|
||||||
|
| Variable | Description | Example Value |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
|
||||||
|
| `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` |
|
||||||
|
| `SYSTEM` | **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (Saving profiles might not be supported on other systems.) | `"TPU"` |
|
||||||
|
| `TP` | **Required.** The tensor-parallelism size. | `1` |
|
||||||
|
| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
|
||||||
|
| `INPUT_LEN` | **Required.** Request input length. | `4000` |
|
||||||
|
| `OUTPUT_LEN` | **Required.** Request output length. | `16` |
|
||||||
|
| `MAX_MODEL_LEN` | **Required.** Max model length. | `4096` |
|
||||||
|
| `MIN_CACHE_HIT_PCT` | Prefix cache hit rate in percentage (0-100). Set to `0` to disable. | `60` |
|
||||||
|
| `MAX_LATENCY_ALLOWED_MS` | The maximum allowed P99 end-to-end latency in milliseconds. Set to a very large number (e.g., `100000000000`) to effectively ignore the latency constraint. | `500` |
|
||||||
|
| `NUM_SEQS_LIST` | A space-separated string of `max-num-seqs` values to test. | `"128 256"` |
|
||||||
|
| `NUM_BATCHED_TOKENS_LIST` | A space-separated string of `max-num-batched-tokens` values to test. | `"1024 2048 4096"` |
|
||||||
|
|
||||||
|
**Note**: The default `NUM_SEQS_LIST` and `NUM_BATCHED_TOKENS_LIST` are set for medium-sized inputs/outputs. For very short contexts (e.g., 20 input, 20 output tokens), you may need to test larger values for `max-num-seqs`.
|
||||||
|
|
||||||
|
## How to Run
|
||||||
|
|
||||||
|
1. **Configure**: Edit the script and set the variables in the [Configuration](#configuration) section.
|
||||||
|
2. **Execute**: Run the script. Since the process can take a long time, it is highly recommended to use a terminal multiplexer like `tmux` or `screen` to prevent the script from stopping if your connection is lost.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd <FOLDER_OF_THIS_SCRIPT>
|
||||||
|
bash auto_tune.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Please note that the `bash auto_tune.sh` command must not contain a full or partial path that includes the keyword `vllm`; otherwise, the `pkill -f vllm` command will also kill the script itself.
|
||||||
|
|
||||||
|
## Example Use Cases
|
||||||
|
|
||||||
|
Here are a few examples of how to configure the script for different goals:
|
||||||
|
|
||||||
|
### 1. Maximize Throughput (No Latency Constraint)
|
||||||
|
|
||||||
|
- **Goal**: Find the best `max-num-seqs` and `max-num-batched-tokens` to get the highest possible throughput for 1800 input tokens and 20 output tokens.
|
||||||
|
- **Configuration**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
INPUT_LEN=1800
|
||||||
|
OUTPUT_LEN=20
|
||||||
|
MAX_MODEL_LEN=2048
|
||||||
|
MIN_CACHE_HIT_PCT=0
|
||||||
|
MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Maximize Throughput with a Latency Requirement
|
||||||
|
|
||||||
|
- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
|
||||||
|
- **Configuration**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
INPUT_LEN=1800
|
||||||
|
OUTPUT_LEN=20
|
||||||
|
MAX_MODEL_LEN=2048
|
||||||
|
MIN_CACHE_HIT_PCT=0
|
||||||
|
MAX_LATENCY_ALLOWED_MS=500
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Maximize Throughput with Prefix Caching and Latency Requirements
|
||||||
|
|
||||||
|
- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
|
||||||
|
- **Configuration**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
INPUT_LEN=1800
|
||||||
|
OUTPUT_LEN=20
|
||||||
|
MAX_MODEL_LEN=2048
|
||||||
|
MIN_CACHE_HIT_PCT=60
|
||||||
|
MAX_LATENCY_ALLOWED_MS=500
|
||||||
|
```
|
||||||
|
|
||||||
|
## Output
|
||||||
|
|
||||||
|
After the script finishes, you will find the results in a new, timestamped directory created inside `$BASE/auto-benchmark/`.
|
||||||
|
|
||||||
|
- **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run:
|
||||||
|
- `vllm_log_...txt`: The log output from the vLLM server for each parameter combination.
|
||||||
|
- `bm_log_...txt`: The log output from the `vllm bench serve` command for each benchmark run.
|
||||||
|
|
||||||
|
- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found.
|
||||||
|
|
||||||
|
```text
|
||||||
|
# Example result.txt content
|
||||||
|
hash:a1b2c3d4...
|
||||||
|
max_num_seqs: 128, max_num_batched_tokens: 2048, request_rate: 10.0, e2el: 450.5, throughput: 9.8, goodput: 9.8
|
||||||
|
max_num_seqs: 128, max_num_batched_tokens: 4096 does not meet latency requirement 500
|
||||||
|
...
|
||||||
|
best_max_num_seqs: 256, best_num_batched_tokens: 2048, best_throughput: 12.5, profile saved in: /home/user/auto-benchmark/2024_08_01_10_30/profile
|
||||||
|
```
|
||||||
|
|
||||||
|
If it cannot find the best parameters, the final row will be `best_max_num_seqs: 0, best_num_batched_tokens: 0, best_throughput: 0`. This can be due to either the server not starting properly, or the latency requirement being too strict.
|
||||||
|
|
||||||
|
- **Profiler Trace**: A directory named `profile` is created inside the log directory. It contains the profiler trace file (e.g., `.xplane.pb` for TPU or a `.json` trace for GPU) from the single best-performing run.
|
||||||
|
|
||||||
|
## How It Works
|
||||||
|
|
||||||
|
The script follows a systematic process to find the optimal parameters:
|
||||||
|
|
||||||
|
1. **Find Max GPU Memory Utilization**: The script first determines the highest safe `gpu-memory-utilization` (starting from 0.98 and decreasing) that does not cause an Out-Of-Memory (OOM) error when launching the server. This ensures the benchmark runs use the maximum available memory without crashing.
|
||||||
|
|
||||||
|
2. **Iterate and Benchmark**: It then enters a nested loop, iterating through every combination of `max-num-seqs` and `max-num-batched-tokens` provided in the configuration lists.
|
||||||
|
|
||||||
|
3. **Latency-Aware Throughput Search**: For each parameter combination:
|
||||||
|
- The vLLM server is started.
|
||||||
|
- A benchmark is first run with an infinite request rate (`--request-rate inf`).
|
||||||
|
- If the resulting P99 E2E latency is within the `MAX_LATENCY_ALLOWED_MS` limit, this throughput is considered the maximum for this configuration.
|
||||||
|
- If the latency is too high, the script performs a search by iteratively decreasing the request rate until the latency constraint is met. This finds the highest sustainable throughput for the given parameters and latency requirement.
|
||||||
|
|
||||||
|
4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far.
|
||||||
|
|
||||||
|
5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard.
|
||||||
@ -1,45 +1,18 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
# This script tunes server parameter combinations to maximize throughput under the given requirements.
|
# This script tunes server parameter combinations to maximize throughput under the given requirements.
|
||||||
# The current server parameter combination is max_num_seqs and max_num_batched_tokens
|
# See details in README (benchmarks/auto_tune/README.md).
|
||||||
# It also supports additional requirement: e2e latency and prefix cache.
|
|
||||||
|
|
||||||
# Pre-requisite:
|
|
||||||
# 1. Check out your branch and install/update the correct runtime environment. For TPU, activate the conda env and install the corresponding torch and xla versions.
|
|
||||||
# 2. If the model is customized, replace the MODEL's config with the customized config.
|
|
||||||
# 3. Set variables (ALL REQUIRED)
|
|
||||||
# BASE: your directory for vllm repo
|
|
||||||
# MODEL: the model served by vllm
|
|
||||||
# SYSTEM: the hardware, either TPU or GPU; on other systems, "get best profile" might not be supported.
|
|
||||||
# TP: ways of tensor parallelism
|
|
||||||
# DOWNLOAD_DIR: directory to download and load model weights.
|
|
||||||
# INPUT_LEN: request input len
|
|
||||||
# OUTPUT_LEN: request output len
|
|
||||||
# MIN_CACHE_HIT_PCT: prefix cache rate
|
|
||||||
# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
|
|
||||||
# NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with.
|
|
||||||
# NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with.
|
|
||||||
# Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium-sized input/output lengths. For extra-short contexts (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
|
|
||||||
# 4. Run the script. It might take a long time, so consider using tmux to keep the script running if your connection drops.
|
|
||||||
# 5. The final result will be saved in RESULT file.
|
|
||||||
|
|
||||||
|
|
||||||
# Example use cases
|
|
||||||
# 1. Given input_len=1800, output_len=20, what's the best max_num_seqs and max_num_batched_tokens to get highest throughput?
|
|
||||||
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000
|
|
||||||
# 2. If we have latency requirement to be lower than 500ms, what's the best server parameter?
|
|
||||||
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500
|
|
||||||
# 3. If we want to reach 60% prefix cache, what's the best server parameter?
|
|
||||||
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500
|
|
||||||
|
|
||||||
TAG=$(date +"%Y_%m_%d_%H_%M")
|
TAG=$(date +"%Y_%m_%d_%H_%M")
|
||||||
BASE=""
|
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||||
|
BASE="$SCRIPT_DIR/../../.."
|
||||||
MODEL="meta-llama/Llama-3.1-8B-Instruct"
|
MODEL="meta-llama/Llama-3.1-8B-Instruct"
|
||||||
SYSTEM="TPU"
|
SYSTEM="TPU"
|
||||||
TP=1
|
TP=1
|
||||||
DOWNLOAD_DIR=""
|
DOWNLOAD_DIR=""
|
||||||
INPUT_LEN=4000
|
INPUT_LEN=4000
|
||||||
OUTPUT_LEN=16
|
OUTPUT_LEN=16
|
||||||
|
MAX_MODEL_LEN=4096
|
||||||
MIN_CACHE_HIT_PCT=0
|
MIN_CACHE_HIT_PCT=0
|
||||||
MAX_LATENCY_ALLOWED_MS=100000000000
|
MAX_LATENCY_ALLOWED_MS=100000000000
|
||||||
NUM_SEQS_LIST="128 256"
|
NUM_SEQS_LIST="128 256"
|
||||||
@ -65,6 +38,13 @@ current_hash=$(git rev-parse HEAD)
|
|||||||
echo "hash:$current_hash" >> "$RESULT"
|
echo "hash:$current_hash" >> "$RESULT"
|
||||||
echo "current_hash: $current_hash"
|
echo "current_hash: $current_hash"
|
||||||
|
|
||||||
|
TOTAL_LEN=$((INPUT_LEN + OUTPUT_LEN))
|
||||||
|
RED='\033[0;31m'
|
||||||
|
if (( TOTAL_LEN > MAX_MODEL_LEN )); then
|
||||||
|
echo -e "${RED}FAILED: INPUT_LEN($INPUT_LEN) + OUTPUT_LEN($OUTPUT_LEN) = $TOTAL_LEN, which is > MAX_MODEL_LEN = $MAX_MODEL_LEN.\033[0m" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
best_throughput=0
|
best_throughput=0
|
||||||
best_max_num_seqs=0
|
best_max_num_seqs=0
|
||||||
best_num_batched_tokens=0
|
best_num_batched_tokens=0
|
||||||
@ -76,7 +56,7 @@ start_server() {
|
|||||||
local max_num_batched_tokens=$3
|
local max_num_batched_tokens=$3
|
||||||
local vllm_log=$4
|
local vllm_log=$4
|
||||||
local profile_dir=$5
|
local profile_dir=$5
|
||||||
|
|
||||||
pkill -f vllm
|
pkill -f vllm
|
||||||
|
|
||||||
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
|
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
|
||||||
@ -89,13 +69,13 @@ start_server() {
|
|||||||
--enable-prefix-caching \
|
--enable-prefix-caching \
|
||||||
--load-format dummy \
|
--load-format dummy \
|
||||||
--download-dir "$DOWNLOAD_DIR" \
|
--download-dir "$DOWNLOAD_DIR" \
|
||||||
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
|
--max-model-len $MAX_MODEL_LEN > "$vllm_log" 2>&1 &
|
||||||
|
|
||||||
# wait for 10 minutes...
|
# wait for 10 minutes...
|
||||||
server_started=0
|
server_started=0
|
||||||
for i in {1..60}; do
|
for i in {1..60}; do
|
||||||
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
|
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
|
||||||
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
|
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
|
||||||
if [[ "$STATUS_CODE" -eq 200 ]]; then
|
if [[ "$STATUS_CODE" -eq 200 ]]; then
|
||||||
server_started=1
|
server_started=1
|
||||||
break
|
break
|
||||||
@ -118,10 +98,10 @@ update_best_profile() {
|
|||||||
selected_profile_file=
|
selected_profile_file=
|
||||||
if [[ "$SYSTEM" == "TPU" ]]; then
|
if [[ "$SYSTEM" == "TPU" ]]; then
|
||||||
selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
|
selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
|
||||||
fi
|
fi
|
||||||
if [[ "$SYSTEM" == "GPU" ]]; then
|
if [[ "$SYSTEM" == "GPU" ]]; then
|
||||||
selected_profile_file="${sorted_paths[$profile_index]}"
|
selected_profile_file="${sorted_paths[$profile_index]}"
|
||||||
fi
|
fi
|
||||||
rm -f $PROFILE_PATH/*
|
rm -f $PROFILE_PATH/*
|
||||||
cp $selected_profile_file $PROFILE_PATH
|
cp $selected_profile_file $PROFILE_PATH
|
||||||
}
|
}
|
||||||
@ -149,17 +129,18 @@ run_benchmark() {
|
|||||||
echo "server started."
|
echo "server started."
|
||||||
fi
|
fi
|
||||||
echo
|
echo
|
||||||
|
|
||||||
echo "run benchmark test..."
|
echo "run benchmark test..."
|
||||||
meet_latency_requirement=0
|
meet_latency_requirement=0
|
||||||
# get a basic qps by using request-rate inf
|
# get a basic qps by using request-rate inf
|
||||||
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
|
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
|
||||||
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
|
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
|
||||||
python benchmarks/benchmark_serving.py \
|
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
|
||||||
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model $MODEL \
|
--model $MODEL \
|
||||||
--dataset-name random \
|
--dataset-name random \
|
||||||
--random-input-len $INPUT_LEN \
|
--random-input-len $adjusted_input_len \
|
||||||
--random-output-len $OUTPUT_LEN \
|
--random-output-len $OUTPUT_LEN \
|
||||||
--ignore-eos \
|
--ignore-eos \
|
||||||
--disable-tqdm \
|
--disable-tqdm \
|
||||||
@ -188,11 +169,11 @@ run_benchmark() {
|
|||||||
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
|
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
|
||||||
sleep 5
|
sleep 5
|
||||||
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
|
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
|
||||||
python benchmarks/benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model $MODEL \
|
--model $MODEL \
|
||||||
--dataset-name random \
|
--dataset-name random \
|
||||||
--random-input-len $INPUT_LEN \
|
--random-input-len $adjusted_input_len \
|
||||||
--random-output-len $OUTPUT_LEN \
|
--random-output-len $OUTPUT_LEN \
|
||||||
--ignore-eos \
|
--ignore-eos \
|
||||||
--disable-tqdm \
|
--disable-tqdm \
|
||||||
@ -273,4 +254,3 @@ done
|
|||||||
echo "finish permutations"
|
echo "finish permutations"
|
||||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
|
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
|
||||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
|
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
|
||||||
|
|
||||||
@ -11,6 +11,7 @@ from typing import Any, Optional
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||||
@ -34,6 +35,10 @@ def save_to_pytorch_benchmark_format(
|
|||||||
write_to_json(pt_file, pt_records)
|
write_to_json(pt_file, pt_records)
|
||||||
|
|
||||||
|
|
||||||
|
@deprecated(
|
||||||
|
"benchmark_latency.py is deprecated and will be removed in a "
|
||||||
|
"future version. Please use 'vllm bench latency' instead.",
|
||||||
|
)
|
||||||
def main(args: argparse.Namespace):
|
def main(args: argparse.Namespace):
|
||||||
print(args)
|
print(args)
|
||||||
|
|
||||||
|
|||||||
@ -30,7 +30,7 @@ import os
|
|||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
import warnings
|
import warnings
|
||||||
from collections.abc import AsyncGenerator, Iterable
|
from collections.abc import Iterable
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Any, Literal, Optional
|
from typing import Any, Literal, Optional
|
||||||
@ -38,6 +38,7 @@ from typing import Any, Literal, Optional
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from tqdm.asyncio import tqdm
|
from tqdm.asyncio import tqdm
|
||||||
from transformers import PreTrainedTokenizerBase
|
from transformers import PreTrainedTokenizerBase
|
||||||
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
from backend_request_func import (
|
from backend_request_func import (
|
||||||
ASYNC_REQUEST_FUNCS,
|
ASYNC_REQUEST_FUNCS,
|
||||||
@ -73,6 +74,7 @@ from benchmark_dataset import (
|
|||||||
VisionArenaDataset,
|
VisionArenaDataset,
|
||||||
)
|
)
|
||||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||||
|
from vllm.benchmarks.serve import get_request
|
||||||
|
|
||||||
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
|
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
|
||||||
|
|
||||||
@ -107,101 +109,6 @@ class BenchmarkMetrics:
|
|||||||
percentiles_e2el_ms: list[tuple[float, float]]
|
percentiles_e2el_ms: list[tuple[float, float]]
|
||||||
|
|
||||||
|
|
||||||
def _get_current_request_rate(
|
|
||||||
ramp_up_strategy: Optional[Literal["linear", "exponential"]],
|
|
||||||
ramp_up_start_rps: Optional[int],
|
|
||||||
ramp_up_end_rps: Optional[int],
|
|
||||||
request_index: int,
|
|
||||||
total_requests: int,
|
|
||||||
request_rate: float,
|
|
||||||
) -> float:
|
|
||||||
if (
|
|
||||||
ramp_up_strategy
|
|
||||||
and ramp_up_start_rps is not None
|
|
||||||
and ramp_up_end_rps is not None
|
|
||||||
):
|
|
||||||
progress = request_index / max(total_requests - 1, 1)
|
|
||||||
if ramp_up_strategy == "linear":
|
|
||||||
increase = (ramp_up_end_rps - ramp_up_start_rps) * progress
|
|
||||||
return ramp_up_start_rps + increase
|
|
||||||
elif ramp_up_strategy == "exponential":
|
|
||||||
ratio = ramp_up_end_rps / ramp_up_start_rps
|
|
||||||
return ramp_up_start_rps * (ratio**progress)
|
|
||||||
else:
|
|
||||||
raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}")
|
|
||||||
return request_rate
|
|
||||||
|
|
||||||
|
|
||||||
async def get_request(
|
|
||||||
input_requests: list[SampleRequest],
|
|
||||||
request_rate: float,
|
|
||||||
burstiness: float = 1.0,
|
|
||||||
ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
|
|
||||||
ramp_up_start_rps: Optional[int] = None,
|
|
||||||
ramp_up_end_rps: Optional[int] = None,
|
|
||||||
) -> AsyncGenerator[tuple[SampleRequest, float], None]:
|
|
||||||
"""
|
|
||||||
Asynchronously generates requests at a specified rate
|
|
||||||
with OPTIONAL burstiness and OPTIONAL ramp-up strategy.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
input_requests:
|
|
||||||
A list of input requests, each represented as a SampleRequest.
|
|
||||||
request_rate:
|
|
||||||
The rate at which requests are generated (requests/s).
|
|
||||||
burstiness (optional):
|
|
||||||
The burstiness factor of the request generation.
|
|
||||||
Only takes effect when request_rate is not inf.
|
|
||||||
Default value is 1, which follows a Poisson process.
|
|
||||||
Otherwise, the request intervals follow a gamma distribution.
|
|
||||||
A lower burstiness value (0 < burstiness < 1) results
|
|
||||||
in more bursty requests, while a higher burstiness value
|
|
||||||
(burstiness > 1) results in a more uniform arrival of requests.
|
|
||||||
ramp_up_strategy (optional):
|
|
||||||
The ramp-up strategy. Can be "linear" or "exponential".
|
|
||||||
If None, uses constant request rate (specified by request_rate).
|
|
||||||
ramp_up_start_rps (optional):
|
|
||||||
The starting request rate for ramp-up.
|
|
||||||
ramp_up_end_rps (optional):
|
|
||||||
The ending request rate for ramp-up.
|
|
||||||
"""
|
|
||||||
assert burstiness > 0, (
|
|
||||||
f"A positive burstiness factor is expected, but given {burstiness}."
|
|
||||||
)
|
|
||||||
# Convert to list to get length for ramp-up calculations
|
|
||||||
if isinstance(input_requests, Iterable) and not isinstance(input_requests, list):
|
|
||||||
input_requests = list(input_requests)
|
|
||||||
|
|
||||||
total_requests = len(input_requests)
|
|
||||||
request_index = 0
|
|
||||||
|
|
||||||
for request in input_requests:
|
|
||||||
current_request_rate = _get_current_request_rate(
|
|
||||||
ramp_up_strategy,
|
|
||||||
ramp_up_start_rps,
|
|
||||||
ramp_up_end_rps,
|
|
||||||
request_index,
|
|
||||||
total_requests,
|
|
||||||
request_rate,
|
|
||||||
)
|
|
||||||
|
|
||||||
yield request, current_request_rate
|
|
||||||
|
|
||||||
request_index += 1
|
|
||||||
|
|
||||||
if current_request_rate == float("inf"):
|
|
||||||
# If the request rate is infinity, then we don't need to wait.
|
|
||||||
continue
|
|
||||||
|
|
||||||
theta = 1.0 / (current_request_rate * burstiness)
|
|
||||||
|
|
||||||
# Sample the request interval from the gamma distribution.
|
|
||||||
# If burstiness is 1, it follows exponential distribution.
|
|
||||||
interval = np.random.gamma(shape=burstiness, scale=theta)
|
|
||||||
# The next request will be sent after the interval.
|
|
||||||
await asyncio.sleep(interval)
|
|
||||||
|
|
||||||
|
|
||||||
def calculate_metrics(
|
def calculate_metrics(
|
||||||
input_requests: list[SampleRequest],
|
input_requests: list[SampleRequest],
|
||||||
outputs: list[RequestFuncOutput],
|
outputs: list[RequestFuncOutput],
|
||||||
@ -489,20 +396,6 @@ async def benchmark(
|
|||||||
tasks.append(asyncio.create_task(task))
|
tasks.append(asyncio.create_task(task))
|
||||||
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
|
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
|
||||||
|
|
||||||
if profile:
|
|
||||||
print("Stopping profiler...")
|
|
||||||
profile_input = RequestFuncInput(
|
|
||||||
model=model_id,
|
|
||||||
prompt=test_prompt,
|
|
||||||
api_url=base_url + "/stop_profile",
|
|
||||||
prompt_len=test_prompt_len,
|
|
||||||
output_len=test_output_len,
|
|
||||||
logprobs=logprobs,
|
|
||||||
)
|
|
||||||
profile_output = await request_func(request_func_input=profile_input)
|
|
||||||
if profile_output.success:
|
|
||||||
print("Profiler stopped")
|
|
||||||
|
|
||||||
if pbar is not None:
|
if pbar is not None:
|
||||||
pbar.close()
|
pbar.close()
|
||||||
|
|
||||||
@ -520,6 +413,10 @@ async def benchmark(
|
|||||||
|
|
||||||
print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
|
print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
|
||||||
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
|
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
|
||||||
|
if max_concurrency is not None:
|
||||||
|
print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
|
||||||
|
if request_rate != float("inf"):
|
||||||
|
print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate))
|
||||||
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
|
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
|
||||||
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
|
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
|
||||||
print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
|
print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
|
||||||
@ -611,6 +508,20 @@ async def benchmark(
|
|||||||
|
|
||||||
print("=" * 50)
|
print("=" * 50)
|
||||||
|
|
||||||
|
if profile:
|
||||||
|
print("Stopping profiler...")
|
||||||
|
profile_input = RequestFuncInput(
|
||||||
|
model=model_id,
|
||||||
|
prompt=test_prompt,
|
||||||
|
api_url=base_url + "/stop_profile",
|
||||||
|
prompt_len=test_prompt_len,
|
||||||
|
output_len=test_output_len,
|
||||||
|
logprobs=logprobs,
|
||||||
|
)
|
||||||
|
profile_output = await request_func(request_func_input=profile_input)
|
||||||
|
if profile_output.success:
|
||||||
|
print("Profiler stopped")
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
@ -687,6 +598,10 @@ def save_to_pytorch_benchmark_format(
|
|||||||
write_to_json(pt_file, pt_records)
|
write_to_json(pt_file, pt_records)
|
||||||
|
|
||||||
|
|
||||||
|
@deprecated(
|
||||||
|
"benchmark_serving.py is deprecated and will be removed in a future "
|
||||||
|
"version. Please use 'vllm bench serve' instead.",
|
||||||
|
)
|
||||||
def main(args: argparse.Namespace):
|
def main(args: argparse.Namespace):
|
||||||
print(args)
|
print(args)
|
||||||
random.seed(args.seed)
|
random.seed(args.seed)
|
||||||
|
|||||||
@ -538,20 +538,6 @@ async def benchmark(
|
|||||||
)
|
)
|
||||||
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
|
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
|
||||||
|
|
||||||
if profile:
|
|
||||||
print("Stopping profiler...")
|
|
||||||
profile_input = RequestFuncInput(
|
|
||||||
model=model_id,
|
|
||||||
prompt=test_request.prompt,
|
|
||||||
api_url=base_url + "/stop_profile",
|
|
||||||
prompt_len=test_request.prompt_len,
|
|
||||||
output_len=test_request.expected_output_len,
|
|
||||||
extra_body={test_request.structure_type: test_request.schema},
|
|
||||||
)
|
|
||||||
profile_output = await request_func(request_func_input=profile_input)
|
|
||||||
if profile_output.success:
|
|
||||||
print("Profiler stopped")
|
|
||||||
|
|
||||||
if pbar is not None:
|
if pbar is not None:
|
||||||
pbar.close()
|
pbar.close()
|
||||||
|
|
||||||
@ -569,6 +555,10 @@ async def benchmark(
|
|||||||
|
|
||||||
print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
|
print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
|
||||||
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
|
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
|
||||||
|
if max_concurrency is not None:
|
||||||
|
print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
|
||||||
|
if request_rate != float("inf"):
|
||||||
|
print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate))
|
||||||
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
|
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
|
||||||
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
|
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
|
||||||
print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
|
print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
|
||||||
@ -666,6 +656,20 @@ async def benchmark(
|
|||||||
|
|
||||||
print("=" * 50)
|
print("=" * 50)
|
||||||
|
|
||||||
|
if profile:
|
||||||
|
print("Stopping profiler...")
|
||||||
|
profile_input = RequestFuncInput(
|
||||||
|
model=model_id,
|
||||||
|
prompt=test_request.prompt,
|
||||||
|
api_url=base_url + "/stop_profile",
|
||||||
|
prompt_len=test_request.prompt_len,
|
||||||
|
output_len=test_request.expected_output_len,
|
||||||
|
extra_body={test_request.structure_type: test_request.schema},
|
||||||
|
)
|
||||||
|
profile_output = await request_func(request_func_input=profile_input)
|
||||||
|
if profile_output.success:
|
||||||
|
print("Profiler stopped")
|
||||||
|
|
||||||
return result, ret
|
return result, ret
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -15,6 +15,7 @@ import torch
|
|||||||
import uvloop
|
import uvloop
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
|
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
|
||||||
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
from benchmark_dataset import (
|
from benchmark_dataset import (
|
||||||
AIMODataset,
|
AIMODataset,
|
||||||
@ -167,7 +168,8 @@ async def run_vllm_async(
|
|||||||
from vllm import SamplingParams
|
from vllm import SamplingParams
|
||||||
|
|
||||||
async with build_async_engine_client_from_engine_args(
|
async with build_async_engine_client_from_engine_args(
|
||||||
engine_args, disable_frontend_multiprocessing
|
engine_args,
|
||||||
|
disable_frontend_multiprocessing=disable_frontend_multiprocessing,
|
||||||
) as llm:
|
) as llm:
|
||||||
model_config = await llm.get_model_config()
|
model_config = await llm.get_model_config()
|
||||||
assert all(
|
assert all(
|
||||||
@ -381,6 +383,10 @@ def get_requests(args, tokenizer):
|
|||||||
return dataset_cls(**common_kwargs).sample(**sample_kwargs)
|
return dataset_cls(**common_kwargs).sample(**sample_kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
@deprecated(
|
||||||
|
"benchmark_throughput.py is deprecated and will be removed in a "
|
||||||
|
"future version. Please use 'vllm bench throughput' instead.",
|
||||||
|
)
|
||||||
def main(args: argparse.Namespace):
|
def main(args: argparse.Namespace):
|
||||||
if args.seed is None:
|
if args.seed is None:
|
||||||
args.seed = 0
|
args.seed = 0
|
||||||
|
|||||||
@ -3,7 +3,7 @@
|
|||||||
# benchmark the overhead of disaggregated prefill.
|
# benchmark the overhead of disaggregated prefill.
|
||||||
# methodology:
|
# methodology:
|
||||||
# - send all requests to the prefill vLLM instance. It will buffer the KV cache.
|
# - send all requests to the prefill vLLM instance. It will buffer the KV cache.
|
||||||
# - then send all requests to the decode instance.
|
# - then send all requests to the decode instance.
|
||||||
# - The TTFT of decode instance is the overhead.
|
# - The TTFT of decode instance is the overhead.
|
||||||
|
|
||||||
set -ex
|
set -ex
|
||||||
@ -12,6 +12,8 @@ kill_gpu_processes() {
|
|||||||
# kill all processes on GPU.
|
# kill all processes on GPU.
|
||||||
pgrep pt_main_thread | xargs -r kill -9
|
pgrep pt_main_thread | xargs -r kill -9
|
||||||
pgrep python3 | xargs -r kill -9
|
pgrep python3 | xargs -r kill -9
|
||||||
|
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
|
||||||
|
pgrep VLLM | xargs -r kill -9
|
||||||
sleep 10
|
sleep 10
|
||||||
|
|
||||||
# remove vllm config file
|
# remove vllm config file
|
||||||
@ -61,7 +63,7 @@ benchmark() {
|
|||||||
--gpu-memory-utilization 0.6 \
|
--gpu-memory-utilization 0.6 \
|
||||||
--kv-transfer-config \
|
--kv-transfer-config \
|
||||||
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
||||||
|
|
||||||
|
|
||||||
CUDA_VISIBLE_DEVICES=1 python3 \
|
CUDA_VISIBLE_DEVICES=1 python3 \
|
||||||
-m vllm.entrypoints.openai.api_server \
|
-m vllm.entrypoints.openai.api_server \
|
||||||
@ -76,38 +78,38 @@ benchmark() {
|
|||||||
wait_for_server 8200
|
wait_for_server 8200
|
||||||
|
|
||||||
# let the prefill instance finish prefill
|
# let the prefill instance finish prefill
|
||||||
python3 ../benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model $model \
|
--model $model \
|
||||||
--dataset-name $dataset_name \
|
--dataset-name $dataset_name \
|
||||||
--dataset-path $dataset_path \
|
--dataset-path $dataset_path \
|
||||||
--sonnet-input-len $input_len \
|
--sonnet-input-len $input_len \
|
||||||
--sonnet-output-len "$output_len" \
|
--sonnet-output-len "$output_len" \
|
||||||
--sonnet-prefix-len $prefix_len \
|
--sonnet-prefix-len $prefix_len \
|
||||||
--num-prompts $num_prompts \
|
--num-prompts $num_prompts \
|
||||||
--port 8100 \
|
--port 8100 \
|
||||||
--save-result \
|
--save-result \
|
||||||
--result-dir $results_folder \
|
--result-dir $results_folder \
|
||||||
--result-filename disagg_prefill_tp1.json \
|
--result-filename disagg_prefill_tp1.json \
|
||||||
--request-rate "inf"
|
--request-rate "inf"
|
||||||
|
|
||||||
|
|
||||||
# send the request to decode.
|
# send the request to decode.
|
||||||
# The TTFT of this command will be the overhead of disagg prefill impl.
|
# The TTFT of this command will be the overhead of disagg prefill impl.
|
||||||
python3 ../benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model $model \
|
--model $model \
|
||||||
--dataset-name $dataset_name \
|
--dataset-name $dataset_name \
|
||||||
--dataset-path $dataset_path \
|
--dataset-path $dataset_path \
|
||||||
--sonnet-input-len $input_len \
|
--sonnet-input-len $input_len \
|
||||||
--sonnet-output-len "$output_len" \
|
--sonnet-output-len "$output_len" \
|
||||||
--sonnet-prefix-len $prefix_len \
|
--sonnet-prefix-len $prefix_len \
|
||||||
--num-prompts $num_prompts \
|
--num-prompts $num_prompts \
|
||||||
--port 8200 \
|
--port 8200 \
|
||||||
--save-result \
|
--save-result \
|
||||||
--result-dir $results_folder \
|
--result-dir $results_folder \
|
||||||
--result-filename disagg_prefill_tp1_overhead.json \
|
--result-filename disagg_prefill_tp1_overhead.json \
|
||||||
--request-rate "$qps"
|
--request-rate "$qps"
|
||||||
kill_gpu_processes
|
kill_gpu_processes
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -18,6 +18,8 @@ kill_gpu_processes() {
|
|||||||
# kill all processes on GPU.
|
# kill all processes on GPU.
|
||||||
pgrep pt_main_thread | xargs -r kill -9
|
pgrep pt_main_thread | xargs -r kill -9
|
||||||
pgrep python3 | xargs -r kill -9
|
pgrep python3 | xargs -r kill -9
|
||||||
|
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
|
||||||
|
pgrep VLLM | xargs -r kill -9
|
||||||
for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
|
for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
|
||||||
sleep 1
|
sleep 1
|
||||||
}
|
}
|
||||||
@ -58,7 +60,7 @@ launch_chunked_prefill() {
|
|||||||
|
|
||||||
|
|
||||||
launch_disagg_prefill() {
|
launch_disagg_prefill() {
|
||||||
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
|
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||||
# disagg prefill
|
# disagg prefill
|
||||||
CUDA_VISIBLE_DEVICES=0 python3 \
|
CUDA_VISIBLE_DEVICES=0 python3 \
|
||||||
-m vllm.entrypoints.openai.api_server \
|
-m vllm.entrypoints.openai.api_server \
|
||||||
@ -97,20 +99,20 @@ benchmark() {
|
|||||||
output_len=$2
|
output_len=$2
|
||||||
tag=$3
|
tag=$3
|
||||||
|
|
||||||
python3 ../benchmark_serving.py \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model $model \
|
--model $model \
|
||||||
--dataset-name $dataset_name \
|
--dataset-name $dataset_name \
|
||||||
--dataset-path $dataset_path \
|
--dataset-path $dataset_path \
|
||||||
--sonnet-input-len $input_len \
|
--sonnet-input-len $input_len \
|
||||||
--sonnet-output-len "$output_len" \
|
--sonnet-output-len "$output_len" \
|
||||||
--sonnet-prefix-len $prefix_len \
|
--sonnet-prefix-len $prefix_len \
|
||||||
--num-prompts $num_prompts \
|
--num-prompts $num_prompts \
|
||||||
--port 8000 \
|
--port 8000 \
|
||||||
--save-result \
|
--save-result \
|
||||||
--result-dir $results_folder \
|
--result-dir $results_folder \
|
||||||
--result-filename "$tag"-qps-"$qps".json \
|
--result-filename "$tag"-qps-"$qps".json \
|
||||||
--request-rate "$qps"
|
--request-rate "$qps"
|
||||||
|
|
||||||
sleep 2
|
sleep 2
|
||||||
}
|
}
|
||||||
|
|||||||
@ -80,11 +80,6 @@ def bench_run(
|
|||||||
a, score, topk, renormalize=False
|
a, score, topk, renormalize=False
|
||||||
)
|
)
|
||||||
|
|
||||||
ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
|
|
||||||
ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
|
|
||||||
c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
|
|
||||||
c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
|
|
||||||
|
|
||||||
def run_triton_moe(
|
def run_triton_moe(
|
||||||
a: torch.Tensor,
|
a: torch.Tensor,
|
||||||
w1: torch.Tensor,
|
w1: torch.Tensor,
|
||||||
@ -116,10 +111,6 @@ def bench_run(
|
|||||||
w2: torch.Tensor,
|
w2: torch.Tensor,
|
||||||
w1_scale: torch.Tensor,
|
w1_scale: torch.Tensor,
|
||||||
w2_scale: torch.Tensor,
|
w2_scale: torch.Tensor,
|
||||||
ab_strides1: torch.Tensor,
|
|
||||||
ab_strides2: torch.Tensor,
|
|
||||||
c_strides1: torch.Tensor,
|
|
||||||
c_strides2: torch.Tensor,
|
|
||||||
topk_weights: torch.Tensor,
|
topk_weights: torch.Tensor,
|
||||||
topk_ids: torch.Tensor,
|
topk_ids: torch.Tensor,
|
||||||
per_act_token: bool,
|
per_act_token: bool,
|
||||||
@ -134,10 +125,6 @@ def bench_run(
|
|||||||
topk_ids,
|
topk_ids,
|
||||||
w1_scale,
|
w1_scale,
|
||||||
w2_scale,
|
w2_scale,
|
||||||
ab_strides1,
|
|
||||||
ab_strides2,
|
|
||||||
c_strides1,
|
|
||||||
c_strides2,
|
|
||||||
per_act_token,
|
per_act_token,
|
||||||
a1_scale=None,
|
a1_scale=None,
|
||||||
)
|
)
|
||||||
@ -149,10 +136,6 @@ def bench_run(
|
|||||||
w2_q: torch.Tensor,
|
w2_q: torch.Tensor,
|
||||||
w1_scale: torch.Tensor,
|
w1_scale: torch.Tensor,
|
||||||
w2_scale: torch.Tensor,
|
w2_scale: torch.Tensor,
|
||||||
ab_strides1: torch.Tensor,
|
|
||||||
ab_strides2: torch.Tensor,
|
|
||||||
c_strides1: torch.Tensor,
|
|
||||||
c_strides2: torch.Tensor,
|
|
||||||
topk_weights: torch.Tensor,
|
topk_weights: torch.Tensor,
|
||||||
topk_ids: torch.Tensor,
|
topk_ids: torch.Tensor,
|
||||||
):
|
):
|
||||||
@ -167,10 +150,6 @@ def bench_run(
|
|||||||
topk_ids,
|
topk_ids,
|
||||||
w1_scale,
|
w1_scale,
|
||||||
w2_scale,
|
w2_scale,
|
||||||
ab_strides1,
|
|
||||||
ab_strides2,
|
|
||||||
c_strides1,
|
|
||||||
c_strides2,
|
|
||||||
per_act_token,
|
per_act_token,
|
||||||
a1_scale=None,
|
a1_scale=None,
|
||||||
)
|
)
|
||||||
@ -215,10 +194,6 @@ def bench_run(
|
|||||||
w2_q,
|
w2_q,
|
||||||
w1_scale,
|
w1_scale,
|
||||||
w2_scale,
|
w2_scale,
|
||||||
ab_strides1,
|
|
||||||
ab_strides2,
|
|
||||||
c_strides1,
|
|
||||||
c_strides2,
|
|
||||||
topk_weights,
|
topk_weights,
|
||||||
topk_ids,
|
topk_ids,
|
||||||
)
|
)
|
||||||
@ -256,10 +231,6 @@ def bench_run(
|
|||||||
"w1_scale": w1_scale,
|
"w1_scale": w1_scale,
|
||||||
"w2_scale": w2_scale,
|
"w2_scale": w2_scale,
|
||||||
"per_act_token": per_act_token,
|
"per_act_token": per_act_token,
|
||||||
"ab_strides1": ab_strides1,
|
|
||||||
"ab_strides2": ab_strides2,
|
|
||||||
"c_strides1": c_strides1,
|
|
||||||
"c_strides2": c_strides2,
|
|
||||||
# cuda graph params
|
# cuda graph params
|
||||||
"cutlass_graph": cutlass_graph,
|
"cutlass_graph": cutlass_graph,
|
||||||
"triton_graph": triton_graph,
|
"triton_graph": triton_graph,
|
||||||
@ -318,10 +289,6 @@ def bench_run(
|
|||||||
w2_q,
|
w2_q,
|
||||||
w1_scale,
|
w1_scale,
|
||||||
w2_scale,
|
w2_scale,
|
||||||
ab_strides1,
|
|
||||||
ab_strides2,
|
|
||||||
c_strides1,
|
|
||||||
c_strides2,
|
|
||||||
topk_weights,
|
topk_weights,
|
||||||
topk_ids,
|
topk_ids,
|
||||||
per_act_token,
|
per_act_token,
|
||||||
@ -330,7 +297,7 @@ def bench_run(
|
|||||||
|
|
||||||
results.append(
|
results.append(
|
||||||
benchmark.Timer(
|
benchmark.Timer(
|
||||||
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501
|
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501
|
||||||
globals=globals,
|
globals=globals,
|
||||||
label=label,
|
label=label,
|
||||||
sub_label=sub_label,
|
sub_label=sub_label,
|
||||||
|
|||||||
@ -576,7 +576,11 @@ def main(args: argparse.Namespace):
|
|||||||
topk = config.num_experts_per_tok
|
topk = config.num_experts_per_tok
|
||||||
intermediate_size = config.intermediate_size
|
intermediate_size = config.intermediate_size
|
||||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||||
elif config.architectures[0] in ("DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"):
|
elif config.architectures[0] in (
|
||||||
|
"DeepseekV3ForCausalLM",
|
||||||
|
"DeepseekV2ForCausalLM",
|
||||||
|
"Glm4MoeForCausalLM",
|
||||||
|
):
|
||||||
E = config.n_routed_experts
|
E = config.n_routed_experts
|
||||||
topk = config.num_experts_per_tok
|
topk = config.num_experts_per_tok
|
||||||
intermediate_size = config.moe_intermediate_size
|
intermediate_size = config.moe_intermediate_size
|
||||||
|
|||||||
@ -5,9 +5,8 @@ import itertools
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
|
||||||
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
|
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
|
||||||
moe_align_block_size_triton,
|
moe_align_block_size,
|
||||||
)
|
)
|
||||||
from vllm.triton_utils import triton
|
from vllm.triton_utils import triton
|
||||||
|
|
||||||
@ -21,62 +20,6 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8):
|
|
||||||
"""
|
|
||||||
Verifies vllm vs. Triton
|
|
||||||
"""
|
|
||||||
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
|
|
||||||
|
|
||||||
# 1. malloc space for triton and vllm
|
|
||||||
# malloc enough space (max_num_tokens_padded) for the sorted ids
|
|
||||||
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
|
|
||||||
sorted_ids_triton = torch.empty(
|
|
||||||
(max_num_tokens_padded,), dtype=torch.int32, device="cuda"
|
|
||||||
)
|
|
||||||
sorted_ids_triton.fill_(topk_ids.numel()) # fill with sentinel value
|
|
||||||
expert_ids_triton = torch.zeros(
|
|
||||||
(max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda"
|
|
||||||
)
|
|
||||||
num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda")
|
|
||||||
|
|
||||||
sorted_ids_vllm = torch.empty_like(sorted_ids_triton)
|
|
||||||
sorted_ids_vllm.fill_(topk_ids.numel())
|
|
||||||
expert_ids_vllm = torch.zeros_like(expert_ids_triton)
|
|
||||||
num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton)
|
|
||||||
|
|
||||||
# 2. run implementations
|
|
||||||
moe_align_block_size_triton(
|
|
||||||
topk_ids,
|
|
||||||
num_experts,
|
|
||||||
block_size,
|
|
||||||
sorted_ids_triton,
|
|
||||||
expert_ids_triton,
|
|
||||||
num_tokens_post_pad_triton,
|
|
||||||
)
|
|
||||||
|
|
||||||
ops.moe_align_block_size(
|
|
||||||
topk_ids,
|
|
||||||
num_experts,
|
|
||||||
block_size,
|
|
||||||
sorted_ids_vllm,
|
|
||||||
expert_ids_vllm,
|
|
||||||
num_tokens_post_pad_vllm,
|
|
||||||
)
|
|
||||||
print(f"✅ VLLM implementation works with {num_experts} experts!")
|
|
||||||
|
|
||||||
# 3. compare results
|
|
||||||
if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose(
|
|
||||||
num_tokens_post_pad_triton, num_tokens_post_pad_vllm
|
|
||||||
):
|
|
||||||
print("✅ Triton and VLLM implementations match.")
|
|
||||||
else:
|
|
||||||
print("❌ Triton and VLLM implementations DO NOT match.")
|
|
||||||
print("Triton expert_ids:", expert_ids_triton)
|
|
||||||
print("VLLM expert_ids:", expert_ids_vllm)
|
|
||||||
print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
|
|
||||||
print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)
|
|
||||||
|
|
||||||
|
|
||||||
# test configurations
|
# test configurations
|
||||||
num_tokens_range = [1, 16, 256, 4096]
|
num_tokens_range = [1, 16, 256, 4096]
|
||||||
num_experts_range = [16, 64, 224, 256, 280, 512]
|
num_experts_range = [16, 64, 224, 256, 280, 512]
|
||||||
@ -89,8 +32,8 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range
|
|||||||
x_names=["num_tokens", "num_experts", "topk"],
|
x_names=["num_tokens", "num_experts", "topk"],
|
||||||
x_vals=configs,
|
x_vals=configs,
|
||||||
line_arg="provider",
|
line_arg="provider",
|
||||||
line_vals=["vllm", "triton"], # "triton"
|
line_vals=["vllm"],
|
||||||
line_names=["VLLM", "Triton"], # "Triton"
|
line_names=["vLLM"],
|
||||||
plot_name="moe-align-block-size-performance",
|
plot_name="moe-align-block-size-performance",
|
||||||
args={},
|
args={},
|
||||||
)
|
)
|
||||||
@ -100,37 +43,11 @@ def benchmark(num_tokens, num_experts, topk, provider):
|
|||||||
block_size = 256
|
block_size = 256
|
||||||
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
|
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
|
||||||
|
|
||||||
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
|
|
||||||
sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
|
|
||||||
sorted_ids.fill_(topk_ids.numel())
|
|
||||||
max_num_m_blocks = max_num_tokens_padded // block_size
|
|
||||||
expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
|
|
||||||
num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")
|
|
||||||
|
|
||||||
quantiles = [0.5, 0.2, 0.8]
|
quantiles = [0.5, 0.2, 0.8]
|
||||||
|
|
||||||
if provider == "vllm":
|
if provider == "vllm":
|
||||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||||
lambda: ops.moe_align_block_size(
|
lambda: moe_align_block_size(topk_ids, block_size, num_experts),
|
||||||
topk_ids,
|
|
||||||
num_experts,
|
|
||||||
block_size,
|
|
||||||
sorted_ids.clone(),
|
|
||||||
expert_ids.clone(),
|
|
||||||
num_tokens_post_pad.clone(),
|
|
||||||
),
|
|
||||||
quantiles=quantiles,
|
|
||||||
)
|
|
||||||
elif provider == "triton":
|
|
||||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
|
||||||
lambda: moe_align_block_size_triton(
|
|
||||||
topk_ids,
|
|
||||||
num_experts,
|
|
||||||
block_size,
|
|
||||||
sorted_ids.clone(),
|
|
||||||
expert_ids.clone(),
|
|
||||||
num_tokens_post_pad.clone(),
|
|
||||||
),
|
|
||||||
quantiles=quantiles,
|
quantiles=quantiles,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -154,6 +71,4 @@ if __name__ == "__main__":
|
|||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
print("Running correctness check...")
|
|
||||||
check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk)
|
|
||||||
benchmark.run(print_data=True, show_plots=True)
|
benchmark.run(print_data=True, show_plots=True)
|
||||||
|
|||||||
@ -8,12 +8,13 @@ import ray
|
|||||||
import torch
|
import torch
|
||||||
from transformers import AutoConfig
|
from transformers import AutoConfig
|
||||||
|
|
||||||
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
|
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||||
|
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
|
||||||
_moe_permute,
|
_moe_permute,
|
||||||
_moe_unpermute_and_reduce,
|
_moe_unpermute_and_reduce,
|
||||||
|
moe_permute,
|
||||||
|
moe_unpermute,
|
||||||
)
|
)
|
||||||
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
|
||||||
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import *
|
|
||||||
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
|
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
@ -63,18 +64,19 @@ def benchmark_permute(
|
|||||||
|
|
||||||
def run():
|
def run():
|
||||||
if use_customized_permute:
|
if use_customized_permute:
|
||||||
(permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = (
|
(
|
||||||
moe_permute(
|
permuted_hidden_states,
|
||||||
qhidden_states,
|
a1q_scale,
|
||||||
topk_weights=topk_weights,
|
first_token_off,
|
||||||
topk_ids=topk_ids,
|
inv_perm_idx,
|
||||||
token_expert_indices=token_expert_indices,
|
m_indices,
|
||||||
topk=topk,
|
) = moe_permute(
|
||||||
n_expert=num_experts,
|
qhidden_states,
|
||||||
n_local_expert=num_experts,
|
a1q_scale=None,
|
||||||
expert_map=None,
|
topk_ids=topk_ids,
|
||||||
align_block_size=align_block_size,
|
n_expert=num_experts,
|
||||||
)
|
expert_map=None,
|
||||||
|
align_block_size=align_block_size,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
(
|
(
|
||||||
@ -150,18 +152,19 @@ def benchmark_unpermute(
|
|||||||
|
|
||||||
def prepare():
|
def prepare():
|
||||||
if use_customized_permute:
|
if use_customized_permute:
|
||||||
(permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = (
|
(
|
||||||
moe_permute(
|
permuted_hidden_states,
|
||||||
qhidden_states,
|
a1q_scale,
|
||||||
topk_weights=topk_weights,
|
first_token_off,
|
||||||
topk_ids=topk_ids,
|
inv_perm_idx,
|
||||||
token_expert_indices=token_expert_indices,
|
m_indices,
|
||||||
topk=topk,
|
) = moe_permute(
|
||||||
n_expert=num_experts,
|
qhidden_states,
|
||||||
n_local_expert=num_experts,
|
a1q_scale=None,
|
||||||
expert_map=None,
|
topk_ids=topk_ids,
|
||||||
align_block_size=align_block_size,
|
n_expert=num_experts,
|
||||||
)
|
expert_map=None,
|
||||||
|
align_block_size=align_block_size,
|
||||||
)
|
)
|
||||||
# convert to fp16/bf16 as gemm output
|
# convert to fp16/bf16 as gemm output
|
||||||
return (
|
return (
|
||||||
@ -191,16 +194,19 @@ def benchmark_unpermute(
|
|||||||
|
|
||||||
def run(input: tuple):
|
def run(input: tuple):
|
||||||
if use_customized_permute:
|
if use_customized_permute:
|
||||||
(permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = input
|
(
|
||||||
|
permuted_hidden_states,
|
||||||
|
first_token_off,
|
||||||
|
inv_perm_idx,
|
||||||
|
m_indices,
|
||||||
|
) = input
|
||||||
|
output = torch.empty_like(hidden_states)
|
||||||
moe_unpermute(
|
moe_unpermute(
|
||||||
|
output,
|
||||||
permuted_hidden_states,
|
permuted_hidden_states,
|
||||||
topk_weights,
|
topk_weights,
|
||||||
topk_ids,
|
|
||||||
inv_perm_idx,
|
inv_perm_idx,
|
||||||
first_token_off,
|
first_token_off,
|
||||||
topk,
|
|
||||||
num_experts,
|
|
||||||
num_experts,
|
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
(
|
(
|
||||||
@ -211,7 +217,11 @@ def benchmark_unpermute(
|
|||||||
inv_perm,
|
inv_perm,
|
||||||
) = input
|
) = input
|
||||||
_moe_unpermute_and_reduce(
|
_moe_unpermute_and_reduce(
|
||||||
output_hidden_states, permuted_hidden_states, inv_perm, topk_weights
|
output_hidden_states,
|
||||||
|
permuted_hidden_states,
|
||||||
|
inv_perm,
|
||||||
|
topk_weights,
|
||||||
|
True,
|
||||||
)
|
)
|
||||||
|
|
||||||
# JIT compilation & warmup
|
# JIT compilation & warmup
|
||||||
@ -318,6 +328,7 @@ def main(args: argparse.Namespace):
|
|||||||
elif (
|
elif (
|
||||||
config.architectures[0] == "DeepseekV3ForCausalLM"
|
config.architectures[0] == "DeepseekV3ForCausalLM"
|
||||||
or config.architectures[0] == "DeepseekV2ForCausalLM"
|
or config.architectures[0] == "DeepseekV2ForCausalLM"
|
||||||
|
or config.architectures[0] == "Glm4MoeForCausalLM"
|
||||||
):
|
):
|
||||||
E = config.n_routed_experts
|
E = config.n_routed_experts
|
||||||
topk = config.num_experts_per_tok
|
topk = config.num_experts_per_tok
|
||||||
|
|||||||
159
benchmarks/kernels/benchmark_per_token_group_quant.py
Normal file
@ -0,0 +1,159 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import math
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from typing import Callable
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from vllm.model_executor.layers.quantization.utils import fp8_utils, int8_utils
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def _triton_mode():
|
||||||
|
"""Temporarily force the Triton fallback path"""
|
||||||
|
with patch("vllm.platforms.current_platform.is_cuda", return_value=False):
|
||||||
|
yield
|
||||||
|
|
||||||
|
|
||||||
|
def _time_cuda(
|
||||||
|
fn: Callable[[], tuple[torch.Tensor, torch.Tensor]],
|
||||||
|
warmup_iters: int,
|
||||||
|
bench_iters: int,
|
||||||
|
) -> float:
|
||||||
|
# warmup
|
||||||
|
for _ in range(warmup_iters):
|
||||||
|
fn()
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
|
start = torch.cuda.Event(enable_timing=True)
|
||||||
|
end = torch.cuda.Event(enable_timing=True)
|
||||||
|
|
||||||
|
start.record()
|
||||||
|
for _ in range(bench_iters):
|
||||||
|
fn()
|
||||||
|
end.record()
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
|
return start.elapsed_time(end) / bench_iters # ms/iter
|
||||||
|
|
||||||
|
|
||||||
|
def _run_single(
|
||||||
|
shape: tuple[int, int],
|
||||||
|
group_size: int,
|
||||||
|
dtype: str,
|
||||||
|
*,
|
||||||
|
column_major: bool = False,
|
||||||
|
scale_ue8m0: bool = False,
|
||||||
|
warmup_iters: int,
|
||||||
|
bench_iters: int,
|
||||||
|
) -> None:
|
||||||
|
num_tokens, hidden_dim = shape
|
||||||
|
|
||||||
|
device = torch.device("cuda")
|
||||||
|
torch.manual_seed(42)
|
||||||
|
x = torch.randn(num_tokens, hidden_dim, device=device, dtype=torch.bfloat16) * 8
|
||||||
|
|
||||||
|
if dtype == "fp8":
|
||||||
|
|
||||||
|
def cuda_impl():
|
||||||
|
return fp8_utils.per_token_group_quant_fp8(
|
||||||
|
x,
|
||||||
|
group_size,
|
||||||
|
column_major_scales=column_major,
|
||||||
|
use_ue8m0=scale_ue8m0,
|
||||||
|
)
|
||||||
|
|
||||||
|
def triton_impl():
|
||||||
|
with _triton_mode():
|
||||||
|
return fp8_utils.per_token_group_quant_fp8(
|
||||||
|
x,
|
||||||
|
group_size,
|
||||||
|
column_major_scales=column_major,
|
||||||
|
use_ue8m0=scale_ue8m0,
|
||||||
|
)
|
||||||
|
elif dtype == "int8":
|
||||||
|
|
||||||
|
def cuda_impl():
|
||||||
|
return int8_utils.per_token_group_quant_int8(x, group_size)
|
||||||
|
|
||||||
|
def triton_impl():
|
||||||
|
with _triton_mode():
|
||||||
|
return int8_utils.per_token_group_quant_int8(x, group_size)
|
||||||
|
else:
|
||||||
|
raise ValueError("dtype must be 'fp8' or 'int8'")
|
||||||
|
|
||||||
|
cuda_ms = _time_cuda(cuda_impl, warmup_iters, bench_iters)
|
||||||
|
triton_ms = _time_cuda(triton_impl, warmup_iters, bench_iters)
|
||||||
|
|
||||||
|
speedup = triton_ms / cuda_ms if cuda_ms else math.inf
|
||||||
|
|
||||||
|
cfg_desc = (
|
||||||
|
f"shape={shape} gs={group_size:<3} col_major={column_major:<5} "
|
||||||
|
f"ue8m0={scale_ue8m0:<5} dtype={dtype}"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f"{cfg_desc:55} | CUDA {cuda_ms:7.3f} ms | Triton {triton_ms:7.3f} ms | "
|
||||||
|
f"speed-up ×{speedup:5.2f}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args(argv=None):
    """Parse the benchmark's command-line options.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            ``argparse`` reads ``sys.argv[1:]``.

    Returns:
        The parsed ``argparse.Namespace`` with ``warmup_iters``,
        ``bench_iters`` and ``dtype`` attributes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--warmup-iters", type=int, default=10)
    parser.add_argument("--bench-iters", type=int, default=100)
    parser.add_argument("--dtype", choices=["fp8", "int8", "both"], default="both")
    return parser.parse_args(argv)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    if not current_platform.is_cuda():
        raise RuntimeError("CUDA device is required to run this benchmark.")

    args = parse_args()
    warmup_iters, bench_iters = args.warmup_iters, args.bench_iters

    # Fixed sweep of input shapes and group sizes.
    shapes = [(32, 128), (64, 256), (16, 512)]
    group_sizes = [64, 128]

    dtypes = ["fp8", "int8"] if args.dtype == "both" else [args.dtype]

    header = (
        "Configuration".ljust(55)
        + " | "
        + "CUDA (ms)".center(12)
        + " | "
        + "Triton (ms)".center(13)
        + " | "
        + "Speed-up"
    )
    print(header)
    print("-" * len(header))

    for dtype in dtypes:
        for shape in shapes:
            for gs in group_sizes:
                if dtype == "fp8":
                    # fp8 additionally sweeps scale layout and ue8m0 scaling.
                    for col_major in (False, True):
                        for ue8m0 in (False, True):
                            _run_single(
                                shape,
                                gs,
                                dtype,
                                column_major=col_major,
                                scale_ue8m0=ue8m0,
                                warmup_iters=warmup_iters,
                                bench_iters=bench_iters,
                            )
                else:  # INT8 has no col-major / ue8m0 switches
                    _run_single(
                        shape,
                        gs,
                        dtype,
                        warmup_iters=warmup_iters,
                        bench_iters=bench_iters,
                    )
|
||||||
@ -71,22 +71,20 @@ def benchmark_decode(
|
|||||||
if kv_cache_dtype.startswith("fp8"):
|
if kv_cache_dtype.startswith("fp8"):
|
||||||
kv_cache, _ = to_float8(kv_cache)
|
kv_cache, _ = to_float8(kv_cache)
|
||||||
|
|
||||||
|
output_trtllm = torch.empty(q.shape, dtype=dtype)
|
||||||
|
|
||||||
# Benchmark TRT decode
|
# Benchmark TRT decode
|
||||||
def trt_decode():
|
def trt_decode():
|
||||||
return flashinfer.decode.trtllm_batch_decode_with_kv_cache(
|
return flashinfer.decode.trtllm_batch_decode_with_kv_cache(
|
||||||
q,
|
q,
|
||||||
kv_cache,
|
kv_cache,
|
||||||
workspace_buffer,
|
workspace_buffer,
|
||||||
num_qo_heads,
|
|
||||||
num_kv_heads,
|
|
||||||
sm_scale,
|
|
||||||
block_tables,
|
block_tables,
|
||||||
kv_lens_tensor,
|
kv_lens_tensor,
|
||||||
page_size,
|
|
||||||
max_kv_len,
|
max_kv_len,
|
||||||
kv_cache_dtype,
|
bmm1_scale=k_scale * sm_scale,
|
||||||
k_scale,
|
bmm2_scale=v_scale,
|
||||||
v_scale,
|
out=output_trtllm,
|
||||||
)
|
)
|
||||||
|
|
||||||
def time_fn(fn, warmup=10, trials=20):
|
def time_fn(fn, warmup=10, trials=20):
|
||||||
@ -125,6 +123,8 @@ def benchmark_decode(
|
|||||||
kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
|
kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
|
||||||
kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
|
kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
|
||||||
|
|
||||||
|
output_baseline = torch.empty(q.shape, dtype=dtype)
|
||||||
|
|
||||||
wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
|
wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
|
||||||
workspace_buffer,
|
workspace_buffer,
|
||||||
kv_layout,
|
kv_layout,
|
||||||
@ -145,7 +145,7 @@ def benchmark_decode(
|
|||||||
)
|
)
|
||||||
|
|
||||||
def baseline_decode():
|
def baseline_decode():
|
||||||
return wrapper.run(q, kv_cache, sm_scale, k_scale, v_scale)
|
return wrapper.run(q, kv_cache, sm_scale, k_scale, v_scale, output_baseline)
|
||||||
|
|
||||||
baseline_mean, baseline_std = time_fn(baseline_decode)
|
baseline_mean, baseline_std = time_fn(baseline_decode)
|
||||||
|
|
||||||
@ -214,25 +214,39 @@ if __name__ == "__main__":
|
|||||||
max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
|
max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
|
||||||
all_results = []
|
all_results = []
|
||||||
|
|
||||||
print("Running benchmark for kv_cache_dtype: bfloat16")
|
|
||||||
print(
|
print(
|
||||||
"\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\tbaseline_std\tspeedup_percent"
|
"Running benchmark for q_dtype = bfloat16, kv_cache_dtype: bfloat16, "
|
||||||
|
"output_dtype: bfloat16"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
"\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t"
|
||||||
|
"baseline_std\tspeedup_percent"
|
||||||
)
|
)
|
||||||
for max_seq_len in max_seq_lens:
|
for max_seq_len in max_seq_lens:
|
||||||
for bs in num_seqs:
|
for bs in num_seqs:
|
||||||
result = benchmark_decode(
|
result = benchmark_decode(
|
||||||
bs, max_seq_len, dtype=torch.bfloat16, kv_cache_dtype="auto"
|
bs,
|
||||||
|
max_seq_len,
|
||||||
|
dtype=torch.bfloat16,
|
||||||
|
kv_cache_dtype="auto",
|
||||||
)
|
)
|
||||||
all_results.append(result)
|
all_results.append(result)
|
||||||
|
|
||||||
print("Running benchmark for q_dtype = bfloat16, kv_cache_dtype: fp8")
|
|
||||||
print(
|
print(
|
||||||
"\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\tbaseline_std\tspeedup_percent"
|
"Running benchmark for q_dtype = bfloat16, kv_cache_dtype: fp8, "
|
||||||
|
"output_dtype: bfloat16"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
"\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t"
|
||||||
|
"baseline_std\tspeedup_percent"
|
||||||
)
|
)
|
||||||
for max_seq_len in max_seq_lens:
|
for max_seq_len in max_seq_lens:
|
||||||
for bs in num_seqs:
|
for bs in num_seqs:
|
||||||
result = benchmark_decode(
|
result = benchmark_decode(
|
||||||
bs, max_seq_len, dtype=torch.bfloat16, kv_cache_dtype="fp8"
|
bs,
|
||||||
|
max_seq_len,
|
||||||
|
dtype=torch.bfloat16,
|
||||||
|
kv_cache_dtype="fp8",
|
||||||
)
|
)
|
||||||
all_results.append(result)
|
all_results.append(result)
|
||||||
|
|
||||||
|
|||||||
@ -8,7 +8,7 @@ Currently this just includes dense GEMMs and only works on Hopper GPUs.
|
|||||||
|
|
||||||
You need to install vLLM in your usual fashion, then install DeepGEMM from source in its own directory:
|
You need to install vLLM in your usual fashion, then install DeepGEMM from source in its own directory:
|
||||||
|
|
||||||
```
|
```bash
|
||||||
git clone --recursive https://github.com/deepseek-ai/DeepGEMM
|
git clone --recursive https://github.com/deepseek-ai/DeepGEMM
|
||||||
cd DeepGEMM
|
cd DeepGEMM
|
||||||
python setup.py install
|
python setup.py install
|
||||||
@ -17,7 +17,7 @@ uv pip install -e .
|
|||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
```
|
```console
|
||||||
python benchmark_fp8_block_dense_gemm.py
|
python benchmark_fp8_block_dense_gemm.py
|
||||||
INFO 02-26 21:55:13 [__init__.py:207] Automatically detected platform cuda.
|
INFO 02-26 21:55:13 [__init__.py:207] Automatically detected platform cuda.
|
||||||
===== STARTING FP8 GEMM BENCHMARK =====
|
===== STARTING FP8 GEMM BENCHMARK =====
|
||||||
|
|||||||
108
benchmarks/kv_cache/benchmark_block_pool.py
Normal file
@ -0,0 +1,108 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import gc
|
||||||
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from tabulate import tabulate
|
||||||
|
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
from vllm.v1.core.block_pool import BlockPool
|
||||||
|
|
||||||
|
|
||||||
|
class Metric:
    """Running count, sum and maximum of integer samples."""

    def __init__(self) -> None:
        self.cnt: int = 0  # number of samples recorded
        self.sum_v: int = 0  # sum of all samples
        self.max_v: Optional[int] = None  # largest sample, None until first update

    def update(self, v: int) -> None:
        """Record one sample ``v``."""
        self.cnt += 1
        self.sum_v += v
        self.max_v = v if self.max_v is None else max(self.max_v, v)

    def avg_v(self) -> float:
        """Return the mean of the recorded samples, or 0.0 if there are none."""
        if self.cnt == 0:
            # Avoid ZeroDivisionError when nothing has been recorded yet.
            return 0.0
        return self.sum_v * 1.0 / self.cnt
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
    """Benchmark ``BlockPool.get_new_blocks`` / ``free_blocks`` and print a table.

    For every value in ``args.allocate_blocks`` a fresh ``BlockPool`` with
    ``args.num_gpu_blocks`` blocks is created, and allocating then freeing
    that many blocks is timed for ``args.num_iteration`` iterations. Average
    and maximum latencies are reported in milliseconds.

    Args:
        args: Parsed options providing ``allocate_blocks``,
            ``num_gpu_blocks`` and ``num_iteration``.
    """
    ns_per_ms = 1000000  # time.monotonic_ns() reports nanoseconds

    rows = []
    for allocate_block in args.allocate_blocks:
        # Enforce a GC collect ahead to minimize the impact among runs
        gc.collect()
        block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)

        get_blocks_metric: Metric = Metric()
        free_blocks_metric: Metric = Metric()
        for _ in range(args.num_iteration):
            t1 = time.monotonic_ns()
            blocks = block_pool.get_new_blocks(allocate_block)
            t2 = time.monotonic_ns()
            block_pool.free_blocks(blocks)
            t3 = time.monotonic_ns()
            get_blocks_metric.update(t2 - t1)
            free_blocks_metric.update(t3 - t2)

        # max_v stays None when no iteration ran (e.g. num_iteration == 0).
        if get_blocks_metric.max_v is not None and free_blocks_metric.max_v is not None:
            rows.append(
                [
                    get_blocks_metric.cnt,
                    args.num_gpu_blocks,
                    allocate_block,
                    get_blocks_metric.avg_v() / ns_per_ms,
                    get_blocks_metric.max_v / float(ns_per_ms),
                    free_blocks_metric.avg_v() / ns_per_ms,
                    free_blocks_metric.max_v / float(ns_per_ms),
                ]
            )
        else:
            print(
                "No valid metrics found."
                f" {get_blocks_metric.max_v=} {free_blocks_metric.max_v=}"
            )

    print(
        tabulate(
            rows,
            headers=[
                "Iterations",
                "Total\nBlocks",
                "Allocated\nBlocks",
                "Get Blocks\nAvg (ms)",
                "Get Blocks\nMax (ms)",
                "Free Blocks\nAvg (ms)",
                "Free Blocks\nMax (ms)",
            ],
            tablefmt="grid",
            floatfmt=".6f",
        )
    )
|
||||||
|
|
||||||
|
|
||||||
|
def invoke_main() -> None:
    """Parse the command-line options and run the BlockPool benchmark."""
    parser = FlexibleArgumentParser(
        description="Benchmark the performance of BlockPool for KV Cache."
    )
    parser.add_argument("--num-gpu-blocks", type=int, default=100000)
    parser.add_argument(
        "--num-iteration",
        type=int,
        default=1000,
        help="Number of iterations to run to stabilize final data readings",
    )
    parser.add_argument(
        "--allocate-blocks",
        type=int,
        nargs="*",
        default=[10, 50, 100, 500, 1000],
        help="Number of blocks to allocate",
    )
    args = parser.parse_args()
    main(args)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    invoke_main()  # pragma: no cover
|
||||||
@ -58,6 +58,22 @@ function (find_isa CPUINFO TARGET OUT)
|
|||||||
endif()
|
endif()
|
||||||
endfunction()
|
endfunction()
|
||||||
|
|
||||||
|
|
||||||
|
# check_sysctl(<TARGET> <OUT>)
#
# Runs `sysctl -n <TARGET>` and sets <OUT> in the caller's scope to ON when
# the command succeeds and its output is "1" or a number greater than 0;
# otherwise sets <OUT> to OFF. Errors from sysctl are suppressed.
function(check_sysctl TARGET OUT)
  execute_process(COMMAND sysctl -n "${TARGET}"
                  RESULT_VARIABLE SYSCTL_RET
                  OUTPUT_VARIABLE SYSCTL_INFO
                  ERROR_QUIET
                  OUTPUT_STRIP_TRAILING_WHITESPACE)
  if(SYSCTL_RET EQUAL 0 AND
     (SYSCTL_INFO STREQUAL "1" OR SYSCTL_INFO GREATER 0))
    set(${OUT} ON PARENT_SCOPE)
  else()
    set(${OUT} OFF PARENT_SCOPE)
  endif()
endfunction()
|
||||||
|
|
||||||
|
|
||||||
function (is_avx512_disabled OUT)
|
function (is_avx512_disabled OUT)
|
||||||
set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512})
|
set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512})
|
||||||
if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true")
|
if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true")
|
||||||
@ -70,7 +86,10 @@ endfunction()
|
|||||||
is_avx512_disabled(AVX512_DISABLED)
|
is_avx512_disabled(AVX512_DISABLED)
|
||||||
|
|
||||||
if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
|
if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
|
||||||
set(APPLE_SILICON_FOUND TRUE)
|
message(STATUS "Apple Silicon Detected")
|
||||||
|
set(ENABLE_NUMA OFF)
|
||||||
|
check_sysctl(hw.optional.neon ASIMD_FOUND)
|
||||||
|
check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND)
|
||||||
else()
|
else()
|
||||||
find_isa(${CPUINFO} "avx2" AVX2_FOUND)
|
find_isa(${CPUINFO} "avx2" AVX2_FOUND)
|
||||||
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
|
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
|
||||||
@ -82,7 +101,6 @@ else()
|
|||||||
find_isa(${CPUINFO} "S390" S390_FOUND)
|
find_isa(${CPUINFO} "S390" S390_FOUND)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|
||||||
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||||
list(APPEND CXX_COMPILE_FLAGS
|
list(APPEND CXX_COMPILE_FLAGS
|
||||||
"-mavx512f"
|
"-mavx512f"
|
||||||
@ -149,9 +167,6 @@ elseif (ASIMD_FOUND)
|
|||||||
set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")
|
set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")
|
||||||
endif()
|
endif()
|
||||||
list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})
|
list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})
|
||||||
elseif(APPLE_SILICON_FOUND)
|
|
||||||
message(STATUS "Apple Silicon Detected")
|
|
||||||
set(ENABLE_NUMA OFF)
|
|
||||||
elseif (S390_FOUND)
|
elseif (S390_FOUND)
|
||||||
message(STATUS "S390 detected")
|
message(STATUS "S390 detected")
|
||||||
# Check for S390 VXE support
|
# Check for S390 VXE support
|
||||||
|
|||||||
@ -24,7 +24,7 @@
|
|||||||
|
|
||||||
#include "attention_dtypes.h"
|
#include "attention_dtypes.h"
|
||||||
#include "attention_utils.cuh"
|
#include "attention_utils.cuh"
|
||||||
#include "cuda_compat.h"
|
#include "../cuda_compat.h"
|
||||||
|
|
||||||
#ifdef USE_ROCM
|
#ifdef USE_ROCM
|
||||||
#include <hip/hip_bf16.h>
|
#include <hip/hip_bf16.h>
|
||||||
|
|||||||
@ -16,9 +16,8 @@
|
|||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "attention_kernels.cuh"
|
#include "attention_kernels.cuh"
|
||||||
#include "cuda_compat.h"
|
#include "../cuda_compat.h"
|
||||||
|
|
||||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||||
@ -75,7 +74,7 @@ void paged_attention_v1_launcher(
|
|||||||
const float* k_scale_ptr = reinterpret_cast<const float*>(k_scale.data_ptr());
|
const float* k_scale_ptr = reinterpret_cast<const float*>(k_scale.data_ptr());
|
||||||
const float* v_scale_ptr = reinterpret_cast<const float*>(v_scale.data_ptr());
|
const float* v_scale_ptr = reinterpret_cast<const float*>(v_scale.data_ptr());
|
||||||
|
|
||||||
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
|
const int NUM_WARPS = NUM_THREADS / WARP_SIZE;
|
||||||
int padded_max_seq_len =
|
int padded_max_seq_len =
|
||||||
DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
|
DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
|
||||||
int logits_size = padded_max_seq_len * sizeof(float);
|
int logits_size = padded_max_seq_len * sizeof(float);
|
||||||
|
|||||||
@ -16,9 +16,8 @@
|
|||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "attention_kernels.cuh"
|
#include "attention_kernels.cuh"
|
||||||
#include "cuda_compat.h"
|
#include "../cuda_compat.h"
|
||||||
|
|
||||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||||
@ -79,7 +78,7 @@ void paged_attention_v2_launcher(
|
|||||||
const float* k_scale_ptr = reinterpret_cast<const float*>(k_scale.data_ptr());
|
const float* k_scale_ptr = reinterpret_cast<const float*>(k_scale.data_ptr());
|
||||||
const float* v_scale_ptr = reinterpret_cast<const float*>(v_scale.data_ptr());
|
const float* v_scale_ptr = reinterpret_cast<const float*>(v_scale.data_ptr());
|
||||||
|
|
||||||
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
|
const int NUM_WARPS = NUM_THREADS / WARP_SIZE;
|
||||||
int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
|
int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
|
||||||
int logits_size = PARTITION_SIZE * sizeof(float);
|
int logits_size = PARTITION_SIZE * sizeof(float);
|
||||||
int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
|
int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
|
||||||
|
|||||||
@ -16,12 +16,14 @@ struct KernelVecType<float> {
|
|||||||
using cvt_vec_type = vec_op::FP32Vec16;
|
using cvt_vec_type = vec_op::FP32Vec16;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
|
||||||
template <>
|
template <>
|
||||||
struct KernelVecType<c10::BFloat16> {
|
struct KernelVecType<c10::BFloat16> {
|
||||||
using load_vec_type = vec_op::BF16Vec16;
|
using load_vec_type = vec_op::BF16Vec16;
|
||||||
using azp_adj_load_vec_type = vec_op::INT32Vec16;
|
using azp_adj_load_vec_type = vec_op::INT32Vec16;
|
||||||
using cvt_vec_type = vec_op::FP32Vec16;
|
using cvt_vec_type = vec_op::FP32Vec16;
|
||||||
};
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct KernelVecType<c10::Half> {
|
struct KernelVecType<c10::Half> {
|
||||||
|
|||||||
@ -7,7 +7,7 @@
|
|||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
#define MAX_SHM_RANK_NUM 8
|
#define MAX_SHM_RANK_NUM 8
|
||||||
#define PER_THREAD_SHM_BUFFER_BYTES (2 * 1024 * 1024)
|
#define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024)
|
||||||
static_assert(PER_THREAD_SHM_BUFFER_BYTES % 2 == 0);
|
static_assert(PER_THREAD_SHM_BUFFER_BYTES % 2 == 0);
|
||||||
#define PER_THREAD_SHM_BUFFER_OFFSET (PER_THREAD_SHM_BUFFER_BYTES >> 1)
|
#define PER_THREAD_SHM_BUFFER_OFFSET (PER_THREAD_SHM_BUFFER_BYTES >> 1)
|
||||||
#define MIN_THREAD_PROCESS_SIZE (256)
|
#define MIN_THREAD_PROCESS_SIZE (256)
|
||||||
@ -34,9 +34,10 @@ struct KernelVecType<c10::Half> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct ThreadSHMContext {
|
struct ThreadSHMContext {
|
||||||
volatile char _curr_thread_stamp;
|
volatile char _curr_thread_stamp[2];
|
||||||
volatile char _ready_thread_stamp;
|
volatile char _ready_thread_stamp[2];
|
||||||
char _padding1[6];
|
int local_stamp_buffer_idx;
|
||||||
|
int remote_stamp_buffer_idx;
|
||||||
int thread_id;
|
int thread_id;
|
||||||
int thread_num;
|
int thread_num;
|
||||||
int rank;
|
int rank;
|
||||||
@ -45,23 +46,28 @@ struct ThreadSHMContext {
|
|||||||
int swizzled_ranks[MAX_SHM_RANK_NUM];
|
int swizzled_ranks[MAX_SHM_RANK_NUM];
|
||||||
void* thread_shm_ptrs[MAX_SHM_RANK_NUM];
|
void* thread_shm_ptrs[MAX_SHM_RANK_NUM];
|
||||||
ThreadSHMContext* shm_contexts[MAX_SHM_RANK_NUM];
|
ThreadSHMContext* shm_contexts[MAX_SHM_RANK_NUM];
|
||||||
size_t _thread_buffer_mask;
|
size_t _thread_buffer_mask[2];
|
||||||
char _padding2[56];
|
char _padding2[40];
|
||||||
|
|
||||||
ThreadSHMContext(const int thread_id, const int thread_num, const int rank,
|
ThreadSHMContext(const int thread_id, const int thread_num, const int rank,
|
||||||
const int group_size, void* thread_shm_ptr)
|
const int group_size, void* thread_shm_ptr)
|
||||||
: _curr_thread_stamp(1),
|
: local_stamp_buffer_idx(0),
|
||||||
_ready_thread_stamp(0),
|
remote_stamp_buffer_idx(0),
|
||||||
thread_id(thread_id),
|
thread_id(thread_id),
|
||||||
thread_num(thread_num),
|
thread_num(thread_num),
|
||||||
rank(rank),
|
rank(rank),
|
||||||
group_size(group_size),
|
group_size(group_size),
|
||||||
_spinning_count(0),
|
_spinning_count(0) {
|
||||||
_thread_buffer_mask(0) {
|
|
||||||
static_assert(sizeof(ThreadSHMContext) % 64 == 0);
|
static_assert(sizeof(ThreadSHMContext) % 64 == 0);
|
||||||
TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM);
|
TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM);
|
||||||
TORCH_CHECK((size_t)this % 64 == 0);
|
TORCH_CHECK((size_t)this % 64 == 0);
|
||||||
TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0);
|
TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0);
|
||||||
|
_curr_thread_stamp[0] = 1;
|
||||||
|
_curr_thread_stamp[1] = 1;
|
||||||
|
_ready_thread_stamp[0] = 0;
|
||||||
|
_ready_thread_stamp[1] = 0;
|
||||||
|
_thread_buffer_mask[0] = 0;
|
||||||
|
_thread_buffer_mask[1] = 0;
|
||||||
for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
|
for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
|
||||||
shm_contexts[i] = nullptr;
|
shm_contexts[i] = nullptr;
|
||||||
thread_shm_ptrs[i] = nullptr;
|
thread_shm_ptrs[i] = nullptr;
|
||||||
@ -70,6 +76,11 @@ struct ThreadSHMContext {
|
|||||||
set_context(rank, this, thread_shm_ptr);
|
set_context(rank, this, thread_shm_ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void set_stamp_buffer_idx(int local, int remote) {
|
||||||
|
local_stamp_buffer_idx = local;
|
||||||
|
remote_stamp_buffer_idx = remote;
|
||||||
|
}
|
||||||
|
|
||||||
void set_context(int rank, ThreadSHMContext* ptr, void* thread_shm_ptr) {
|
void set_context(int rank, ThreadSHMContext* ptr, void* thread_shm_ptr) {
|
||||||
TORCH_CHECK(rank < MAX_SHM_RANK_NUM);
|
TORCH_CHECK(rank < MAX_SHM_RANK_NUM);
|
||||||
TORCH_CHECK(ptr);
|
TORCH_CHECK(ptr);
|
||||||
@ -84,23 +95,27 @@ struct ThreadSHMContext {
|
|||||||
T* get_thread_shm_ptr(int rank) {
|
T* get_thread_shm_ptr(int rank) {
|
||||||
return reinterpret_cast<T*>(
|
return reinterpret_cast<T*>(
|
||||||
reinterpret_cast<int8_t*>(thread_shm_ptrs[rank]) +
|
reinterpret_cast<int8_t*>(thread_shm_ptrs[rank]) +
|
||||||
(PER_THREAD_SHM_BUFFER_OFFSET & _thread_buffer_mask));
|
(PER_THREAD_SHM_BUFFER_OFFSET &
|
||||||
|
_thread_buffer_mask[local_stamp_buffer_idx]));
|
||||||
}
|
}
|
||||||
|
|
||||||
void next_buffer() { _thread_buffer_mask ^= 0xFFFFFFFFFFFFFFFF; }
|
void next_buffer() {
|
||||||
|
_thread_buffer_mask[local_stamp_buffer_idx] ^= 0xFFFFFFFFFFFFFFFF;
|
||||||
|
}
|
||||||
|
|
||||||
char get_curr_stamp() const { return _curr_thread_stamp; }
|
char get_curr_stamp(int idx) const { return _curr_thread_stamp[idx]; }
|
||||||
|
|
||||||
char get_ready_stamp() const { return _ready_thread_stamp; }
|
char get_ready_stamp(int idx) const { return _ready_thread_stamp[idx]; }
|
||||||
|
|
||||||
void next_stamp() {
|
void next_stamp() {
|
||||||
_mm_mfence();
|
_mm_mfence();
|
||||||
_curr_thread_stamp += 1;
|
_curr_thread_stamp[local_stamp_buffer_idx] += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
void commit_ready_stamp() {
|
void commit_ready_stamp() {
|
||||||
_mm_mfence();
|
_mm_mfence();
|
||||||
_ready_thread_stamp = _curr_thread_stamp;
|
_ready_thread_stamp[local_stamp_buffer_idx] =
|
||||||
|
_curr_thread_stamp[local_stamp_buffer_idx];
|
||||||
}
|
}
|
||||||
|
|
||||||
int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; }
|
int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; }
|
||||||
@ -117,10 +132,11 @@ struct ThreadSHMContext {
|
|||||||
void wait_for_one(int rank, Cond&& cond) {
|
void wait_for_one(int rank, Cond&& cond) {
|
||||||
ThreadSHMContext* rank_ctx = shm_contexts[rank];
|
ThreadSHMContext* rank_ctx = shm_contexts[rank];
|
||||||
for (;;) {
|
for (;;) {
|
||||||
char local_curr_stamp = get_curr_stamp();
|
char local_curr_stamp = get_curr_stamp(local_stamp_buffer_idx);
|
||||||
char local_ready_stamp = get_ready_stamp();
|
char local_ready_stamp = get_ready_stamp(local_stamp_buffer_idx);
|
||||||
char rank_curr_stamp = rank_ctx->get_curr_stamp();
|
char rank_curr_stamp = rank_ctx->get_curr_stamp(remote_stamp_buffer_idx);
|
||||||
char rank_ready_stamp = rank_ctx->get_ready_stamp();
|
char rank_ready_stamp =
|
||||||
|
rank_ctx->get_ready_stamp(remote_stamp_buffer_idx);
|
||||||
if (cond(local_curr_stamp, local_ready_stamp, rank_curr_stamp,
|
if (cond(local_curr_stamp, local_ready_stamp, rank_curr_stamp,
|
||||||
rank_ready_stamp)) {
|
rank_ready_stamp)) {
|
||||||
break;
|
break;
|
||||||
@ -361,6 +377,15 @@ void shm_cc_loop(ThreadSHMContext* ctx, int64_t elem_num, F&& inner_func) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void reset_threads_stamp_buffer_idx(ThreadSHMContext* ctx, int local,
|
||||||
|
int remote) {
|
||||||
|
int thread_num = ctx->thread_num;
|
||||||
|
for (int i = 0; i < thread_num; ++i) {
|
||||||
|
ThreadSHMContext* thread_ctx = ctx + i;
|
||||||
|
thread_ctx->set_stamp_buffer_idx(local, remote);
|
||||||
|
}
|
||||||
|
}
|
||||||
}; // namespace shm_cc_ops
|
}; // namespace shm_cc_ops
|
||||||
|
|
||||||
namespace shm_cc_ops {
|
namespace shm_cc_ops {
|
||||||
@ -632,6 +657,7 @@ void shm_send_tensor_list_impl(ThreadSHMContext* ctx, int64_t dst,
|
|||||||
TensorListMeta* metadata = new (metadata_tensor.data_ptr()) TensorListMeta();
|
TensorListMeta* metadata = new (metadata_tensor.data_ptr()) TensorListMeta();
|
||||||
metadata->bind_tensor_list(tensor_list_with_metadata);
|
metadata->bind_tensor_list(tensor_list_with_metadata);
|
||||||
|
|
||||||
|
shm_cc_ops::reset_threads_stamp_buffer_idx(ctx, 0, 1);
|
||||||
shm_cc_ops::shm_cc_loop<int8_t>(
|
shm_cc_ops::shm_cc_loop<int8_t>(
|
||||||
ctx, metadata->total_bytes,
|
ctx, metadata->total_bytes,
|
||||||
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
|
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
|
||||||
@ -659,6 +685,7 @@ std::vector<torch::Tensor> shm_recv_tensor_list_impl(ThreadSHMContext* ctx,
|
|||||||
torch::Tensor metadata_tensor =
|
torch::Tensor metadata_tensor =
|
||||||
torch::empty({sizeof(TensorListMeta)}, options);
|
torch::empty({sizeof(TensorListMeta)}, options);
|
||||||
|
|
||||||
|
shm_cc_ops::reset_threads_stamp_buffer_idx(ctx, 1, 0);
|
||||||
ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready);
|
ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready);
|
||||||
shm_cc_ops::memcpy(metadata_tensor.data_ptr(),
|
shm_cc_ops::memcpy(metadata_tensor.data_ptr(),
|
||||||
ctx->get_thread_shm_ptr<void>(src),
|
ctx->get_thread_shm_ptr<void>(src),
|
||||||
@ -677,7 +704,7 @@ std::vector<torch::Tensor> shm_recv_tensor_list_impl(ThreadSHMContext* ctx,
|
|||||||
ctx, metadata.total_bytes,
|
ctx, metadata.total_bytes,
|
||||||
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
|
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
|
||||||
int64_t data_elem_num, bool fast_mode) {
|
int64_t data_elem_num, bool fast_mode) {
|
||||||
ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready);
|
thread_ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready);
|
||||||
int64_t curr_shm_offset = 0;
|
int64_t curr_shm_offset = 0;
|
||||||
while (curr_shm_offset < data_elem_num) {
|
while (curr_shm_offset < data_elem_num) {
|
||||||
MemPiece frag = metadata.get_data(data_offset + curr_shm_offset);
|
MemPiece frag = metadata.get_data(data_offset + curr_shm_offset);
|
||||||
|
|||||||
@ -151,7 +151,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
|
ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
|
||||||
|
|
||||||
// Quantization
|
// Quantization
|
||||||
#if defined(__AVX512F__) || defined(__aarch64__)
|
#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))
|
||||||
at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
|
at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
|
||||||
|
|
||||||
// Compute int8 quantized tensor for given scaling factor.
|
// Compute int8 quantized tensor for given scaling factor.
|
||||||
|
|||||||
@ -4,8 +4,35 @@
|
|||||||
#include <hip/hip_runtime.h>
|
#include <hip/hip_runtime.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(USE_ROCM) && defined(__GFX9__)
|
#ifdef USE_ROCM
|
||||||
#define WARP_SIZE 64
|
struct Utils {
|
||||||
|
static __host__ int get_warp_size() {
|
||||||
|
static bool is_cached = false;
|
||||||
|
static int result;
|
||||||
|
|
||||||
|
if (!is_cached) {
|
||||||
|
int device_id;
|
||||||
|
cudaDeviceProp deviceProp;
|
||||||
|
cudaGetDevice(&device_id);
|
||||||
|
cudaGetDeviceProperties(&deviceProp, device_id);
|
||||||
|
|
||||||
|
result = deviceProp.warpSize;
|
||||||
|
is_cached = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ constexpr int get_warp_size() {
|
||||||
|
#ifdef __GFX9__
|
||||||
|
return 64;
|
||||||
|
#else
|
||||||
|
return 32;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#define WARP_SIZE Utils::get_warp_size()
|
||||||
#else
|
#else
|
||||||
#define WARP_SIZE 32
|
#define WARP_SIZE 32
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -15,15 +15,16 @@ namespace vllm {
|
|||||||
// TODO(woosuk): Further optimize this kernel.
|
// TODO(woosuk): Further optimize this kernel.
|
||||||
template <typename scalar_t>
|
template <typename scalar_t>
|
||||||
__global__ void rms_norm_kernel(
|
__global__ void rms_norm_kernel(
|
||||||
scalar_t* __restrict__ out, // [..., hidden_size]
|
scalar_t* __restrict__ out, // [..., hidden_size]
|
||||||
const scalar_t* __restrict__ input, // [..., hidden_size]
|
const scalar_t* __restrict__ input, // [..., hidden_size]
|
||||||
|
const int64_t input_stride,
|
||||||
const scalar_t* __restrict__ weight, // [hidden_size]
|
const scalar_t* __restrict__ weight, // [hidden_size]
|
||||||
const float epsilon, const int num_tokens, const int hidden_size) {
|
const float epsilon, const int num_tokens, const int hidden_size) {
|
||||||
__shared__ float s_variance;
|
__shared__ float s_variance;
|
||||||
float variance = 0.0f;
|
float variance = 0.0f;
|
||||||
|
|
||||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||||
const float x = (float)input[blockIdx.x * hidden_size + idx];
|
const float x = (float)input[blockIdx.x * input_stride + idx];
|
||||||
variance += x * x;
|
variance += x * x;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -37,7 +38,7 @@ __global__ void rms_norm_kernel(
|
|||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||||
float x = (float)input[blockIdx.x * hidden_size + idx];
|
float x = (float)input[blockIdx.x * input_stride + idx];
|
||||||
out[blockIdx.x * hidden_size + idx] =
|
out[blockIdx.x * hidden_size + idx] =
|
||||||
((scalar_t)(x * s_variance)) * weight[idx];
|
((scalar_t)(x * s_variance)) * weight[idx];
|
||||||
}
|
}
|
||||||
@ -50,7 +51,8 @@ __global__ void rms_norm_kernel(
|
|||||||
template <typename scalar_t, int width>
|
template <typename scalar_t, int width>
|
||||||
__global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
|
__global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
|
||||||
fused_add_rms_norm_kernel(
|
fused_add_rms_norm_kernel(
|
||||||
scalar_t* __restrict__ input, // [..., hidden_size]
|
scalar_t* __restrict__ input, // [..., hidden_size]
|
||||||
|
const int64_t input_stride,
|
||||||
scalar_t* __restrict__ residual, // [..., hidden_size]
|
scalar_t* __restrict__ residual, // [..., hidden_size]
|
||||||
const scalar_t* __restrict__ weight, // [hidden_size]
|
const scalar_t* __restrict__ weight, // [hidden_size]
|
||||||
const float epsilon, const int num_tokens, const int hidden_size) {
|
const float epsilon, const int num_tokens, const int hidden_size) {
|
||||||
@ -59,6 +61,7 @@ fused_add_rms_norm_kernel(
|
|||||||
static_assert(sizeof(_f16Vec<scalar_t, width>) == sizeof(scalar_t) * width);
|
static_assert(sizeof(_f16Vec<scalar_t, width>) == sizeof(scalar_t) * width);
|
||||||
|
|
||||||
const int vec_hidden_size = hidden_size / width;
|
const int vec_hidden_size = hidden_size / width;
|
||||||
|
const int64_t vec_input_stride = input_stride / width;
|
||||||
__shared__ float s_variance;
|
__shared__ float s_variance;
|
||||||
float variance = 0.0f;
|
float variance = 0.0f;
|
||||||
/* These and the argument pointers are all declared `restrict` as they are
|
/* These and the argument pointers are all declared `restrict` as they are
|
||||||
@ -73,7 +76,8 @@ fused_add_rms_norm_kernel(
|
|||||||
|
|
||||||
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
|
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
|
||||||
int id = blockIdx.x * vec_hidden_size + idx;
|
int id = blockIdx.x * vec_hidden_size + idx;
|
||||||
_f16Vec<scalar_t, width> temp = input_v[id];
|
int64_t strided_id = blockIdx.x * vec_input_stride + idx;
|
||||||
|
_f16Vec<scalar_t, width> temp = input_v[strided_id];
|
||||||
temp += residual_v[id];
|
temp += residual_v[id];
|
||||||
variance += temp.sum_squares();
|
variance += temp.sum_squares();
|
||||||
residual_v[id] = temp;
|
residual_v[id] = temp;
|
||||||
@ -90,10 +94,11 @@ fused_add_rms_norm_kernel(
|
|||||||
|
|
||||||
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
|
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
|
||||||
int id = blockIdx.x * vec_hidden_size + idx;
|
int id = blockIdx.x * vec_hidden_size + idx;
|
||||||
|
int64_t strided_id = blockIdx.x * vec_input_stride + idx;
|
||||||
_f16Vec<scalar_t, width> temp = residual_v[id];
|
_f16Vec<scalar_t, width> temp = residual_v[id];
|
||||||
temp *= s_variance;
|
temp *= s_variance;
|
||||||
temp *= weight_v[idx];
|
temp *= weight_v[idx];
|
||||||
input_v[id] = temp;
|
input_v[strided_id] = temp;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -103,7 +108,8 @@ fused_add_rms_norm_kernel(
|
|||||||
template <typename scalar_t, int width>
|
template <typename scalar_t, int width>
|
||||||
__global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
|
__global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
|
||||||
fused_add_rms_norm_kernel(
|
fused_add_rms_norm_kernel(
|
||||||
scalar_t* __restrict__ input, // [..., hidden_size]
|
scalar_t* __restrict__ input, // [..., hidden_size]
|
||||||
|
const int64_t input_stride,
|
||||||
scalar_t* __restrict__ residual, // [..., hidden_size]
|
scalar_t* __restrict__ residual, // [..., hidden_size]
|
||||||
const scalar_t* __restrict__ weight, // [hidden_size]
|
const scalar_t* __restrict__ weight, // [hidden_size]
|
||||||
const float epsilon, const int num_tokens, const int hidden_size) {
|
const float epsilon, const int num_tokens, const int hidden_size) {
|
||||||
@ -111,7 +117,7 @@ fused_add_rms_norm_kernel(
|
|||||||
float variance = 0.0f;
|
float variance = 0.0f;
|
||||||
|
|
||||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||||
scalar_t z = input[blockIdx.x * hidden_size + idx];
|
scalar_t z = input[blockIdx.x * input_stride + idx];
|
||||||
z += residual[blockIdx.x * hidden_size + idx];
|
z += residual[blockIdx.x * hidden_size + idx];
|
||||||
float x = (float)z;
|
float x = (float)z;
|
||||||
variance += x * x;
|
variance += x * x;
|
||||||
@ -129,7 +135,7 @@ fused_add_rms_norm_kernel(
|
|||||||
|
|
||||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||||
float x = (float)residual[blockIdx.x * hidden_size + idx];
|
float x = (float)residual[blockIdx.x * hidden_size + idx];
|
||||||
input[blockIdx.x * hidden_size + idx] =
|
input[blockIdx.x * input_stride + idx] =
|
||||||
((scalar_t)(x * s_variance)) * weight[idx];
|
((scalar_t)(x * s_variance)) * weight[idx];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -141,11 +147,12 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size]
|
|||||||
torch::Tensor& weight, // [hidden_size]
|
torch::Tensor& weight, // [hidden_size]
|
||||||
double epsilon) {
|
double epsilon) {
|
||||||
TORCH_CHECK(out.is_contiguous());
|
TORCH_CHECK(out.is_contiguous());
|
||||||
TORCH_CHECK(input.is_contiguous());
|
TORCH_CHECK(input.stride(-1) == 1);
|
||||||
TORCH_CHECK(weight.is_contiguous());
|
TORCH_CHECK(weight.is_contiguous());
|
||||||
|
|
||||||
int hidden_size = input.size(-1);
|
int hidden_size = input.size(-1);
|
||||||
int num_tokens = input.numel() / hidden_size;
|
int num_tokens = input.numel() / hidden_size;
|
||||||
|
int64_t input_stride = input.stride(-2);
|
||||||
|
|
||||||
dim3 grid(num_tokens);
|
dim3 grid(num_tokens);
|
||||||
dim3 block(std::min(hidden_size, 1024));
|
dim3 block(std::min(hidden_size, 1024));
|
||||||
@ -153,26 +160,29 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size]
|
|||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
|
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
|
||||||
vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
||||||
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
|
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), input_stride,
|
||||||
weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
|
weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
#define LAUNCH_FUSED_ADD_RMS_NORM(width) \
|
#define LAUNCH_FUSED_ADD_RMS_NORM(width) \
|
||||||
VLLM_DISPATCH_FLOATING_TYPES( \
|
VLLM_DISPATCH_FLOATING_TYPES( \
|
||||||
input.scalar_type(), "fused_add_rms_norm_kernel", [&] { \
|
input.scalar_type(), "fused_add_rms_norm_kernel", [&] { \
|
||||||
vllm::fused_add_rms_norm_kernel<scalar_t, width> \
|
vllm::fused_add_rms_norm_kernel<scalar_t, width> \
|
||||||
<<<grid, block, 0, stream>>>(input.data_ptr<scalar_t>(), \
|
<<<grid, block, 0, stream>>>( \
|
||||||
residual.data_ptr<scalar_t>(), \
|
input.data_ptr<scalar_t>(), input_stride, \
|
||||||
weight.data_ptr<scalar_t>(), epsilon, \
|
residual.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(), \
|
||||||
num_tokens, hidden_size); \
|
epsilon, num_tokens, hidden_size); \
|
||||||
});
|
});
|
||||||
|
|
||||||
void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
|
void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
|
||||||
torch::Tensor& residual, // [..., hidden_size]
|
torch::Tensor& residual, // [..., hidden_size]
|
||||||
torch::Tensor& weight, // [hidden_size]
|
torch::Tensor& weight, // [hidden_size]
|
||||||
double epsilon) {
|
double epsilon) {
|
||||||
|
TORCH_CHECK(residual.is_contiguous());
|
||||||
|
TORCH_CHECK(weight.is_contiguous());
|
||||||
int hidden_size = input.size(-1);
|
int hidden_size = input.size(-1);
|
||||||
|
int64_t input_stride = input.stride(-2);
|
||||||
int num_tokens = input.numel() / hidden_size;
|
int num_tokens = input.numel() / hidden_size;
|
||||||
|
|
||||||
dim3 grid(num_tokens);
|
dim3 grid(num_tokens);
|
||||||
@ -194,9 +204,16 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
|
|||||||
auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
|
auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
|
||||||
auto res_ptr = reinterpret_cast<std::uintptr_t>(residual.data_ptr());
|
auto res_ptr = reinterpret_cast<std::uintptr_t>(residual.data_ptr());
|
||||||
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
|
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
|
||||||
bool ptrs_are_aligned =
|
constexpr int vector_width = 8;
|
||||||
inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
|
constexpr int req_alignment_bytes =
|
||||||
if (ptrs_are_aligned && hidden_size % 8 == 0) {
|
vector_width * 2; // vector_width * sizeof(bfloat16 or float16) (float32
|
||||||
|
// falls back to non-vectorized version anyway)
|
||||||
|
bool ptrs_are_aligned = inp_ptr % req_alignment_bytes == 0 &&
|
||||||
|
res_ptr % req_alignment_bytes == 0 &&
|
||||||
|
wt_ptr % req_alignment_bytes == 0;
|
||||||
|
bool offsets_are_multiple_of_vector_width =
|
||||||
|
hidden_size % vector_width == 0 && input_stride % vector_width == 0;
|
||||||
|
if (ptrs_are_aligned && offsets_are_multiple_of_vector_width) {
|
||||||
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
||||||
} else {
|
} else {
|
||||||
LAUNCH_FUSED_ADD_RMS_NORM(0);
|
LAUNCH_FUSED_ADD_RMS_NORM(0);
|
||||||
|
|||||||
@ -23,8 +23,9 @@ namespace vllm {
|
|||||||
// TODO(woosuk): Further optimize this kernel.
|
// TODO(woosuk): Further optimize this kernel.
|
||||||
template <typename scalar_t, typename fp8_type>
|
template <typename scalar_t, typename fp8_type>
|
||||||
__global__ void rms_norm_static_fp8_quant_kernel(
|
__global__ void rms_norm_static_fp8_quant_kernel(
|
||||||
fp8_type* __restrict__ out, // [..., hidden_size]
|
fp8_type* __restrict__ out, // [..., hidden_size]
|
||||||
const scalar_t* __restrict__ input, // [..., hidden_size]
|
const scalar_t* __restrict__ input, // [..., hidden_size]
|
||||||
|
const int input_stride,
|
||||||
const scalar_t* __restrict__ weight, // [hidden_size]
|
const scalar_t* __restrict__ weight, // [hidden_size]
|
||||||
const float* __restrict__ scale, // [1]
|
const float* __restrict__ scale, // [1]
|
||||||
const float epsilon, const int num_tokens, const int hidden_size) {
|
const float epsilon, const int num_tokens, const int hidden_size) {
|
||||||
@ -32,7 +33,7 @@ __global__ void rms_norm_static_fp8_quant_kernel(
|
|||||||
float variance = 0.0f;
|
float variance = 0.0f;
|
||||||
|
|
||||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||||
const float x = (float)input[blockIdx.x * hidden_size + idx];
|
const float x = (float)input[blockIdx.x * input_stride + idx];
|
||||||
variance += x * x;
|
variance += x * x;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -49,7 +50,7 @@ __global__ void rms_norm_static_fp8_quant_kernel(
|
|||||||
float const scale_inv = 1.0f / *scale;
|
float const scale_inv = 1.0f / *scale;
|
||||||
|
|
||||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||||
float x = (float)input[blockIdx.x * hidden_size + idx];
|
float x = (float)input[blockIdx.x * input_stride + idx];
|
||||||
float const out_norm = ((scalar_t)(x * s_variance)) * weight[idx];
|
float const out_norm = ((scalar_t)(x * s_variance)) * weight[idx];
|
||||||
out[blockIdx.x * hidden_size + idx] =
|
out[blockIdx.x * hidden_size + idx] =
|
||||||
scaled_fp8_conversion<true, fp8_type>(out_norm, scale_inv);
|
scaled_fp8_conversion<true, fp8_type>(out_norm, scale_inv);
|
||||||
@ -63,8 +64,9 @@ __global__ void rms_norm_static_fp8_quant_kernel(
|
|||||||
template <typename scalar_t, int width, typename fp8_type>
|
template <typename scalar_t, int width, typename fp8_type>
|
||||||
__global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
|
__global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
|
||||||
fused_add_rms_norm_static_fp8_quant_kernel(
|
fused_add_rms_norm_static_fp8_quant_kernel(
|
||||||
fp8_type* __restrict__ out, // [..., hidden_size]
|
fp8_type* __restrict__ out, // [..., hidden_size]
|
||||||
scalar_t* __restrict__ input, // [..., hidden_size]
|
scalar_t* __restrict__ input, // [..., hidden_size]
|
||||||
|
const int input_stride,
|
||||||
scalar_t* __restrict__ residual, // [..., hidden_size]
|
scalar_t* __restrict__ residual, // [..., hidden_size]
|
||||||
const scalar_t* __restrict__ weight, // [hidden_size]
|
const scalar_t* __restrict__ weight, // [hidden_size]
|
||||||
const float* __restrict__ scale, // [1]
|
const float* __restrict__ scale, // [1]
|
||||||
@ -74,6 +76,7 @@ fused_add_rms_norm_static_fp8_quant_kernel(
|
|||||||
static_assert(sizeof(_f16Vec<scalar_t, width>) == sizeof(scalar_t) * width);
|
static_assert(sizeof(_f16Vec<scalar_t, width>) == sizeof(scalar_t) * width);
|
||||||
|
|
||||||
const int vec_hidden_size = hidden_size / width;
|
const int vec_hidden_size = hidden_size / width;
|
||||||
|
const int vec_input_stride = input_stride / width;
|
||||||
__shared__ float s_variance;
|
__shared__ float s_variance;
|
||||||
float variance = 0.0f;
|
float variance = 0.0f;
|
||||||
/* These and the argument pointers are all declared `restrict` as they are
|
/* These and the argument pointers are all declared `restrict` as they are
|
||||||
@ -87,8 +90,9 @@ fused_add_rms_norm_static_fp8_quant_kernel(
|
|||||||
reinterpret_cast<const _f16Vec<scalar_t, width>*>(weight);
|
reinterpret_cast<const _f16Vec<scalar_t, width>*>(weight);
|
||||||
|
|
||||||
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
|
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
|
||||||
|
int stride_id = blockIdx.x * vec_input_stride + idx;
|
||||||
int id = blockIdx.x * vec_hidden_size + idx;
|
int id = blockIdx.x * vec_hidden_size + idx;
|
||||||
_f16Vec<scalar_t, width> temp = input_v[id];
|
_f16Vec<scalar_t, width> temp = input_v[stride_id];
|
||||||
temp += residual_v[id];
|
temp += residual_v[id];
|
||||||
variance += temp.sum_squares();
|
variance += temp.sum_squares();
|
||||||
residual_v[id] = temp;
|
residual_v[id] = temp;
|
||||||
@ -125,8 +129,9 @@ fused_add_rms_norm_static_fp8_quant_kernel(
|
|||||||
template <typename scalar_t, int width, typename fp8_type>
|
template <typename scalar_t, int width, typename fp8_type>
|
||||||
__global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
|
__global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
|
||||||
fused_add_rms_norm_static_fp8_quant_kernel(
|
fused_add_rms_norm_static_fp8_quant_kernel(
|
||||||
fp8_type* __restrict__ out, // [..., hidden_size]
|
fp8_type* __restrict__ out, // [..., hidden_size]
|
||||||
scalar_t* __restrict__ input, // [..., hidden_size]
|
scalar_t* __restrict__ input, // [..., hidden_size]
|
||||||
|
const int input_stride,
|
||||||
scalar_t* __restrict__ residual, // [..., hidden_size]
|
scalar_t* __restrict__ residual, // [..., hidden_size]
|
||||||
const scalar_t* __restrict__ weight, // [hidden_size]
|
const scalar_t* __restrict__ weight, // [hidden_size]
|
||||||
const float* __restrict__ scale, // [1]
|
const float* __restrict__ scale, // [1]
|
||||||
@ -135,7 +140,7 @@ fused_add_rms_norm_static_fp8_quant_kernel(
|
|||||||
float variance = 0.0f;
|
float variance = 0.0f;
|
||||||
|
|
||||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||||
scalar_t z = input[blockIdx.x * hidden_size + idx];
|
scalar_t z = input[blockIdx.x * input_stride + idx];
|
||||||
z += residual[blockIdx.x * hidden_size + idx];
|
z += residual[blockIdx.x * hidden_size + idx];
|
||||||
float x = (float)z;
|
float x = (float)z;
|
||||||
variance += x * x;
|
variance += x * x;
|
||||||
@ -169,7 +174,9 @@ void rms_norm_static_fp8_quant(torch::Tensor& out, // [..., hidden_size]
|
|||||||
torch::Tensor& weight, // [hidden_size]
|
torch::Tensor& weight, // [hidden_size]
|
||||||
torch::Tensor& scale, // [1]
|
torch::Tensor& scale, // [1]
|
||||||
double epsilon) {
|
double epsilon) {
|
||||||
|
TORCH_CHECK(out.is_contiguous());
|
||||||
int hidden_size = input.size(-1);
|
int hidden_size = input.size(-1);
|
||||||
|
int input_stride = input.stride(-2);
|
||||||
int num_tokens = input.numel() / hidden_size;
|
int num_tokens = input.numel() / hidden_size;
|
||||||
|
|
||||||
dim3 grid(num_tokens);
|
dim3 grid(num_tokens);
|
||||||
@ -183,8 +190,9 @@ void rms_norm_static_fp8_quant(torch::Tensor& out, // [..., hidden_size]
|
|||||||
vllm::rms_norm_static_fp8_quant_kernel<scalar_t, fp8_t>
|
vllm::rms_norm_static_fp8_quant_kernel<scalar_t, fp8_t>
|
||||||
<<<grid, block, 0, stream>>>(
|
<<<grid, block, 0, stream>>>(
|
||||||
out.data_ptr<fp8_t>(), input.data_ptr<scalar_t>(),
|
out.data_ptr<fp8_t>(), input.data_ptr<scalar_t>(),
|
||||||
weight.data_ptr<scalar_t>(), scale.data_ptr<float>(),
|
input_stride, weight.data_ptr<scalar_t>(),
|
||||||
epsilon, num_tokens, hidden_size);
|
scale.data_ptr<float>(), epsilon, num_tokens,
|
||||||
|
hidden_size);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -198,7 +206,7 @@ void rms_norm_static_fp8_quant(torch::Tensor& out, // [..., hidden_size]
|
|||||||
width, fp8_t> \
|
width, fp8_t> \
|
||||||
<<<grid, block, 0, stream>>>( \
|
<<<grid, block, 0, stream>>>( \
|
||||||
out.data_ptr<fp8_t>(), input.data_ptr<scalar_t>(), \
|
out.data_ptr<fp8_t>(), input.data_ptr<scalar_t>(), \
|
||||||
residual.data_ptr<scalar_t>(), \
|
input_stride, residual.data_ptr<scalar_t>(), \
|
||||||
weight.data_ptr<scalar_t>(), scale.data_ptr<float>(), \
|
weight.data_ptr<scalar_t>(), scale.data_ptr<float>(), \
|
||||||
epsilon, num_tokens, hidden_size); \
|
epsilon, num_tokens, hidden_size); \
|
||||||
}); \
|
}); \
|
||||||
@ -210,7 +218,10 @@ void fused_add_rms_norm_static_fp8_quant(
|
|||||||
torch::Tensor& weight, // [hidden_size]
|
torch::Tensor& weight, // [hidden_size]
|
||||||
torch::Tensor& scale, // [1]
|
torch::Tensor& scale, // [1]
|
||||||
double epsilon) {
|
double epsilon) {
|
||||||
|
TORCH_CHECK(out.is_contiguous());
|
||||||
|
TORCH_CHECK(residual.is_contiguous());
|
||||||
int hidden_size = input.size(-1);
|
int hidden_size = input.size(-1);
|
||||||
|
int input_stride = input.stride(-2);
|
||||||
int num_tokens = input.numel() / hidden_size;
|
int num_tokens = input.numel() / hidden_size;
|
||||||
|
|
||||||
dim3 grid(num_tokens);
|
dim3 grid(num_tokens);
|
||||||
@ -234,7 +245,7 @@ void fused_add_rms_norm_static_fp8_quant(
|
|||||||
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
|
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
|
||||||
bool ptrs_are_aligned =
|
bool ptrs_are_aligned =
|
||||||
inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
|
inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
|
||||||
if (ptrs_are_aligned && hidden_size % 8 == 0) {
|
if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0) {
|
||||||
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
||||||
} else {
|
} else {
|
||||||
LAUNCH_FUSED_ADD_RMS_NORM(0);
|
LAUNCH_FUSED_ADD_RMS_NORM(0);
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
#include <torch/all.h>
|
#include <torch/all.h>
|
||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
#include <cub/cub.cuh>
|
||||||
|
|
||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
#include <ATen/cuda/Atomic.cuh>
|
#include <ATen/cuda/Atomic.cuh>
|
||||||
@ -19,9 +20,14 @@ __global__ void moe_align_block_size_kernel(
|
|||||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
|
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
|
||||||
int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
|
int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
|
||||||
int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
|
int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
|
||||||
size_t numel, int32_t* __restrict__ cumsum) {
|
size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded) {
|
||||||
extern __shared__ int32_t shared_counts[];
|
extern __shared__ int32_t shared_counts[];
|
||||||
|
|
||||||
|
// Initialize sorted_token_ids with numel
|
||||||
|
for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
|
||||||
|
sorted_token_ids[it] = numel;
|
||||||
|
}
|
||||||
|
|
||||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||||
const int my_expert_start = warp_id * experts_per_warp;
|
const int my_expert_start = warp_id * experts_per_warp;
|
||||||
|
|
||||||
@ -45,18 +51,27 @@ __global__ void moe_align_block_size_kernel(
|
|||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
if (threadIdx.x == 0) {
|
// Compute prefix sum over token counts per expert
|
||||||
cumsum[0] = 0;
|
using BlockScan = cub::BlockScan<int32_t, 1024>;
|
||||||
for (int i = 1; i <= num_experts; ++i) {
|
__shared__ typename BlockScan::TempStorage temp_storage;
|
||||||
int expert_count = 0;
|
|
||||||
int warp_idx = (i - 1) / experts_per_warp;
|
|
||||||
int expert_offset = (i - 1) % experts_per_warp;
|
|
||||||
expert_count = shared_counts[warp_idx * experts_per_warp + expert_offset];
|
|
||||||
|
|
||||||
cumsum[i] =
|
int expert_count = 0;
|
||||||
cumsum[i - 1] + CEILDIV(expert_count, block_size) * block_size;
|
int expert_id = threadIdx.x;
|
||||||
}
|
if (expert_id < num_experts) {
|
||||||
*total_tokens_post_pad = cumsum[num_experts];
|
int warp_idx = expert_id / experts_per_warp;
|
||||||
|
int expert_offset = expert_id % experts_per_warp;
|
||||||
|
expert_count = shared_counts[warp_idx * experts_per_warp + expert_offset];
|
||||||
|
expert_count = CEILDIV(expert_count, block_size) * block_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
int cumsum_val;
|
||||||
|
BlockScan(temp_storage).ExclusiveSum(expert_count, cumsum_val);
|
||||||
|
if (expert_id <= num_experts) {
|
||||||
|
cumsum[expert_id] = cumsum_val;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (expert_id == num_experts) {
|
||||||
|
*total_tokens_post_pad = cumsum_val;
|
||||||
}
|
}
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
@ -67,6 +82,13 @@ __global__ void moe_align_block_size_kernel(
|
|||||||
expert_ids[i / block_size] = threadIdx.x;
|
expert_ids[i / block_size] = threadIdx.x;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Fill remaining expert_ids with 0
|
||||||
|
const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x;
|
||||||
|
const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size);
|
||||||
|
for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) {
|
||||||
|
expert_ids[i] = 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename scalar_t>
|
template <typename scalar_t>
|
||||||
@ -105,7 +127,12 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
|
|||||||
const scalar_t* __restrict__ topk_ids,
|
const scalar_t* __restrict__ topk_ids,
|
||||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
|
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
|
||||||
int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
|
int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
|
||||||
int32_t block_size, size_t numel) {
|
int32_t block_size, size_t numel, int32_t max_num_tokens_padded) {
|
||||||
|
// Initialize sorted_token_ids with numel
|
||||||
|
for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
|
||||||
|
sorted_token_ids[it] = numel;
|
||||||
|
}
|
||||||
|
|
||||||
const size_t tid = threadIdx.x;
|
const size_t tid = threadIdx.x;
|
||||||
const size_t stride = blockDim.x;
|
const size_t stride = blockDim.x;
|
||||||
|
|
||||||
@ -153,6 +180,13 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Fill remaining expert_ids with 0
|
||||||
|
const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x;
|
||||||
|
const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size);
|
||||||
|
for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) {
|
||||||
|
expert_ids[i] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
for (size_t i = tid; i < numel; i += stride) {
|
for (size_t i = tid; i < numel; i += stride) {
|
||||||
int32_t expert_id = topk_ids[i];
|
int32_t expert_id = topk_ids[i];
|
||||||
int32_t rank_post_pad =
|
int32_t rank_post_pad =
|
||||||
@ -179,13 +213,17 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
|||||||
int threads = 1024;
|
int threads = 1024;
|
||||||
threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
|
threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
|
||||||
|
|
||||||
|
// BlockScan uses 1024 threads and assigns one thread per expert.
|
||||||
|
TORCH_CHECK(padded_num_experts < 1024,
|
||||||
|
"padded_num_experts must be less than 1024");
|
||||||
|
|
||||||
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
|
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
|
||||||
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
|
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
|
||||||
// calc needed amount of shared mem for `cumsum` tensors
|
// calc needed amount of shared mem for `cumsum` tensors
|
||||||
auto options_int =
|
auto options_int =
|
||||||
torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
|
torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
|
||||||
torch::Tensor cumsum_buffer =
|
torch::Tensor cumsum_buffer =
|
||||||
torch::zeros({num_experts + 1}, options_int);
|
torch::empty({num_experts + 1}, options_int);
|
||||||
bool small_batch_expert_mode =
|
bool small_batch_expert_mode =
|
||||||
(topk_ids.numel() < 1024) && (num_experts <= 64);
|
(topk_ids.numel() < 1024) && (num_experts <= 64);
|
||||||
|
|
||||||
@ -203,7 +241,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
|||||||
sorted_token_ids.data_ptr<int32_t>(),
|
sorted_token_ids.data_ptr<int32_t>(),
|
||||||
experts_ids.data_ptr<int32_t>(),
|
experts_ids.data_ptr<int32_t>(),
|
||||||
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
|
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
|
||||||
topk_ids.numel());
|
topk_ids.numel(), sorted_token_ids.size(0));
|
||||||
} else {
|
} else {
|
||||||
auto align_kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;
|
auto align_kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;
|
||||||
|
|
||||||
@ -217,7 +255,8 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
|||||||
experts_ids.data_ptr<int32_t>(),
|
experts_ids.data_ptr<int32_t>(),
|
||||||
num_tokens_post_pad.data_ptr<int32_t>(), num_experts,
|
num_tokens_post_pad.data_ptr<int32_t>(), num_experts,
|
||||||
padded_num_experts, experts_per_warp, block_size,
|
padded_num_experts, experts_per_warp, block_size,
|
||||||
topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>());
|
topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>(),
|
||||||
|
sorted_token_ids.size(0));
|
||||||
|
|
||||||
const int block_threads = std::min(256, (int)threads);
|
const int block_threads = std::min(256, (int)threads);
|
||||||
const int num_blocks =
|
const int num_blocks =
|
||||||
|
|||||||
@ -10,32 +10,28 @@
|
|||||||
|
|
||||||
void moe_permute(
|
void moe_permute(
|
||||||
const torch::Tensor& input, // [n_token, hidden]
|
const torch::Tensor& input, // [n_token, hidden]
|
||||||
const torch::Tensor& topk_weights, //[n_token, topk]
|
const torch::Tensor& topk_ids, // [n_token, topk]
|
||||||
torch::Tensor& topk_ids, // [n_token, topk]
|
|
||||||
const torch::Tensor& token_expert_indices, // [n_token, topk]
|
const torch::Tensor& token_expert_indices, // [n_token, topk]
|
||||||
const std::optional<torch::Tensor>& expert_map, // [n_expert]
|
const std::optional<torch::Tensor>& expert_map, // [n_expert]
|
||||||
int64_t n_expert, int64_t n_local_expert, int64_t topk,
|
int64_t n_expert, int64_t n_local_expert, int64_t topk,
|
||||||
const std::optional<int64_t>& align_block_size,
|
const std::optional<int64_t>& align_block_size,
|
||||||
torch::Tensor&
|
torch::Tensor& permuted_input, // [permuted_size, hidden]
|
||||||
permuted_input, // [topk * n_token/align_block_size_m, hidden]
|
|
||||||
torch::Tensor& expert_first_token_offset, // [n_local_expert + 1]
|
torch::Tensor& expert_first_token_offset, // [n_local_expert + 1]
|
||||||
torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk]
|
torch::Tensor& inv_permuted_idx, // [n_token, topk]
|
||||||
|
torch::Tensor& permuted_idx, // [permute_size]
|
||||||
torch::Tensor& m_indices) { // [align_expand_m]
|
torch::Tensor& m_indices) { // [align_expand_m]
|
||||||
TORCH_CHECK(topk_weights.scalar_type() == at::ScalarType::Float,
|
|
||||||
"topk_weights must be float32");
|
|
||||||
TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long,
|
TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long,
|
||||||
"expert_first_token_offset must be int64");
|
"expert_first_token_offset must be int64");
|
||||||
TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
|
TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
|
||||||
"topk_ids must be int32");
|
"topk_ids must be int32");
|
||||||
TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
|
TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
|
||||||
"token_expert_indices must be int32");
|
"token_expert_indices must be int32");
|
||||||
TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
|
TORCH_CHECK(inv_permuted_idx.scalar_type() == at::ScalarType::Int,
|
||||||
"src_row_id2dst_row_id_map must be int32");
|
"inv_permuted_idx must be int32");
|
||||||
TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
|
TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
|
||||||
"expert_first_token_offset shape != n_local_expert+1")
|
"expert_first_token_offset shape != n_local_expert+1")
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(inv_permuted_idx.sizes() == token_expert_indices.sizes(),
|
||||||
src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(),
|
"token_expert_indices shape must be same as inv_permuted_idx");
|
||||||
"token_expert_indices shape must be same as src_row_id2dst_row_id_map");
|
|
||||||
auto n_token = input.sizes()[0];
|
auto n_token = input.sizes()[0];
|
||||||
auto n_hidden = input.sizes()[1];
|
auto n_hidden = input.sizes()[1];
|
||||||
auto align_block_size_value =
|
auto align_block_size_value =
|
||||||
@ -46,8 +42,9 @@ void moe_permute(
|
|||||||
auto sort_workspace = torch::empty(
|
auto sort_workspace = torch::empty(
|
||||||
{sorter_size},
|
{sorter_size},
|
||||||
torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
|
torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
|
||||||
|
auto copy_topk_ids = topk_ids.clone(); // copy topk_ids for preprocess
|
||||||
auto permuted_experts_id = torch::empty_like(topk_ids);
|
auto permuted_experts_id = torch::empty_like(topk_ids);
|
||||||
auto dst_row_id2src_row_id_map = torch::empty_like(src_row_id2dst_row_id_map);
|
auto sorted_row_idx = torch::empty_like(inv_permuted_idx);
|
||||||
auto align_expert_first_token_offset =
|
auto align_expert_first_token_offset =
|
||||||
torch::zeros_like(expert_first_token_offset);
|
torch::zeros_like(expert_first_token_offset);
|
||||||
|
|
||||||
@ -67,24 +64,22 @@ void moe_permute(
|
|||||||
const int* expert_map_ptr = get_ptr<int>(expert_map.value());
|
const int* expert_map_ptr = get_ptr<int>(expert_map.value());
|
||||||
valid_num_ptr =
|
valid_num_ptr =
|
||||||
get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
|
get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
|
||||||
preprocessTopkIdLauncher(get_ptr<int>(topk_ids), n_token * topk,
|
preprocessTopkIdLauncher(get_ptr<int>(copy_topk_ids), n_token * topk,
|
||||||
expert_map_ptr, n_expert, stream);
|
expert_map_ptr, n_expert, stream);
|
||||||
}
|
}
|
||||||
// expert sort topk expert id and scan expert id get expert_first_token_offset
|
// expert sort topk expert id and scan expert id get expert_first_token_offset
|
||||||
sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indices),
|
sortAndScanExpert(
|
||||||
get_ptr<int>(permuted_experts_id),
|
get_ptr<int>(copy_topk_ids), get_ptr<int>(token_expert_indices),
|
||||||
get_ptr<int>(dst_row_id2src_row_id_map),
|
get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
|
||||||
get_ptr<int64_t>(expert_first_token_offset), n_token,
|
get_ptr<int64_t>(expert_first_token_offset), n_token, n_expert,
|
||||||
n_expert, n_local_expert, topk, sorter,
|
n_local_expert, topk, sorter, get_ptr<int>(sort_workspace), stream);
|
||||||
get_ptr<int>(sort_workspace), stream);
|
|
||||||
|
|
||||||
// dispatch expandInputRowsKernelLauncher
|
// dispatch expandInputRowsKernelLauncher
|
||||||
MOE_DISPATCH(input.scalar_type(), [&] {
|
MOE_DISPATCH(input.scalar_type(), [&] {
|
||||||
expandInputRowsKernelLauncher<scalar_t>(
|
expandInputRowsKernelLauncher<scalar_t>(
|
||||||
get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
|
get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
|
||||||
get_ptr<float>(topk_weights), get_ptr<int>(permuted_experts_id),
|
get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
|
||||||
get_ptr<int>(dst_row_id2src_row_id_map),
|
get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx),
|
||||||
get_ptr<int>(src_row_id2dst_row_id_map),
|
|
||||||
get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
|
get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
|
||||||
n_hidden, topk, n_local_expert, align_block_size_value, stream);
|
n_hidden, topk, n_local_expert, align_block_size_value, stream);
|
||||||
});
|
});
|
||||||
@ -101,32 +96,34 @@ void moe_permute(
|
|||||||
}
|
}
|
||||||
|
|
||||||
void moe_unpermute(
|
void moe_unpermute(
|
||||||
const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden]
|
const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden]
|
||||||
const torch::Tensor& topk_weights, //[n_token, topk]
|
const torch::Tensor& topk_weights, // [n_token, topk]
|
||||||
const torch::Tensor& topk_ids, // [n_token, topk]
|
const torch::Tensor& inv_permuted_idx, // [n_token, topk]
|
||||||
const torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk]
|
const std::optional<torch::Tensor>&
|
||||||
const torch::Tensor& expert_first_token_offset, // [n_local_expert+1]
|
expert_first_token_offset, // [n_local_expert+1]
|
||||||
int64_t n_expert, int64_t n_local_expert, int64_t topk,
|
int64_t topk,
|
||||||
torch::Tensor& hidden_states // [n_token, hidden]
|
torch::Tensor& hidden_states // [n_token, hidden]
|
||||||
) {
|
) {
|
||||||
TORCH_CHECK(src_row_id2dst_row_id_map.sizes() == topk_ids.sizes(),
|
|
||||||
"topk_ids shape must be same as src_row_id2dst_row_id_map");
|
|
||||||
TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
|
|
||||||
"topk_ids must be int32");
|
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
permuted_hidden_states.scalar_type() == hidden_states.scalar_type(),
|
permuted_hidden_states.scalar_type() == hidden_states.scalar_type(),
|
||||||
"topk_ids dtype must be same as src_row_id2dst_row_id_map");
|
"permuted_hidden_states dtype must be same as hidden_states");
|
||||||
auto n_token = hidden_states.size(0);
|
auto n_token = hidden_states.size(0);
|
||||||
auto n_hidden = hidden_states.size(1);
|
auto n_hidden = hidden_states.size(1);
|
||||||
auto stream = at::cuda::getCurrentCUDAStream().stream();
|
auto stream = at::cuda::getCurrentCUDAStream().stream();
|
||||||
const int64_t* valid_ptr =
|
|
||||||
get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
|
int64_t const* valid_ptr = nullptr;
|
||||||
|
if (expert_first_token_offset.has_value()) {
|
||||||
|
int n_local_expert = expert_first_token_offset.value().size(0) - 1;
|
||||||
|
valid_ptr =
|
||||||
|
get_ptr<int64_t>(expert_first_token_offset.value()) + n_local_expert;
|
||||||
|
}
|
||||||
|
|
||||||
MOE_DISPATCH(hidden_states.scalar_type(), [&] {
|
MOE_DISPATCH(hidden_states.scalar_type(), [&] {
|
||||||
finalizeMoeRoutingKernelLauncher<scalar_t, scalar_t>(
|
finalizeMoeRoutingKernelLauncher<scalar_t, scalar_t>(
|
||||||
get_ptr<scalar_t>(permuted_hidden_states),
|
get_ptr<scalar_t>(permuted_hidden_states),
|
||||||
get_ptr<scalar_t>(hidden_states), get_ptr<float>(topk_weights),
|
get_ptr<scalar_t>(hidden_states), get_ptr<float>(topk_weights),
|
||||||
get_ptr<int>(src_row_id2dst_row_id_map), get_ptr<int>(topk_ids),
|
get_ptr<int>(inv_permuted_idx), n_token, n_hidden, topk, valid_ptr,
|
||||||
n_token, n_hidden, topk, valid_ptr, stream);
|
stream);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -160,30 +157,6 @@ __global__ void shuffleInputRowsKernel(const T* input,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
__global__ void shuffleInputRowsKernelSlow(const T* input,
|
|
||||||
const int32_t* dst2src_map,
|
|
||||||
T* output, int64_t num_src_rows,
|
|
||||||
int64_t num_dst_rows,
|
|
||||||
int64_t num_cols) {
|
|
||||||
int64_t dest_row_idx = blockIdx.x;
|
|
||||||
int64_t const source_row_idx = dst2src_map[dest_row_idx];
|
|
||||||
|
|
||||||
if (blockIdx.x < num_dst_rows) {
|
|
||||||
// Duplicate and permute rows
|
|
||||||
auto const* source_row_ptr = input + source_row_idx * num_cols;
|
|
||||||
auto* dest_row_ptr = output + dest_row_idx * num_cols;
|
|
||||||
|
|
||||||
int64_t const start_offset = threadIdx.x;
|
|
||||||
int64_t const stride = blockDim.x;
|
|
||||||
|
|
||||||
for (int elem_index = start_offset; elem_index < num_cols;
|
|
||||||
elem_index += stride) {
|
|
||||||
dest_row_ptr[elem_index] = source_row_ptr[elem_index];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void shuffle_rows(const torch::Tensor& input_tensor,
|
void shuffle_rows(const torch::Tensor& input_tensor,
|
||||||
const torch::Tensor& dst2src_map,
|
const torch::Tensor& dst2src_map,
|
||||||
torch::Tensor& output_tensor) {
|
torch::Tensor& output_tensor) {
|
||||||
@ -197,24 +170,17 @@ void shuffle_rows(const torch::Tensor& input_tensor,
|
|||||||
int64_t const num_src_rows = input_tensor.size(0);
|
int64_t const num_src_rows = input_tensor.size(0);
|
||||||
int64_t const num_cols = input_tensor.size(1);
|
int64_t const num_cols = input_tensor.size(1);
|
||||||
|
|
||||||
if (num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)) {
|
TORCH_CHECK(!(num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)),
|
||||||
// use slow kernel if num_cols can't be aligned to 128 bits
|
"num_cols must be divisible by 128 / "
|
||||||
MOE_DISPATCH(input_tensor.scalar_type(), [&] {
|
"sizeof(input_tensor.scalar_type()) / 8");
|
||||||
shuffleInputRowsKernelSlow<scalar_t><<<blocks, threads, 0, stream>>>(
|
|
||||||
reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
|
MOE_DISPATCH(input_tensor.scalar_type(), [&] {
|
||||||
dst2src_map.data_ptr<int32_t>(),
|
shuffleInputRowsKernel<scalar_t><<<blocks, threads, 0, stream>>>(
|
||||||
reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
|
reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
|
||||||
num_dest_rows, num_cols);
|
dst2src_map.data_ptr<int32_t>(),
|
||||||
});
|
reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
|
||||||
} else {
|
num_dest_rows, num_cols);
|
||||||
MOE_DISPATCH(input_tensor.scalar_type(), [&] {
|
});
|
||||||
shuffleInputRowsKernel<scalar_t><<<blocks, threads, 0, stream>>>(
|
|
||||||
reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
|
|
||||||
dst2src_map.data_ptr<int32_t>(),
|
|
||||||
reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
|
|
||||||
num_dest_rows, num_cols);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|||||||
@ -177,7 +177,7 @@ __global__ void getMIndicesKernel(int64_t* expert_first_token_offset,
|
|||||||
int tidx = threadIdx.x;
|
int tidx = threadIdx.x;
|
||||||
extern __shared__ int64_t smem_expert_first_token_offset[];
|
extern __shared__ int64_t smem_expert_first_token_offset[];
|
||||||
for (int i = tidx; i <= num_local_expert; i += blockDim.x) {
|
for (int i = tidx; i <= num_local_expert; i += blockDim.x) {
|
||||||
smem_expert_first_token_offset[tidx] = __ldg(expert_first_token_offset + i);
|
smem_expert_first_token_offset[i] = __ldg(expert_first_token_offset + i);
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
auto last_token_offset = smem_expert_first_token_offset[eidx + 1];
|
auto last_token_offset = smem_expert_first_token_offset[eidx + 1];
|
||||||
|
|||||||
@ -57,31 +57,19 @@ void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
|
|||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void expandInputRowsKernelLauncher(
|
void expandInputRowsKernelLauncher(
|
||||||
T const* unpermuted_input, T* permuted_output,
|
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
|
||||||
const float* unpermuted_scales, int* sorted_experts,
|
|
||||||
int const* expanded_dest_row_to_expanded_source_row,
|
int const* expanded_dest_row_to_expanded_source_row,
|
||||||
int* expanded_source_row_to_expanded_dest_row,
|
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
|
||||||
int64_t* expert_first_token_offset, int64_t const num_rows,
|
int64_t* expert_first_token_offset, int64_t const num_rows,
|
||||||
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
|
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
|
||||||
int num_local_experts, const int& align_block_size, cudaStream_t stream);
|
int num_local_experts, const int& align_block_size, cudaStream_t stream);
|
||||||
|
|
||||||
// Final kernel to unpermute and scale
|
|
||||||
// This kernel unpermutes the original data, does the k-way reduction and
|
|
||||||
// performs the final skip connection.
|
|
||||||
template <typename T, typename OutputType, bool CHECK_SKIPPED>
|
|
||||||
__global__ void finalizeMoeRoutingKernel(
|
|
||||||
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
|
|
||||||
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
|
|
||||||
int const* expert_for_source_row, int64_t const orig_cols, int64_t const k,
|
|
||||||
int64_t const* num_valid_ptr);
|
|
||||||
|
|
||||||
template <class T, class OutputType>
|
template <class T, class OutputType>
|
||||||
void finalizeMoeRoutingKernelLauncher(
|
void finalizeMoeRoutingKernelLauncher(
|
||||||
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
|
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
|
||||||
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
|
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
|
||||||
int const* expert_for_source_row, int64_t const num_rows,
|
int64_t const num_rows, int64_t const cols, int64_t const k,
|
||||||
int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
|
int64_t const* num_valid_ptr, cudaStream_t stream);
|
||||||
cudaStream_t stream);
|
|
||||||
|
|
||||||
void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
|
void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
|
||||||
const int* expert_map_ptr, int num_experts,
|
const int* expert_map_ptr, int num_experts,
|
||||||
|
|||||||
@ -2,10 +2,9 @@
|
|||||||
|
|
||||||
template <typename T, bool CHECK_SKIPPED, bool ALIGN_BLOCK_SIZE>
|
template <typename T, bool CHECK_SKIPPED, bool ALIGN_BLOCK_SIZE>
|
||||||
__global__ void expandInputRowsKernel(
|
__global__ void expandInputRowsKernel(
|
||||||
T const* unpermuted_input, T* permuted_output,
|
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
|
||||||
const float* unpermuted_scales, int* sorted_experts,
|
|
||||||
int const* expanded_dest_row_to_expanded_source_row,
|
int const* expanded_dest_row_to_expanded_source_row,
|
||||||
int* expanded_source_row_to_expanded_dest_row,
|
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
|
||||||
int64_t* expert_first_token_offset, int64_t const num_rows,
|
int64_t* expert_first_token_offset, int64_t const num_rows,
|
||||||
int64_t const* num_dest_rows, int64_t const cols, int64_t k,
|
int64_t const* num_dest_rows, int64_t const cols, int64_t k,
|
||||||
int num_local_experts, int align_block_size) {
|
int num_local_experts, int align_block_size) {
|
||||||
@ -54,6 +53,10 @@ __global__ void expandInputRowsKernel(
|
|||||||
assert(expanded_dest_row <= INT32_MAX);
|
assert(expanded_dest_row <= INT32_MAX);
|
||||||
expanded_source_row_to_expanded_dest_row[expanded_source_row] =
|
expanded_source_row_to_expanded_dest_row[expanded_source_row] =
|
||||||
static_cast<int>(expanded_dest_row);
|
static_cast<int>(expanded_dest_row);
|
||||||
|
// skip non local expert token
|
||||||
|
if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
|
||||||
|
permuted_idx[expanded_dest_row] = expanded_source_row;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
|
if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
|
||||||
@ -62,7 +65,7 @@ __global__ void expandInputRowsKernel(
|
|||||||
using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
|
using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
|
||||||
|
|
||||||
// Duplicate and permute rows
|
// Duplicate and permute rows
|
||||||
int64_t const source_row = expanded_source_row % num_rows;
|
int64_t const source_row = expanded_source_row / k;
|
||||||
|
|
||||||
auto const* source_row_ptr =
|
auto const* source_row_ptr =
|
||||||
reinterpret_cast<DataElem const*>(unpermuted_input + source_row * cols);
|
reinterpret_cast<DataElem const*>(unpermuted_input + source_row * cols);
|
||||||
@ -82,10 +85,9 @@ __global__ void expandInputRowsKernel(
|
|||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void expandInputRowsKernelLauncher(
|
void expandInputRowsKernelLauncher(
|
||||||
T const* unpermuted_input, T* permuted_output,
|
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
|
||||||
const float* unpermuted_scales, int* sorted_experts,
|
|
||||||
int const* expanded_dest_row_to_expanded_source_row,
|
int const* expanded_dest_row_to_expanded_source_row,
|
||||||
int* expanded_source_row_to_expanded_dest_row,
|
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
|
||||||
int64_t* expert_first_token_offset, int64_t const num_rows,
|
int64_t* expert_first_token_offset, int64_t const num_rows,
|
||||||
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
|
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
|
||||||
int num_local_experts, const int& align_block_size, cudaStream_t stream) {
|
int num_local_experts, const int& align_block_size, cudaStream_t stream) {
|
||||||
@ -105,11 +107,11 @@ void expandInputRowsKernelLauncher(
|
|||||||
int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1);
|
int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1);
|
||||||
|
|
||||||
func<<<blocks, threads, smem_size, stream>>>(
|
func<<<blocks, threads, smem_size, stream>>>(
|
||||||
unpermuted_input, permuted_output, unpermuted_scales, sorted_experts,
|
unpermuted_input, permuted_output, sorted_experts,
|
||||||
expanded_dest_row_to_expanded_source_row,
|
expanded_dest_row_to_expanded_source_row,
|
||||||
expanded_source_row_to_expanded_dest_row, expert_first_token_offset,
|
expanded_source_row_to_expanded_dest_row, permuted_idx,
|
||||||
num_rows, num_valid_tokens_ptr, cols, k, num_local_experts,
|
expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
|
||||||
align_block_size);
|
num_local_experts, align_block_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class T, class U>
|
template <class T, class U>
|
||||||
@ -128,11 +130,9 @@ template <typename T, typename OutputType, bool CHECK_SKIPPED>
|
|||||||
__global__ void finalizeMoeRoutingKernel(
|
__global__ void finalizeMoeRoutingKernel(
|
||||||
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
|
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
|
||||||
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
|
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
|
||||||
int const* expert_for_source_row, int64_t const orig_cols, int64_t const k,
|
int64_t const orig_cols, int64_t const k, int64_t const* num_valid_ptr) {
|
||||||
int64_t const* num_valid_ptr) {
|
|
||||||
assert(orig_cols % 4 == 0);
|
assert(orig_cols % 4 == 0);
|
||||||
int64_t const original_row = blockIdx.x;
|
int64_t const original_row = blockIdx.x;
|
||||||
int64_t const num_rows = gridDim.x;
|
|
||||||
auto const offset = original_row * orig_cols;
|
auto const offset = original_row * orig_cols;
|
||||||
OutputType* reduced_row_ptr = reduced_unpermuted_output + offset;
|
OutputType* reduced_row_ptr = reduced_unpermuted_output + offset;
|
||||||
int64_t const num_valid = *num_valid_ptr;
|
int64_t const num_valid = *num_valid_ptr;
|
||||||
@ -159,14 +159,13 @@ __global__ void finalizeMoeRoutingKernel(
|
|||||||
ComputeElem thread_output;
|
ComputeElem thread_output;
|
||||||
thread_output.fill(0);
|
thread_output.fill(0);
|
||||||
for (int k_idx = 0; k_idx < k; ++k_idx) {
|
for (int k_idx = 0; k_idx < k; ++k_idx) {
|
||||||
int64_t const expanded_original_row = original_row + k_idx * num_rows;
|
int64_t const expanded_original_row = original_row * k + k_idx;
|
||||||
int64_t const expanded_permuted_row =
|
int64_t const expanded_permuted_row =
|
||||||
expanded_source_row_to_expanded_dest_row[expanded_original_row];
|
expanded_source_row_to_expanded_dest_row[expanded_original_row];
|
||||||
|
|
||||||
int64_t const k_offset = original_row * k + k_idx;
|
int64_t const k_offset = original_row * k + k_idx;
|
||||||
float const row_scale = scales[k_offset];
|
float const row_scale = scales[k_offset];
|
||||||
|
|
||||||
// Check after row_rescale has accumulated
|
|
||||||
if (CHECK_SKIPPED && expanded_permuted_row >= num_valid) {
|
if (CHECK_SKIPPED && expanded_permuted_row >= num_valid) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -189,9 +188,8 @@ template <class T, class OutputType>
|
|||||||
void finalizeMoeRoutingKernelLauncher(
|
void finalizeMoeRoutingKernelLauncher(
|
||||||
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
|
T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
|
||||||
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
|
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
|
||||||
int const* expert_for_source_row, int64_t const num_rows,
|
int64_t const num_rows, int64_t const cols, int64_t const k,
|
||||||
int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
|
int64_t const* num_valid_ptr, cudaStream_t stream) {
|
||||||
cudaStream_t stream) {
|
|
||||||
int64_t const blocks = num_rows;
|
int64_t const blocks = num_rows;
|
||||||
int64_t const threads = 256;
|
int64_t const threads = 256;
|
||||||
bool const check_finished = num_valid_ptr != nullptr;
|
bool const check_finished = num_valid_ptr != nullptr;
|
||||||
@ -201,6 +199,5 @@ void finalizeMoeRoutingKernelLauncher(
|
|||||||
auto* const kernel = func_map[check_finished];
|
auto* const kernel = func_map[check_finished];
|
||||||
kernel<<<blocks, threads, 0, stream>>>(
|
kernel<<<blocks, threads, 0, stream>>>(
|
||||||
expanded_permuted_rows, reduced_unpermuted_output, scales,
|
expanded_permuted_rows, reduced_unpermuted_output, scales,
|
||||||
expanded_source_row_to_expanded_dest_row, expert_for_source_row, cols, k,
|
expanded_source_row_to_expanded_dest_row, cols, k, num_valid_ptr);
|
||||||
num_valid_ptr);
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -190,8 +190,8 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
|||||||
2) This implementation assumes k is small, but will work for any k.
|
2) This implementation assumes k is small, but will work for any k.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, typename IndType>
|
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType>
|
||||||
__launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
|
__launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||||
void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices,
|
void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices,
|
||||||
int* source_rows, const int k, const int start_expert, const int end_expert)
|
int* source_rows, const int k, const int start_expert, const int end_expert)
|
||||||
{
|
{
|
||||||
@ -209,12 +209,12 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
|
|||||||
|
|
||||||
// Restrictions based on previous section.
|
// Restrictions based on previous section.
|
||||||
static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg");
|
static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg");
|
||||||
static_assert(WARP_SIZE % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
|
static_assert(WARP_SIZE_PARAM % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
|
||||||
static_assert(THREADS_PER_ROW == (THREADS_PER_ROW & -THREADS_PER_ROW), "THREADS_PER_ROW must be power of 2");
|
static_assert(THREADS_PER_ROW == (THREADS_PER_ROW & -THREADS_PER_ROW), "THREADS_PER_ROW must be power of 2");
|
||||||
static_assert(THREADS_PER_ROW <= WARP_SIZE, "THREADS_PER_ROW can be at most warp size");
|
static_assert(THREADS_PER_ROW <= WARP_SIZE_PARAM, "THREADS_PER_ROW can be at most warp size");
|
||||||
|
|
||||||
// We have NUM_EXPERTS elements per row. We specialize for small #experts
|
// We have NUM_EXPERTS elements per row. We specialize for small #experts
|
||||||
static constexpr int ELTS_PER_WARP = WARP_SIZE * VPT;
|
static constexpr int ELTS_PER_WARP = WARP_SIZE_PARAM * VPT;
|
||||||
static constexpr int ROWS_PER_WARP = ELTS_PER_WARP / ELTS_PER_ROW;
|
static constexpr int ROWS_PER_WARP = ELTS_PER_WARP / ELTS_PER_ROW;
|
||||||
static constexpr int ROWS_PER_CTA = WARPS_PER_CTA * ROWS_PER_WARP;
|
static constexpr int ROWS_PER_CTA = WARPS_PER_CTA * ROWS_PER_WARP;
|
||||||
|
|
||||||
@ -393,41 +393,51 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
|
|||||||
namespace detail
|
namespace detail
|
||||||
{
|
{
|
||||||
// Constructs some constants needed to partition the work across threads at compile time.
|
// Constructs some constants needed to partition the work across threads at compile time.
|
||||||
template <int EXPERTS, int BYTES_PER_LDG>
|
template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM>
|
||||||
struct TopkConstants
|
struct TopkConstants
|
||||||
{
|
{
|
||||||
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
|
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
|
||||||
static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE) == 0, "");
|
static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0, "");
|
||||||
static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE));
|
static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM));
|
||||||
static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
|
static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
|
||||||
static constexpr int THREADS_PER_ROW = EXPERTS / VPT;
|
static constexpr int THREADS_PER_ROW = EXPERTS / VPT;
|
||||||
static constexpr int ROWS_PER_WARP = WARP_SIZE / THREADS_PER_ROW;
|
static const int ROWS_PER_WARP = WARP_SIZE_PARAM / THREADS_PER_ROW;
|
||||||
};
|
};
|
||||||
} // namespace detail
|
} // namespace detail
|
||||||
|
|
||||||
template <int EXPERTS, int WARPS_PER_TB, typename IndType>
|
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, typename IndType>
|
||||||
void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
|
void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
|
||||||
int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
|
int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
|
||||||
{
|
{
|
||||||
static constexpr std::size_t MAX_BYTES_PER_LDG = 16;
|
static constexpr std::size_t MAX_BYTES_PER_LDG = 16;
|
||||||
|
|
||||||
static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
|
static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
|
||||||
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG>;
|
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
|
||||||
static constexpr int VPT = Constants::VPT;
|
static constexpr int VPT = Constants::VPT;
|
||||||
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
|
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
|
||||||
const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
|
const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
|
||||||
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
|
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
|
||||||
|
|
||||||
dim3 block_dim(WARP_SIZE, WARPS_PER_TB);
|
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
|
||||||
topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG><<<num_blocks, block_dim, 0, stream>>>(
|
topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM><<<num_blocks, block_dim, 0, stream>>>(
|
||||||
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
|
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
|
||||||
}
|
}
|
||||||
|
|
||||||
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \
|
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \
|
||||||
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB>( \
|
switch (warpSize) { \
|
||||||
gating_output, nullptr, topk_weights, topk_indices, \
|
case 32: \
|
||||||
token_expert_indices, num_tokens, topk, 0, num_experts, \
|
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32>( \
|
||||||
stream);
|
gating_output, nullptr, topk_weights, topk_indices, \
|
||||||
|
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
|
||||||
|
break; \
|
||||||
|
case 64: \
|
||||||
|
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64>( \
|
||||||
|
gating_output, nullptr, topk_weights, topk_indices, \
|
||||||
|
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
|
||||||
|
break; \
|
||||||
|
default: \
|
||||||
|
TORCH_CHECK(false, "Unsupported warp size: ", warpSize); \
|
||||||
|
}
|
||||||
|
|
||||||
template <typename IndType>
|
template <typename IndType>
|
||||||
void topkGatingSoftmaxKernelLauncher(
|
void topkGatingSoftmaxKernelLauncher(
|
||||||
@ -441,6 +451,7 @@ void topkGatingSoftmaxKernelLauncher(
|
|||||||
const int topk,
|
const int topk,
|
||||||
cudaStream_t stream) {
|
cudaStream_t stream) {
|
||||||
static constexpr int WARPS_PER_TB = 4;
|
static constexpr int WARPS_PER_TB = 4;
|
||||||
|
auto warpSize = WARP_SIZE;
|
||||||
switch (num_experts) {
|
switch (num_experts) {
|
||||||
case 1:
|
case 1:
|
||||||
LAUNCH_SOFTMAX(1, WARPS_PER_TB);
|
LAUNCH_SOFTMAX(1, WARPS_PER_TB);
|
||||||
|
|||||||
@ -56,18 +56,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
|||||||
" -> Tensor");
|
" -> Tensor");
|
||||||
|
|
||||||
m.def(
|
m.def(
|
||||||
"moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
|
"moe_permute(Tensor input, Tensor topk_ids,"
|
||||||
"Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
|
"Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
|
||||||
"int n_local_expert,"
|
"int n_local_expert,"
|
||||||
"int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
|
"int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
|
||||||
"expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "
|
"expert_first_token_offset, Tensor! inv_permuted_idx, Tensor! "
|
||||||
"m_indices)->()");
|
"permuted_idx, Tensor! m_indices)->()");
|
||||||
|
|
||||||
m.def(
|
m.def(
|
||||||
"moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights,"
|
"moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights,"
|
||||||
"Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor "
|
"Tensor inv_permuted_idx, Tensor? expert_first_token_offset, "
|
||||||
"expert_first_token_offset, int n_expert, int n_local_expert,int "
|
"int topk, Tensor! hidden_states)->()");
|
||||||
"topk, Tensor! hidden_states)->()");
|
|
||||||
|
|
||||||
m.def("moe_permute_unpermute_supported() -> bool");
|
m.def("moe_permute_unpermute_supported() -> bool");
|
||||||
m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
|
m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
|
||||||
|
|||||||
10
csrc/ops.h
@ -287,6 +287,16 @@ void scaled_fp4_experts_quant(
|
|||||||
torch::Tensor const& input, torch::Tensor const& input_global_scale,
|
torch::Tensor const& input, torch::Tensor const& input_global_scale,
|
||||||
torch::Tensor const& input_offset_by_experts,
|
torch::Tensor const& input_offset_by_experts,
|
||||||
torch::Tensor const& output_scale_offset_by_experts);
|
torch::Tensor const& output_scale_offset_by_experts);
|
||||||
|
|
||||||
|
void per_token_group_quant_fp8(const torch::Tensor& input,
|
||||||
|
torch::Tensor& output_q, torch::Tensor& output_s,
|
||||||
|
int64_t group_size, double eps, double fp8_min,
|
||||||
|
double fp8_max, bool scale_ue8m0);
|
||||||
|
|
||||||
|
void per_token_group_quant_int8(const torch::Tensor& input,
|
||||||
|
torch::Tensor& output_q,
|
||||||
|
torch::Tensor& output_s, int64_t group_size,
|
||||||
|
double eps, double int8_min, double int8_max);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
|
void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
|
||||||
|
|||||||
@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include "core/math.hpp"
|
#include "core/math.hpp"
|
||||||
#include "cuda_compat.h"
|
#include "../cuda_compat.h"
|
||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
|
|
||||||
#include "quantization/fp8/common.cuh"
|
#include "quantization/fp8/common.cuh"
|
||||||
|
|||||||
@ -1,6 +1,10 @@
|
|||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
#include <torch/all.h>
|
#include <torch/all.h>
|
||||||
|
|
||||||
|
#ifndef USE_ROCM
|
||||||
|
#include "../per_token_group_quant_8bit.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
||||||
#include "../../dispatch_utils.h"
|
#include "../../dispatch_utils.h"
|
||||||
@ -336,3 +340,13 @@ void dynamic_scaled_int8_quant(
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifndef USE_ROCM
|
||||||
|
void per_token_group_quant_int8(const torch::Tensor& input,
|
||||||
|
torch::Tensor& output_q,
|
||||||
|
torch::Tensor& output_s, int64_t group_size,
|
||||||
|
double eps, double int8_min, double int8_max) {
|
||||||
|
per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
|
||||||
|
int8_min, int8_max);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|||||||
@ -86,6 +86,7 @@ D = s_a s_b \widehat A \widehat B
|
|||||||
```
|
```
|
||||||
|
|
||||||
Epilogue parameters:
|
Epilogue parameters:
|
||||||
|
|
||||||
- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
|
- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
|
||||||
- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
|
- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
|
||||||
|
|
||||||
@ -135,7 +136,7 @@ That is precomputed and stored in `azp_with_adj` as a row-vector.
|
|||||||
Epilogue parameters:
|
Epilogue parameters:
|
||||||
|
|
||||||
- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
|
- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
|
||||||
- Generally this will be per-tensor as the zero-points are per-tensor.
|
- Generally this will be per-tensor as the zero-points are per-tensor.
|
||||||
- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
|
- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
|
||||||
- `azp_with_adj` is the precomputed zero-point term ($` z_a J_a \widehat B `$), is per-channel (row-vector).
|
- `azp_with_adj` is the precomputed zero-point term ($` z_a J_a \widehat B `$), is per-channel (row-vector).
|
||||||
- `bias` is the bias, is always per-channel (row-vector).
|
- `bias` is the bias, is always per-channel (row-vector).
|
||||||
@ -152,7 +153,7 @@ That means the zero-point term $` z_a J_a \widehat B `$ becomes an outer product
|
|||||||
Epilogue parameters:
|
Epilogue parameters:
|
||||||
|
|
||||||
- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
|
- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
|
||||||
- Generally this will be per-token as the zero-points are per-token.
|
- Generally this will be per-token as the zero-points are per-token.
|
||||||
- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
|
- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
|
||||||
- `azp_adj` is the precomputed zero-point adjustment term ($` \mathbf 1 \widehat B `$), is per-channel (row-vector).
|
- `azp_adj` is the precomputed zero-point adjustment term ($` \mathbf 1 \widehat B `$), is per-channel (row-vector).
|
||||||
- `azp` is the zero-point (`z_a`), is per-token (column-vector).
|
- `azp` is the zero-point (`z_a`), is per-token (column-vector).
|
||||||
|
|||||||
@ -1,6 +1,5 @@
|
|||||||
#include "scaled_mm_kernels.hpp"
|
#include "scaled_mm_kernels.hpp"
|
||||||
#include "scaled_mm_sm90_fp8_dispatch.cuh"
|
#include "scaled_mm_sm90_fp8_dispatch.cuh"
|
||||||
#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
|
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
@ -13,11 +12,11 @@ void cutlass_scaled_mm_sm90_fp8(torch::Tensor& out, torch::Tensor const& a,
|
|||||||
if (bias) {
|
if (bias) {
|
||||||
TORCH_CHECK(bias->dtype() == out.dtype(),
|
TORCH_CHECK(bias->dtype() == out.dtype(),
|
||||||
"currently bias dtype must match output dtype ", out.dtype());
|
"currently bias dtype must match output dtype ", out.dtype());
|
||||||
return cutlass_scaled_mm_sm90_fp8_epilogue<c3x::ScaledEpilogueBias>(
|
return cutlass_scaled_mm_sm90_fp8_epilogue<true>(out, a, b, a_scales,
|
||||||
out, a, b, a_scales, b_scales, *bias);
|
b_scales, *bias);
|
||||||
} else {
|
} else {
|
||||||
return cutlass_scaled_mm_sm90_fp8_epilogue<c3x::ScaledEpilogue>(
|
return cutlass_scaled_mm_sm90_fp8_epilogue<false>(out, a, b, a_scales,
|
||||||
out, a, b, a_scales, b_scales);
|
b_scales);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
#include "scaled_mm.cuh"
|
#include "scaled_mm.cuh"
|
||||||
#include "cutlass_gemm_caller.cuh"
|
#include "cutlass_gemm_caller.cuh"
|
||||||
|
#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This file defines Gemm kernel configurations for SM90 (fp8) based on the Gemm
|
* This file defines Gemm kernel configurations for SM90 (fp8) based on the Gemm
|
||||||
@ -12,8 +13,91 @@ namespace vllm {
|
|||||||
|
|
||||||
using c3x::cutlass_gemm_caller;
|
using c3x::cutlass_gemm_caller;
|
||||||
|
|
||||||
template <typename InType, typename OutType,
|
template <typename ElementAB_, typename ElementD_,
|
||||||
template <typename, typename, typename> typename Epilogue>
|
template <typename, typename, typename> typename Epilogue_,
|
||||||
|
typename TileShape, typename ClusterShape, typename KernelSchedule,
|
||||||
|
typename EpilogueSchedule, bool swap_ab_ = false>
|
||||||
|
struct cutlass_3x_gemm_sm90_fp8 {
|
||||||
|
using ElementAB = ElementAB_;
|
||||||
|
using ElementC = ElementD_;
|
||||||
|
using ElementD = ElementD_;
|
||||||
|
using ElementAcc =
|
||||||
|
typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
|
||||||
|
float>::type;
|
||||||
|
|
||||||
|
using Epilogue = Epilogue_<ElementAcc, ElementD, TileShape>;
|
||||||
|
|
||||||
|
using EVTCompute = typename Epilogue::EVTCompute;
|
||||||
|
|
||||||
|
static constexpr int AlignmentAB =
|
||||||
|
128 / cutlass::sizeof_bits<ElementAB>::value;
|
||||||
|
static constexpr int AlignmentCD =
|
||||||
|
128 / cutlass::sizeof_bits<ElementD>::value;
|
||||||
|
|
||||||
|
// Compile-time swap_ab flag
|
||||||
|
static constexpr bool swap_ab = swap_ab_;
|
||||||
|
|
||||||
|
// -----------------------------------------------------------
|
||||||
|
// Layout definitions
|
||||||
|
// -----------------------------------------------------------
|
||||||
|
using LayoutA = cutlass::layout::RowMajor;
|
||||||
|
using LayoutA_T = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
|
||||||
|
|
||||||
|
using LayoutB = cutlass::layout::ColumnMajor;
|
||||||
|
using LayoutB_T = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
|
||||||
|
|
||||||
|
using LayoutD = cutlass::layout::RowMajor;
|
||||||
|
using LayoutD_Transpose =
|
||||||
|
typename cutlass::layout::LayoutTranspose<LayoutD>::type;
|
||||||
|
|
||||||
|
using LayoutC = LayoutD;
|
||||||
|
using LayoutC_Transpose = LayoutD_Transpose;
|
||||||
|
|
||||||
|
// -----------------------------------------------------------
|
||||||
|
// Collective epilogue (conditionally swap operands and layouts)
|
||||||
|
// -----------------------------------------------------------
|
||||||
|
using CollectiveEpilogue =
|
||||||
|
typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||||
|
cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape,
|
||||||
|
ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
|
||||||
|
ElementAcc, float, ElementC,
|
||||||
|
conditional_t<swap_ab, LayoutC_Transpose, LayoutC>, AlignmentCD,
|
||||||
|
ElementD, conditional_t<swap_ab, LayoutD_Transpose, LayoutD>,
|
||||||
|
AlignmentCD, EpilogueSchedule, EVTCompute>::CollectiveOp;
|
||||||
|
|
||||||
|
static constexpr size_t CEStorageSize =
|
||||||
|
sizeof(typename CollectiveEpilogue::SharedStorage);
|
||||||
|
|
||||||
|
using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout<
|
||||||
|
static_cast<int>(CEStorageSize)>;
|
||||||
|
|
||||||
|
// -----------------------------------------------------------
|
||||||
|
// Collective mainloop (conditionally swap operands and layouts)
|
||||||
|
// -----------------------------------------------------------
|
||||||
|
using CollectiveMainloop = conditional_t<
|
||||||
|
swap_ab,
|
||||||
|
typename cutlass::gemm::collective::CollectiveBuilder<
|
||||||
|
cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, ElementAB,
|
||||||
|
LayoutB_T, AlignmentAB, // Swapped B (as A)
|
||||||
|
ElementAB, LayoutA_T, AlignmentAB, // Swapped A (as B)
|
||||||
|
ElementAcc, TileShape, ClusterShape, Stages,
|
||||||
|
KernelSchedule>::CollectiveOp,
|
||||||
|
typename cutlass::gemm::collective::CollectiveBuilder<
|
||||||
|
cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, ElementAB,
|
||||||
|
LayoutA, AlignmentAB, ElementAB, LayoutB, AlignmentAB, ElementAcc,
|
||||||
|
TileShape, ClusterShape, Stages, KernelSchedule>::CollectiveOp>;
|
||||||
|
|
||||||
|
// -----------------------------------------------------------
|
||||||
|
// Kernel definition
|
||||||
|
// -----------------------------------------------------------
|
||||||
|
using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
|
||||||
|
cute::Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue,
|
||||||
|
cutlass::gemm::PersistentScheduler>>;
|
||||||
|
|
||||||
|
struct GemmKernel : public KernelType {};
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename InType, typename OutType, bool EnableBias>
|
||||||
struct sm90_fp8_config_default {
|
struct sm90_fp8_config_default {
|
||||||
// M in (128, inf)
|
// M in (128, inf)
|
||||||
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
||||||
@ -22,13 +106,17 @@ struct sm90_fp8_config_default {
|
|||||||
using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
|
using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
|
||||||
using TileShape = Shape<_128, _128, _128>;
|
using TileShape = Shape<_128, _128, _128>;
|
||||||
using ClusterShape = Shape<_2, _1, _1>;
|
using ClusterShape = Shape<_2, _1, _1>;
|
||||||
using Cutlass3xGemm =
|
|
||||||
cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
|
using Cutlass3xGemm = conditional_t<
|
||||||
KernelSchedule, EpilogueSchedule>;
|
EnableBias,
|
||||||
|
cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueBias,
|
||||||
|
TileShape, ClusterShape, KernelSchedule,
|
||||||
|
EpilogueSchedule>,
|
||||||
|
cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
|
||||||
|
ClusterShape, KernelSchedule, EpilogueSchedule>>;
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename InType, typename OutType,
|
template <typename InType, typename OutType, bool EnableBias>
|
||||||
template <typename, typename, typename> typename Epilogue>
|
|
||||||
struct sm90_fp8_config_M128 {
|
struct sm90_fp8_config_M128 {
|
||||||
// M in (64, 128]
|
// M in (64, 128]
|
||||||
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
||||||
@ -37,33 +125,146 @@ struct sm90_fp8_config_M128 {
|
|||||||
using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
|
using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
|
||||||
using TileShape = Shape<_64, _128, _128>;
|
using TileShape = Shape<_64, _128, _128>;
|
||||||
using ClusterShape = Shape<_2, _1, _1>;
|
using ClusterShape = Shape<_2, _1, _1>;
|
||||||
using Cutlass3xGemm =
|
using Cutlass3xGemm = conditional_t<
|
||||||
cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
|
EnableBias,
|
||||||
KernelSchedule, EpilogueSchedule>;
|
cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueBias,
|
||||||
|
TileShape, ClusterShape, KernelSchedule,
|
||||||
|
EpilogueSchedule>,
|
||||||
|
cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
|
||||||
|
ClusterShape, KernelSchedule, EpilogueSchedule>>;
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename InType, typename OutType,
|
template <typename InType, typename OutType, bool EnableBias>
|
||||||
template <typename, typename, typename> typename Epilogue>
|
struct sm90_fp8_config_M64_N1280 {
|
||||||
struct sm90_fp8_config_M64 {
|
// M in (16, 64], N in [1, 1280]
|
||||||
// M in [1, 64]
|
|
||||||
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
||||||
using KernelSchedule =
|
using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
|
||||||
cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
|
|
||||||
using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
|
using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
|
||||||
using TileShape = Shape<_64, _64, _128>;
|
using TileShape = Shape<_64, _16, _256>;
|
||||||
using ClusterShape = Shape<_1, _8, _1>;
|
using ClusterShape = Shape<_1, _4, _1>;
|
||||||
|
|
||||||
using Cutlass3xGemm =
|
// enable swap AB for M <= 64
|
||||||
cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
|
using Cutlass3xGemm = conditional_t<
|
||||||
KernelSchedule, EpilogueSchedule>;
|
EnableBias,
|
||||||
|
cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
|
||||||
|
TileShape, ClusterShape, KernelSchedule,
|
||||||
|
EpilogueSchedule, true>,
|
||||||
|
cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
|
||||||
|
ClusterShape, KernelSchedule, EpilogueSchedule,
|
||||||
|
true>>;
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename InType, typename OutType,
|
template <typename InType, typename OutType, bool EnableBias>
|
||||||
template <typename, typename, typename> typename Epilogue,
|
struct sm90_fp8_config_M64_N8192 {
|
||||||
|
// M in (16, 64], N > 1280
|
||||||
|
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
||||||
|
using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
|
||||||
|
using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
|
||||||
|
using TileShape = Shape<_64, _64, _256>;
|
||||||
|
using ClusterShape = Shape<_1, _1, _1>;
|
||||||
|
|
||||||
|
// enable swap AB for M <= 64
|
||||||
|
using Cutlass3xGemm = conditional_t<
|
||||||
|
EnableBias,
|
||||||
|
cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
|
||||||
|
TileShape, ClusterShape, KernelSchedule,
|
||||||
|
EpilogueSchedule, true>,
|
||||||
|
cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
|
||||||
|
ClusterShape, KernelSchedule, EpilogueSchedule,
|
||||||
|
true>>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename InType, typename OutType, bool EnableBias>
|
||||||
|
struct sm90_fp8_config_M16_N1280 {
|
||||||
|
// M in [1, 16], N in [1, 1280]
|
||||||
|
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
||||||
|
using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
|
||||||
|
using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
|
||||||
|
using TileShape = Shape<_64, _16, _256>;
|
||||||
|
using ClusterShape = Shape<_1, _2, _1>;
|
||||||
|
|
||||||
|
// enable swap AB for M < 64
|
||||||
|
using Cutlass3xGemm = conditional_t<
|
||||||
|
EnableBias,
|
||||||
|
cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
|
||||||
|
TileShape, ClusterShape, KernelSchedule,
|
||||||
|
EpilogueSchedule, true>,
|
||||||
|
cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
|
||||||
|
ClusterShape, KernelSchedule, EpilogueSchedule,
|
||||||
|
true>>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename InType, typename OutType, bool EnableBias>
|
||||||
|
struct sm90_fp8_config_M16_N8192 {
|
||||||
|
// M in [1, 16], N > 1280
|
||||||
|
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
||||||
|
using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
|
||||||
|
using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
|
||||||
|
using TileShape = Shape<_64, _16, _256>;
|
||||||
|
using ClusterShape = Shape<_1, _1, _1>;
|
||||||
|
|
||||||
|
// enable swap AB for M < 64
|
||||||
|
using Cutlass3xGemm = conditional_t<
|
||||||
|
EnableBias,
|
||||||
|
cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
|
||||||
|
TileShape, ClusterShape, KernelSchedule,
|
||||||
|
EpilogueSchedule, true>,
|
||||||
|
cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
|
||||||
|
ClusterShape, KernelSchedule, EpilogueSchedule,
|
||||||
|
true>>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Gemm, typename... EpilogueArgs>
|
||||||
|
void cutlass_gemm_caller_sm90_fp8(torch::Tensor& out, torch::Tensor const& a,
|
||||||
|
torch::Tensor const& b,
|
||||||
|
EpilogueArgs&&... epilogue_params) {
|
||||||
|
static constexpr bool swap_ab = Gemm::swap_ab;
|
||||||
|
using ElementAB = typename Gemm::ElementAB;
|
||||||
|
using ElementD = typename Gemm::ElementD;
|
||||||
|
using GemmKernel = typename Gemm::GemmKernel;
|
||||||
|
|
||||||
|
using StrideA = typename Gemm::GemmKernel::StrideA;
|
||||||
|
using StrideB = typename Gemm::GemmKernel::StrideB;
|
||||||
|
using StrideC = typename Gemm::GemmKernel::StrideC;
|
||||||
|
|
||||||
|
int32_t m = a.size(0), n = b.size(1), k = a.size(1);
|
||||||
|
auto prob_shape =
|
||||||
|
swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
|
||||||
|
|
||||||
|
StrideA a_stride =
|
||||||
|
cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
|
||||||
|
StrideB b_stride =
|
||||||
|
cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
|
||||||
|
StrideC c_stride = cutlass::make_cute_packed_stride(
|
||||||
|
StrideC{},
|
||||||
|
swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1));
|
||||||
|
|
||||||
|
auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
|
||||||
|
auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
|
||||||
|
auto c_ptr = static_cast<ElementD*>(out.data_ptr());
|
||||||
|
|
||||||
|
typename GemmKernel::MainloopArguments mainloop_args =
|
||||||
|
swap_ab ? typename GemmKernel::MainloopArguments{b_ptr, b_stride, a_ptr,
|
||||||
|
a_stride}
|
||||||
|
: typename GemmKernel::MainloopArguments{a_ptr, a_stride, b_ptr,
|
||||||
|
b_stride};
|
||||||
|
|
||||||
|
typename GemmKernel::EpilogueArguments epilogue_args{
|
||||||
|
Gemm::Epilogue::prepare_args(
|
||||||
|
std::forward<EpilogueArgs>(epilogue_params)...),
|
||||||
|
c_ptr, c_stride, c_ptr, c_stride};
|
||||||
|
|
||||||
|
c3x::cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
|
||||||
|
epilogue_args);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename InType, typename OutType, bool EnableBias,
|
||||||
typename... EpilogueArgs>
|
typename... EpilogueArgs>
|
||||||
inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
|
inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
|
||||||
torch::Tensor const& a,
|
torch::Tensor const& a,
|
||||||
torch::Tensor const& b,
|
torch::Tensor const& b,
|
||||||
|
torch::Tensor const& a_scales,
|
||||||
|
torch::Tensor const& b_scales,
|
||||||
EpilogueArgs&&... args) {
|
EpilogueArgs&&... args) {
|
||||||
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
||||||
TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
|
TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
|
||||||
@ -71,50 +272,75 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
|
|||||||
|
|
||||||
using Cutlass3xGemmDefault =
|
using Cutlass3xGemmDefault =
|
||||||
typename sm90_fp8_config_default<InType, OutType,
|
typename sm90_fp8_config_default<InType, OutType,
|
||||||
Epilogue>::Cutlass3xGemm;
|
EnableBias>::Cutlass3xGemm;
|
||||||
using Cutlass3xGemmM64 =
|
|
||||||
typename sm90_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
|
|
||||||
using Cutlass3xGemmM128 =
|
using Cutlass3xGemmM128 =
|
||||||
typename sm90_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
|
typename sm90_fp8_config_M128<InType, OutType, EnableBias>::Cutlass3xGemm;
|
||||||
|
|
||||||
|
using Cutlass3xGemmM64_N1280 =
|
||||||
|
typename sm90_fp8_config_M64_N1280<InType, OutType,
|
||||||
|
EnableBias>::Cutlass3xGemm;
|
||||||
|
using Cutlass3xGemmM64_N8192 =
|
||||||
|
typename sm90_fp8_config_M64_N8192<InType, OutType,
|
||||||
|
EnableBias>::Cutlass3xGemm;
|
||||||
|
using Cutlass3xGemmM16_N1280 =
|
||||||
|
typename sm90_fp8_config_M16_N1280<InType, OutType,
|
||||||
|
EnableBias>::Cutlass3xGemm;
|
||||||
|
using Cutlass3xGemmM16_N8192 =
|
||||||
|
typename sm90_fp8_config_M16_N8192<InType, OutType,
|
||||||
|
EnableBias>::Cutlass3xGemm;
|
||||||
|
|
||||||
uint32_t const m = a.size(0);
|
uint32_t const m = a.size(0);
|
||||||
uint32_t const mp2 =
|
uint32_t const n = b.size(1);
|
||||||
std::max(static_cast<uint32_t>(64), next_pow_2(m)); // next power of 2
|
|
||||||
|
|
||||||
if (mp2 <= 64) {
|
if (m <= 16) {
|
||||||
// m in [1, 64]
|
// m in [1, 16]
|
||||||
return cutlass_gemm_caller<Cutlass3xGemmM64>(
|
if (n <= 1280) {
|
||||||
out, a, b, std::forward<EpilogueArgs>(args)...);
|
return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM16_N1280>(
|
||||||
} else if (mp2 <= 128) {
|
out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
|
||||||
|
}
|
||||||
|
return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM16_N8192>(
|
||||||
|
out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
|
||||||
|
} else if (m <= 64) {
|
||||||
|
// m in (16, 64]
|
||||||
|
if (n <= 1280) {
|
||||||
|
return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM64_N1280>(
|
||||||
|
out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
|
||||||
|
}
|
||||||
|
return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM64_N8192>(
|
||||||
|
out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
|
||||||
|
} else if (m <= 128) {
|
||||||
// m in (64, 128]
|
// m in (64, 128]
|
||||||
return cutlass_gemm_caller<Cutlass3xGemmM128>(
|
return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM128>(
|
||||||
out, a, b, std::forward<EpilogueArgs>(args)...);
|
out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
|
||||||
} else {
|
} else {
|
||||||
// m in (128, inf)
|
// m in (128, inf)
|
||||||
return cutlass_gemm_caller<Cutlass3xGemmDefault>(
|
return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmDefault>(
|
||||||
out, a, b, std::forward<EpilogueArgs>(args)...);
|
out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <template <typename, typename, typename> typename Epilogue,
|
template <bool EnableBias, typename... EpilogueArgs>
|
||||||
typename... EpilogueArgs>
|
|
||||||
void cutlass_scaled_mm_sm90_fp8_epilogue(torch::Tensor& out,
|
void cutlass_scaled_mm_sm90_fp8_epilogue(torch::Tensor& out,
|
||||||
torch::Tensor const& a,
|
torch::Tensor const& a,
|
||||||
torch::Tensor const& b,
|
torch::Tensor const& b,
|
||||||
|
torch::Tensor const& a_scales,
|
||||||
|
torch::Tensor const& b_scales,
|
||||||
EpilogueArgs&&... epilogue_args) {
|
EpilogueArgs&&... epilogue_args) {
|
||||||
TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
|
TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
|
||||||
TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
|
TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
|
||||||
|
|
||||||
if (out.dtype() == torch::kBFloat16) {
|
if (out.dtype() == torch::kBFloat16) {
|
||||||
return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
|
return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
|
||||||
cutlass::bfloat16_t, Epilogue>(
|
cutlass::bfloat16_t, EnableBias>(
|
||||||
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
out, a, b, a_scales, b_scales,
|
||||||
|
std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
} else {
|
} else {
|
||||||
TORCH_CHECK(out.dtype() == torch::kFloat16);
|
TORCH_CHECK(out.dtype() == torch::kFloat16);
|
||||||
return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
|
return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
|
||||||
cutlass::half_t, Epilogue>(
|
cutlass::half_t, EnableBias>(
|
||||||
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
out, a, b, a_scales, b_scales,
|
||||||
|
std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
|||||||
@ -18,7 +18,6 @@ using ProblemShape =
|
|||||||
cutlass::gemm::GroupProblemShape<cute::Shape<int, int, int>>;
|
cutlass::gemm::GroupProblemShape<cute::Shape<int, int, int>>;
|
||||||
|
|
||||||
using ElementAccumulator = float;
|
using ElementAccumulator = float;
|
||||||
using ArchTag = cutlass::arch::Sm90;
|
|
||||||
using OperatorClass = cutlass::arch::OpClassTensorOp;
|
using OperatorClass = cutlass::arch::OpClassTensorOp;
|
||||||
|
|
||||||
using LayoutA = cutlass::layout::RowMajor;
|
using LayoutA = cutlass::layout::RowMajor;
|
||||||
@ -33,7 +32,7 @@ using LayoutD_Transpose =
|
|||||||
using LayoutC = LayoutD;
|
using LayoutC = LayoutD;
|
||||||
using LayoutC_Transpose = LayoutD_Transpose;
|
using LayoutC_Transpose = LayoutD_Transpose;
|
||||||
|
|
||||||
template <typename ElementAB_, typename ElementC_,
|
template <typename ElementAB_, typename ElementC_, typename ArchTag_,
|
||||||
template <typename, typename, typename> typename Epilogue_,
|
template <typename, typename, typename> typename Epilogue_,
|
||||||
typename TileShape, typename ClusterShape, typename KernelSchedule,
|
typename TileShape, typename ClusterShape, typename KernelSchedule,
|
||||||
typename EpilogueSchedule, bool swap_ab_ = false>
|
typename EpilogueSchedule, bool swap_ab_ = false>
|
||||||
@ -43,6 +42,7 @@ struct cutlass_3x_group_gemm {
|
|||||||
using ElementC = void;
|
using ElementC = void;
|
||||||
using ElementD = ElementC_;
|
using ElementD = ElementC_;
|
||||||
using ElementAccumulator = float;
|
using ElementAccumulator = float;
|
||||||
|
using ArchTag = ArchTag_;
|
||||||
|
|
||||||
using Epilogue = Epilogue_<ElementAccumulator, ElementD, TileShape>;
|
using Epilogue = Epilogue_<ElementAccumulator, ElementD, TileShape>;
|
||||||
|
|
||||||
@ -77,7 +77,7 @@ struct cutlass_3x_group_gemm {
|
|||||||
LayoutB*, AlignmentAB, ElementAccumulator, TileShape, ClusterShape,
|
LayoutB*, AlignmentAB, ElementAccumulator, TileShape, ClusterShape,
|
||||||
Stages, KernelSchedule>::CollectiveOp>;
|
Stages, KernelSchedule>::CollectiveOp>;
|
||||||
|
|
||||||
using KernelType = enable_sm90_only<cutlass::gemm::kernel::GemmUniversal<
|
using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
|
||||||
ProblemShape, CollectiveMainloop, CollectiveEpilogue>>;
|
ProblemShape, CollectiveMainloop, CollectiveEpilogue>>;
|
||||||
|
|
||||||
struct GemmKernel : public KernelType {};
|
struct GemmKernel : public KernelType {};
|
||||||
@ -156,9 +156,14 @@ void cutlass_group_gemm_caller(
|
|||||||
static_cast<ElementD**>(out_ptrs.data_ptr()),
|
static_cast<ElementD**>(out_ptrs.data_ptr()),
|
||||||
static_cast<StrideC*>(c_strides.data_ptr())};
|
static_cast<StrideC*>(c_strides.data_ptr())};
|
||||||
|
|
||||||
|
int device_id = a_tensors.device().index();
|
||||||
|
static const cutlass::KernelHardwareInfo hw_info{
|
||||||
|
device_id, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
|
||||||
|
device_id)};
|
||||||
|
|
||||||
typename GemmKernel::Arguments args{
|
typename GemmKernel::Arguments args{
|
||||||
cutlass::gemm::GemmUniversalMode::kGrouped, prob_shape, mainloop_args,
|
cutlass::gemm::GemmUniversalMode::kGrouped, prob_shape, mainloop_args,
|
||||||
epilogue_args};
|
epilogue_args, hw_info};
|
||||||
|
|
||||||
using GemmOp = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
using GemmOp = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||||
GemmOp gemm_op;
|
GemmOp gemm_op;
|
||||||
|
|||||||
140
csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu
Normal file
@ -0,0 +1,140 @@
|
|||||||
|
#include <cudaTypedefs.h>
|
||||||
|
|
||||||
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
#include <torch/all.h>
|
||||||
|
|
||||||
|
#include "cutlass/cutlass.h"
|
||||||
|
#include "grouped_mm_c3x.cuh"
|
||||||
|
|
||||||
|
using namespace cute;
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
template <typename InType, typename OutType,
|
||||||
|
template <typename, typename, typename> typename Epilogue>
|
||||||
|
struct sm100_fp8_config_default {
|
||||||
|
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
||||||
|
using KernelSchedule =
|
||||||
|
cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
|
||||||
|
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
|
||||||
|
using TileShape = cute::Shape<cute::_128, cute::_256, cute::_128>;
|
||||||
|
using ClusterShape = cute::Shape<cute::_1, cute::_1, cute::_1>;
|
||||||
|
using ArchTag = cutlass::arch::Sm100;
|
||||||
|
|
||||||
|
using Cutlass3xGemm =
|
||||||
|
cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
|
||||||
|
ClusterShape, KernelSchedule, EpilogueSchedule>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename InType, typename OutType,
|
||||||
|
template <typename, typename, typename> typename Epilogue>
|
||||||
|
struct sm100_fp8_config_M64 {
|
||||||
|
// M in [1,64]
|
||||||
|
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
||||||
|
using KernelSchedule =
|
||||||
|
cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
|
||||||
|
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
|
||||||
|
using TileShape = cute::Shape<cute::_128, cute::_16, cute::_128>;
|
||||||
|
using ClusterShape = cute::Shape<cute::_1, cute::_1, cute::_1>;
|
||||||
|
using ArchTag = cutlass::arch::Sm100;
|
||||||
|
|
||||||
|
using Cutlass3xGemm =
|
||||||
|
cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
|
||||||
|
ClusterShape, KernelSchedule, EpilogueSchedule,
|
||||||
|
true>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename InType, typename OutType,
|
||||||
|
template <typename, typename, typename> typename Epilogue>
|
||||||
|
struct sm100_fp8_config_N8192 {
|
||||||
|
// N in [8192, inf)
|
||||||
|
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
||||||
|
using KernelSchedule =
|
||||||
|
cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
|
||||||
|
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
|
||||||
|
using TileShape = cute::Shape<cute::_128, cute::_256, cute::_128>;
|
||||||
|
using ClusterShape = cute::Shape<cute::_2, cute::_1, cute::_1>;
|
||||||
|
using ArchTag = cutlass::arch::Sm100;
|
||||||
|
|
||||||
|
using Cutlass3xGemm =
|
||||||
|
cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
|
||||||
|
ClusterShape, KernelSchedule, EpilogueSchedule>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename InType, typename OutType>
|
||||||
|
void run_cutlass_moe_mm_sm100(
|
||||||
|
torch::Tensor& out_tensors, torch::Tensor const& a_tensors,
|
||||||
|
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
|
||||||
|
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
|
||||||
|
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
|
||||||
|
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
|
||||||
|
bool per_act_token, bool per_out_ch) {
|
||||||
|
TORCH_CHECK(a_tensors.size(0) > 0, "No input A tensors provided.");
|
||||||
|
TORCH_CHECK(b_tensors.size(0) > 0, "No input B tensors provided.");
|
||||||
|
TORCH_CHECK(out_tensors.size(0) > 0, "No output tensors provided.");
|
||||||
|
|
||||||
|
TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn,
|
||||||
|
"A tensors must be of type float8_e4m3fn.");
|
||||||
|
TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn,
|
||||||
|
"B tensors must be of type float8_e4m3fn.");
|
||||||
|
|
||||||
|
using Cutlass3xGemmDefault = typename sm100_fp8_config_default<
|
||||||
|
InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
|
||||||
|
using Cutlass3xGemmN8192 = typename sm100_fp8_config_N8192<
|
||||||
|
InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
|
||||||
|
using Cutlass3xGemmM64 = typename sm100_fp8_config_M64<
|
||||||
|
InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
|
||||||
|
|
||||||
|
uint32_t const m = a_tensors.size(0);
|
||||||
|
uint32_t const n = out_tensors.size(1);
|
||||||
|
|
||||||
|
if (m <= 64) {
|
||||||
|
cutlass_group_gemm_caller<Cutlass3xGemmM64>(
|
||||||
|
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
|
||||||
|
problem_sizes, a_strides, b_strides, c_strides, per_act_token,
|
||||||
|
per_out_ch);
|
||||||
|
} else if (n >= 8192) {
|
||||||
|
cutlass_group_gemm_caller<Cutlass3xGemmN8192>(
|
||||||
|
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
|
||||||
|
problem_sizes, a_strides, b_strides, c_strides, per_act_token,
|
||||||
|
per_out_ch);
|
||||||
|
} else {
|
||||||
|
cutlass_group_gemm_caller<Cutlass3xGemmDefault>(
|
||||||
|
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
|
||||||
|
problem_sizes, a_strides, b_strides, c_strides, per_act_token,
|
||||||
|
per_out_ch);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
// Selects the output element type for the SM100 FP8 grouped GEMM from the
// dtype of `out_tensors` and forwards all arguments to
// run_cutlass_moe_mm_sm100.
//
//   bfloat16 output -> cutlass::bfloat16_t
//   float16 output  -> cutlass::half_t
//
// Any other output dtype is rejected with a TORCH_CHECK failure instead of
// being silently treated as half precision, matching the dtype check done by
// the dense FP8 scaled-mm dispatch path.
void dispatch_moe_mm_sm100(
    torch::Tensor& out_tensors, torch::Tensor const& a_tensors,
    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
    bool per_act_token, bool per_out_ch) {
  if (out_tensors.dtype() == torch::kBFloat16) {
    run_cutlass_moe_mm_sm100<cutlass::float_e4m3_t, cutlass::bfloat16_t>(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
        per_out_ch);
  } else {
    // The half_t instantiation reinterprets the output buffer as fp16, so
    // make sure that is what the caller actually allocated.
    TORCH_CHECK(out_tensors.dtype() == torch::kFloat16,
                "Output tensors must be of type bfloat16 or float16.");
    run_cutlass_moe_mm_sm100<cutlass::float_e4m3_t, cutlass::half_t>(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
        per_out_ch);
  }
}
|
||||||
|
|
||||||
|
// Public entry point for the SM100 FP8 MoE grouped GEMM.
//
// Thin wrapper: every argument is passed, in order and unchanged, to
// dispatch_moe_mm_sm100, which picks the kernel instantiation.
void cutlass_moe_mm_sm100(
    torch::Tensor& out_tensors, torch::Tensor const& a_tensors,
    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
    bool per_act_token, bool per_out_ch) {
  return dispatch_moe_mm_sm100(
      out_tensors, a_tensors, b_tensors,
      a_scales, b_scales,
      expert_offsets, problem_sizes,
      a_strides, b_strides, c_strides,
      per_act_token, per_out_ch);
}
|
||||||
@ -21,10 +21,11 @@ struct sm90_fp8_config_default {
|
|||||||
cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
|
cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
|
||||||
using TileShape = cute::Shape<cute::_64, cute::_256, cute::_128>;
|
using TileShape = cute::Shape<cute::_64, cute::_256, cute::_128>;
|
||||||
using ClusterShape = cute::Shape<cute::_1, cute::_2, cute::_1>;
|
using ClusterShape = cute::Shape<cute::_1, cute::_2, cute::_1>;
|
||||||
|
using ArchTag = cutlass::arch::Sm90;
|
||||||
|
|
||||||
using Cutlass3xGemm =
|
using Cutlass3xGemm =
|
||||||
cutlass_3x_group_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
|
cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
|
||||||
KernelSchedule, EpilogueSchedule>;
|
ClusterShape, KernelSchedule, EpilogueSchedule>;
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename InType, typename OutType,
|
template <typename InType, typename OutType,
|
||||||
@ -38,10 +39,12 @@ struct sm90_fp8_config_M4 {
|
|||||||
cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
|
cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
|
||||||
using TileShape = cute::Shape<cute::_128, cute::_16, cute::_128>;
|
using TileShape = cute::Shape<cute::_128, cute::_16, cute::_128>;
|
||||||
using ClusterShape = cute::Shape<cute::_1, cute::_1, cute::_1>;
|
using ClusterShape = cute::Shape<cute::_1, cute::_1, cute::_1>;
|
||||||
|
using ArchTag = cutlass::arch::Sm90;
|
||||||
|
|
||||||
using Cutlass3xGemm =
|
using Cutlass3xGemm =
|
||||||
cutlass_3x_group_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
|
cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
|
||||||
KernelSchedule, EpilogueSchedule, true>;
|
ClusterShape, KernelSchedule, EpilogueSchedule,
|
||||||
|
true>;
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename InType, typename OutType,
|
template <typename InType, typename OutType,
|
||||||
@ -55,10 +58,12 @@ struct sm90_fp8_config_M64 {
|
|||||||
cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
|
cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
|
||||||
using TileShape = cute::Shape<cute::_128, cute::_16, cute::_256>;
|
using TileShape = cute::Shape<cute::_128, cute::_16, cute::_256>;
|
||||||
using ClusterShape = cute::Shape<cute::_2, cute::_1, cute::_1>;
|
using ClusterShape = cute::Shape<cute::_2, cute::_1, cute::_1>;
|
||||||
|
using ArchTag = cutlass::arch::Sm90;
|
||||||
|
|
||||||
using Cutlass3xGemm =
|
using Cutlass3xGemm =
|
||||||
cutlass_3x_group_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
|
cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
|
||||||
KernelSchedule, EpilogueSchedule, true>;
|
ClusterShape, KernelSchedule, EpilogueSchedule,
|
||||||
|
true>;
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename InType, typename OutType,
|
template <typename InType, typename OutType,
|
||||||
@ -72,10 +77,11 @@ struct sm90_fp8_config_K8192 {
|
|||||||
cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
|
cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
|
||||||
using TileShape = cute::Shape<cute::_128, cute::_128, cute::_128>;
|
using TileShape = cute::Shape<cute::_128, cute::_128, cute::_128>;
|
||||||
using ClusterShape = cute::Shape<cute::_1, cute::_8, cute::_1>;
|
using ClusterShape = cute::Shape<cute::_1, cute::_8, cute::_1>;
|
||||||
|
using ArchTag = cutlass::arch::Sm90;
|
||||||
|
|
||||||
using Cutlass3xGemm =
|
using Cutlass3xGemm =
|
||||||
cutlass_3x_group_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
|
cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
|
||||||
KernelSchedule, EpilogueSchedule>;
|
ClusterShape, KernelSchedule, EpilogueSchedule>;
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename InType, typename OutType,
|
template <typename InType, typename OutType,
|
||||||
@ -89,10 +95,11 @@ struct sm90_fp8_config_N8192 {
|
|||||||
cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
|
cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
|
||||||
using TileShape = cute::Shape<cute::_64, cute::_128, cute::_256>;
|
using TileShape = cute::Shape<cute::_64, cute::_128, cute::_256>;
|
||||||
using ClusterShape = cute::Shape<cute::_1, cute::_8, cute::_1>;
|
using ClusterShape = cute::Shape<cute::_1, cute::_8, cute::_1>;
|
||||||
|
using ArchTag = cutlass::arch::Sm90;
|
||||||
|
|
||||||
using Cutlass3xGemm =
|
using Cutlass3xGemm =
|
||||||
cutlass_3x_group_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
|
cutlass_3x_group_gemm<InType, OutType, ArchTag, Epilogue, TileShape,
|
||||||
KernelSchedule, EpilogueSchedule>;
|
ClusterShape, KernelSchedule, EpilogueSchedule>;
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename InType, typename OutType>
|
template <typename InType, typename OutType>
|
||||||
@ -112,9 +119,6 @@ void run_cutlass_moe_mm_sm90(
|
|||||||
TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn,
|
TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn,
|
||||||
"B tensors must be of type float8_e4m3fn.");
|
"B tensors must be of type float8_e4m3fn.");
|
||||||
|
|
||||||
TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
|
|
||||||
TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn);
|
|
||||||
|
|
||||||
using Cutlass3xGemmN8192 = typename sm90_fp8_config_N8192<
|
using Cutlass3xGemmN8192 = typename sm90_fp8_config_N8192<
|
||||||
InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
|
InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
|
||||||
using Cutlass3xGemmK8192 = typename sm90_fp8_config_K8192<
|
using Cutlass3xGemmK8192 = typename sm90_fp8_config_K8192<
|
||||||
@ -47,13 +47,12 @@ __global__ void compute_problem_sizes(const int32_t* __restrict__ topk_ids,
|
|||||||
|
|
||||||
__global__ void compute_expert_offsets(
|
__global__ void compute_expert_offsets(
|
||||||
const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets,
|
const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets,
|
||||||
int32_t* atomic_buffer, const int num_experts, const int topk_length) {
|
int32_t* atomic_buffer, const int num_experts, const bool swap_ab) {
|
||||||
int32_t tot_offset = 0;
|
int32_t tot_offset = 0;
|
||||||
expert_offsets[0] = 0;
|
expert_offsets[0] = 0;
|
||||||
for (int i = 0; i < num_experts; ++i) {
|
for (int i = 0; i < num_experts; ++i) {
|
||||||
atomic_buffer[i] = tot_offset;
|
atomic_buffer[i] = tot_offset;
|
||||||
tot_offset += topk_length > SWAP_AB_THRESHOLD ? problem_sizes1[i * 3]
|
tot_offset += swap_ab ? problem_sizes1[i * 3 + 1] : problem_sizes1[i * 3];
|
||||||
: problem_sizes1[i * 3 + 1];
|
|
||||||
expert_offsets[i + 1] = tot_offset;
|
expert_offsets[i + 1] = tot_offset;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -61,15 +60,14 @@ __global__ void compute_expert_offsets(
|
|||||||
__global__ void compute_expert_blockscale_offsets(
|
__global__ void compute_expert_blockscale_offsets(
|
||||||
const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets,
|
const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets,
|
||||||
int32_t* blockscale_offsets, int32_t* atomic_buffer, const int num_experts,
|
int32_t* blockscale_offsets, int32_t* atomic_buffer, const int num_experts,
|
||||||
const int topk_length) {
|
const bool swap_ab) {
|
||||||
int32_t tot_offset = 0;
|
int32_t tot_offset = 0;
|
||||||
int32_t tot_offset_round = 0;
|
int32_t tot_offset_round = 0;
|
||||||
expert_offsets[0] = 0;
|
expert_offsets[0] = 0;
|
||||||
blockscale_offsets[0] = 0;
|
blockscale_offsets[0] = 0;
|
||||||
for (int i = 0; i < num_experts; ++i) {
|
for (int i = 0; i < num_experts; ++i) {
|
||||||
int32_t cur_offset = topk_length > SWAP_AB_THRESHOLD
|
int32_t cur_offset =
|
||||||
? problem_sizes1[i * 3]
|
swap_ab ? problem_sizes1[i * 3 + 1] : problem_sizes1[i * 3];
|
||||||
: problem_sizes1[i * 3 + 1];
|
|
||||||
atomic_buffer[i] = tot_offset;
|
atomic_buffer[i] = tot_offset;
|
||||||
tot_offset += cur_offset;
|
tot_offset += cur_offset;
|
||||||
expert_offsets[i + 1] = tot_offset;
|
expert_offsets[i + 1] = tot_offset;
|
||||||
@ -119,15 +117,19 @@ void get_cutlass_moe_mm_data_caller(
|
|||||||
|
|
||||||
int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
|
int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
|
||||||
|
|
||||||
if (topk_ids.numel() > SWAP_AB_THRESHOLD) {
|
// Swap-AB should be disabled for FP4 path
|
||||||
compute_problem_sizes<false><<<num_experts, num_threads, 0, stream>>>(
|
bool may_swap_ab = (!blockscale_offsets.has_value()) &&
|
||||||
|
(topk_ids.numel() <= SWAP_AB_THRESHOLD);
|
||||||
|
|
||||||
|
if (may_swap_ab) {
|
||||||
|
compute_problem_sizes<true><<<num_experts, num_threads, 0, stream>>>(
|
||||||
static_cast<const int32_t*>(topk_ids.data_ptr()),
|
static_cast<const int32_t*>(topk_ids.data_ptr()),
|
||||||
static_cast<int32_t*>(problem_sizes1.data_ptr()),
|
static_cast<int32_t*>(problem_sizes1.data_ptr()),
|
||||||
static_cast<int32_t*>(problem_sizes2.data_ptr()),
|
static_cast<int32_t*>(problem_sizes2.data_ptr()),
|
||||||
static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n,
|
static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n,
|
||||||
k);
|
k);
|
||||||
} else {
|
} else {
|
||||||
compute_problem_sizes<true><<<num_experts, num_threads, 0, stream>>>(
|
compute_problem_sizes<false><<<num_experts, num_threads, 0, stream>>>(
|
||||||
static_cast<const int32_t*>(topk_ids.data_ptr()),
|
static_cast<const int32_t*>(topk_ids.data_ptr()),
|
||||||
static_cast<int32_t*>(problem_sizes1.data_ptr()),
|
static_cast<int32_t*>(problem_sizes1.data_ptr()),
|
||||||
static_cast<int32_t*>(problem_sizes2.data_ptr()),
|
static_cast<int32_t*>(problem_sizes2.data_ptr()),
|
||||||
@ -136,18 +138,19 @@ void get_cutlass_moe_mm_data_caller(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (blockscale_offsets.has_value()) {
|
if (blockscale_offsets.has_value()) {
|
||||||
|
// fp4 path
|
||||||
compute_expert_blockscale_offsets<<<1, 1, 0, stream>>>(
|
compute_expert_blockscale_offsets<<<1, 1, 0, stream>>>(
|
||||||
static_cast<const int32_t*>(problem_sizes1.data_ptr()),
|
static_cast<const int32_t*>(problem_sizes1.data_ptr()),
|
||||||
static_cast<int32_t*>(expert_offsets.data_ptr()),
|
static_cast<int32_t*>(expert_offsets.data_ptr()),
|
||||||
static_cast<int32_t*>(blockscale_offsets.value().data_ptr()),
|
static_cast<int32_t*>(blockscale_offsets.value().data_ptr()),
|
||||||
static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts,
|
static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts,
|
||||||
topk_ids.numel());
|
may_swap_ab);
|
||||||
} else {
|
} else {
|
||||||
compute_expert_offsets<<<1, 1, 0, stream>>>(
|
compute_expert_offsets<<<1, 1, 0, stream>>>(
|
||||||
static_cast<const int32_t*>(problem_sizes1.data_ptr()),
|
static_cast<const int32_t*>(problem_sizes1.data_ptr()),
|
||||||
static_cast<int32_t*>(expert_offsets.data_ptr()),
|
static_cast<int32_t*>(expert_offsets.data_ptr()),
|
||||||
static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts,
|
static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts,
|
||||||
topk_ids.numel());
|
may_swap_ab);
|
||||||
}
|
}
|
||||||
compute_arg_sorts<<<num_experts, num_threads, 0, stream>>>(
|
compute_arg_sorts<<<num_experts, num_threads, 0, stream>>>(
|
||||||
static_cast<const int32_t*>(topk_ids.data_ptr()),
|
static_cast<const int32_t*>(topk_ids.data_ptr()),
|
||||||
@ -190,4 +193,4 @@ void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
|
|||||||
static_cast<int32_t*>(problem_sizes2.data_ptr()),
|
static_cast<int32_t*>(problem_sizes2.data_ptr()),
|
||||||
static_cast<const int32_t*>(expert_num_tokens.data_ptr()), padded_m, n,
|
static_cast<const int32_t*>(expert_num_tokens.data_ptr()), padded_m, n,
|
||||||
k);
|
k);
|
||||||
}
|
}
|
||||||
@ -41,6 +41,16 @@ void cutlass_moe_mm_sm90(
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
|
||||||
|
void cutlass_moe_mm_sm100(
|
||||||
|
torch::Tensor& out_tensors, torch::Tensor const& a_tensors,
|
||||||
|
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
|
||||||
|
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
|
||||||
|
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
|
||||||
|
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
|
||||||
|
bool per_act_token, bool per_out_ch);
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined ENABLE_SCALED_MM_SM120 && ENABLE_SCALED_MM_SM120
|
#if defined ENABLE_SCALED_MM_SM120 && ENABLE_SCALED_MM_SM120
|
||||||
void cutlass_scaled_mm_sm120(torch::Tensor& c, torch::Tensor const& a,
|
void cutlass_scaled_mm_sm120(torch::Tensor& c, torch::Tensor const& a,
|
||||||
torch::Tensor const& b,
|
torch::Tensor const& b,
|
||||||
@ -130,10 +140,10 @@ bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability) {
|
|||||||
// and at least SM90 (Hopper)
|
// and at least SM90 (Hopper)
|
||||||
|
|
||||||
#if defined CUDA_VERSION
|
#if defined CUDA_VERSION
|
||||||
if (cuda_device_capability >= 90 && cuda_device_capability < 100) {
|
if (cuda_device_capability >= 100) {
|
||||||
return CUDA_VERSION >= 12000;
|
|
||||||
} else if (cuda_device_capability >= 100) {
|
|
||||||
return CUDA_VERSION >= 12080;
|
return CUDA_VERSION >= 12080;
|
||||||
|
} else if (cuda_device_capability >= 90) {
|
||||||
|
return CUDA_VERSION >= 12000;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -141,11 +151,14 @@ bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool cutlass_group_gemm_supported(int64_t cuda_device_capability) {
|
bool cutlass_group_gemm_supported(int64_t cuda_device_capability) {
|
||||||
// CUTLASS grouped FP8 kernels need at least CUDA 12.3
|
// CUTLASS grouped FP8 kernels need at least CUDA 12.3 and SM90 (Hopper)
|
||||||
// and SM90 (Hopper)
|
// or CUDA 12.8 and SM100 (Blackwell)
|
||||||
|
|
||||||
#if defined CUDA_VERSION
|
#if defined CUDA_VERSION
|
||||||
if (cuda_device_capability == 90) {
|
if (cuda_device_capability >= 100) {
|
||||||
|
return CUDA_VERSION >= 12080;
|
||||||
|
}
|
||||||
|
if (cuda_device_capability >= 90) {
|
||||||
return CUDA_VERSION >= 12030;
|
return CUDA_VERSION >= 12030;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -234,16 +247,26 @@ void cutlass_moe_mm(
|
|||||||
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
|
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
|
||||||
bool per_act_token, bool per_out_ch) {
|
bool per_act_token, bool per_out_ch) {
|
||||||
int32_t version_num = get_sm_version_num();
|
int32_t version_num = get_sm_version_num();
|
||||||
|
#if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
|
||||||
|
if (version_num >= 100) {
|
||||||
|
cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
|
||||||
|
expert_offsets, problem_sizes, a_strides, b_strides,
|
||||||
|
c_strides, per_act_token, per_out_ch);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
|
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
|
||||||
cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
|
if (version_num >= 90) {
|
||||||
expert_offsets, problem_sizes, a_strides, b_strides,
|
cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
|
||||||
c_strides, per_act_token, per_out_ch);
|
expert_offsets, problem_sizes, a_strides, b_strides,
|
||||||
return;
|
c_strides, per_act_token, per_out_ch);
|
||||||
|
return;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||||
false,
|
false,
|
||||||
"No compiled cutlass_scaled_mm for CUDA device capability: ", version_num,
|
"No compiled cutlass_scaled_mm for CUDA device capability: ", version_num,
|
||||||
". Required capability: 90");
|
". Required capability: 90 or 100");
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_cutlass_moe_mm_data(
|
void get_cutlass_moe_mm_data(
|
||||||
|
|||||||
@ -88,6 +88,8 @@ void static_scaled_fp8_quant(torch::Tensor& out, // [..., d]
|
|||||||
torch::Tensor const& input, // [..., d]
|
torch::Tensor const& input, // [..., d]
|
||||||
torch::Tensor const& scale) // [1]
|
torch::Tensor const& scale) // [1]
|
||||||
{
|
{
|
||||||
|
TORCH_CHECK(input.is_contiguous());
|
||||||
|
TORCH_CHECK(out.is_contiguous());
|
||||||
int const block_size = 256;
|
int const block_size = 256;
|
||||||
int const num_tokens = input.numel() / input.size(-1);
|
int const num_tokens = input.numel() / input.size(-1);
|
||||||
int const num_elems = input.numel();
|
int const num_elems = input.numel();
|
||||||
@ -111,6 +113,8 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, // [..., d]
|
|||||||
torch::Tensor const& input, // [..., d]
|
torch::Tensor const& input, // [..., d]
|
||||||
torch::Tensor& scale) // [1]
|
torch::Tensor& scale) // [1]
|
||||||
{
|
{
|
||||||
|
TORCH_CHECK(input.is_contiguous());
|
||||||
|
TORCH_CHECK(out.is_contiguous());
|
||||||
int const block_size = 256;
|
int const block_size = 256;
|
||||||
int const num_tokens = input.numel() / input.size(-1);
|
int const num_tokens = input.numel() / input.size(-1);
|
||||||
int const num_elems = input.numel();
|
int const num_elems = input.numel();
|
||||||
|
|||||||
215
csrc/quantization/fp8/per_token_group_quant.cu
Normal file
@ -0,0 +1,215 @@
|
|||||||
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
|
|
||||||
|
#include "../per_token_group_quant_8bit.h"
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
|
||||||
|
#include <cuda_fp8.h>
|
||||||
|
|
||||||
|
#include <torch/all.h>
|
||||||
|
|
||||||
|
#include "../vectorization.cuh"
|
||||||
|
#include "../vectorization_utils.cuh"
|
||||||
|
#include "../../dispatch_utils.h"
|
||||||
|
|
||||||
|
// Max-reduces `val` across the 16 threads that cooperate on one quantization
// group and returns the group-wide maximum to every one of those threads.
//
// The reduction is a 4-step XOR butterfly (offsets 8, 4, 2, 1), so it only
// ever exchanges values between lanes of the same aligned 16-lane half of a
// warp.
//
// `tid` is not needed by the reduction; it is kept so existing call sites
// continue to compile unchanged.
//
// NOTE: the lane position is derived from threadIdx.x, i.e. this assumes the
// kernel is launched with a 1-D thread block.
__device__ __forceinline__ float GroupReduceMax(float val, const int tid) {
  (void)tid;

  // Two 16-thread groups share one 32-lane warp. Every thread that calls
  // __shfl_xor_sync must have its own bit set in the mask, so the group living
  // in lanes 16..31 has to name the upper half of the warp. A fixed 0xffff
  // would leave those threads outside their own mask, which is undefined
  // behaviour.
  const unsigned mask = (threadIdx.x % 32 >= 16) ? 0xffff0000u : 0x0000ffffu;

  val = fmaxf(val, __shfl_xor_sync(mask, val, 8));
  val = fmaxf(val, __shfl_xor_sync(mask, val, 4));
  val = fmaxf(val, __shfl_xor_sync(mask, val, 2));
  val = fmaxf(val, __shfl_xor_sync(mask, val, 1));
  return val;
}
|
||||||
|
|
||||||
|
// Quantizes consecutive groups of `group_size` elements of `input` to an
// 8-bit destination type and writes one float scale per group.
//
// Work layout, as established by the indexing below:
//   * 16 threads cooperate on one group;
//   * a block processes `groups_per_block` consecutive groups;
//   * group g starts at element g * group_size of both `input` and
//     `output_q`.
//
// Per group:
//   1. Copy the group into dynamic shared memory while tracking max(|x|),
//      seeded with `eps`.
//   2. Combine the per-thread maxima with GroupReduceMax.
//   3. scale = absmax / max_8bit; with SCALE_UE8M0 the scale is rounded up to
//      a power of two (after flooring its magnitude at 1e-10).
//   4. Lane 0 stores the scale; every lane then writes
//      clamp(x / scale, min_8bit, max_8bit) converted to DST_DTYPE.
//
// Template parameters:
//   T               element type of `input`
//   DST_DTYPE       element type written through `output_q`
//   IS_COLUMN_MAJOR selects the strided scale addressing that uses
//                   `scale_num_rows` / `scale_stride`; otherwise the scale of
//                   group g is stored at output_s[g]
//   SCALE_UE8M0     round the scale up to a power of two
//   scale_packed_t  storage type of `output_s`; its size must be a multiple
//                   of sizeof(float)
//
// The launch has to provide at least
// groups_per_block * group_size * sizeof(T) bytes of dynamic shared memory.
// `num_groups` is accepted but not read by the kernel.
template <typename T, typename DST_DTYPE, bool IS_COLUMN_MAJOR = false,
          bool SCALE_UE8M0 = false, typename scale_packed_t = float>
__global__ void per_token_group_quant_8bit_kernel(
    const T* __restrict__ input, void* __restrict__ output_q,
    scale_packed_t* __restrict__ output_s, const int group_size,
    const int num_groups, const int groups_per_block, const float eps,
    const float min_8bit, const float max_8bit, const int scale_num_rows = 0,
    const int scale_stride = 0) {
  using scale_element_t = float;
  static_assert(sizeof(scale_packed_t) % sizeof(scale_element_t) == 0);

  constexpr int kGroupThreads = 16;
  constexpr int kVecWidth = 16 / sizeof(T);

  // Position of this thread: which group of the block, and which lane in it.
  const int lane = threadIdx.x % kGroupThreads;
  const int64_t group_in_block = threadIdx.x / kGroupThreads;

  // Position of the group in the whole problem and of its first element.
  const int64_t first_group_of_block = blockIdx.x * groups_per_block;
  const int64_t group_idx = first_group_of_block + group_in_block;
  const int64_t elem_offset = group_idx * group_size;

  const T* src_group = input + elem_offset;
  DST_DTYPE* dst_group = static_cast<DST_DTYPE*>(output_q) + elem_offset;

  // Locate the slot that receives this group's scale.
  scale_element_t* scale_slot;
  if constexpr (!IS_COLUMN_MAJOR) {
    scale_slot = output_s + group_idx;
  } else {
    // Number of float scales stored in one scale_packed_t element.
    const int pack =
        static_cast<int>(sizeof(scale_packed_t) / sizeof(scale_element_t));
    const int rows_in_elems = scale_num_rows * pack;
    const int row = group_idx / rows_in_elems;
    const int col_raw = group_idx % rows_in_elems;
    const int col = col_raw / pack;
    const int slot_in_pack = col_raw % pack;
    const int scale_index =
        col * scale_stride * pack + row * pack + slot_in_pack;
    scale_slot = reinterpret_cast<scale_element_t*>(output_s) + scale_index;
  }

  // Dynamic shared memory holds one staged copy of every group in the block
  // so the quantization pass does not have to re-read global memory.
  extern __shared__ __align__(16) char smem_raw[];
  T* staged_group =
      reinterpret_cast<T*>(smem_raw) + group_in_block * group_size;

  // Pass 1: global -> shared, tracking the largest magnitude seen.
  float group_absmax = eps;
  auto stage_and_track_max = [&] __device__(T & dst, const T& src) {
    group_absmax = fmaxf(group_absmax, fabsf(static_cast<float>(src)));
    dst = src;
  };
  vllm::vectorize_with_alignment<kVecWidth>(src_group,      // in
                                            staged_group,   // out (shared)
                                            group_size,     // elements
                                            lane,           // thread id
                                            kGroupThreads,  // stride
                                            stage_and_track_max);

  group_absmax = GroupReduceMax(group_absmax, lane);

  // Derive the scale shared by every element of the group.
  float scale = group_absmax / max_8bit;
  if constexpr (SCALE_UE8M0) {
    const float magnitude = fmaxf(fabsf(scale), 1e-10f);
    scale = exp2f(ceilf(log2f(magnitude)));
  }

  // A single lane per group publishes the scale.
  if (lane == 0) {
    *scale_slot = static_cast<scale_element_t>(scale);
  }

  __syncthreads();

  // Pass 2: shared -> global, scaling and clamping into the 8-bit range.
  auto scale_and_clamp = [&] __device__(DST_DTYPE & dst, const T& src) {
    const float scaled = static_cast<float>(src) / scale;
    const float clamped = fminf(fmaxf(scaled, min_8bit), max_8bit);
    dst = DST_DTYPE(clamped);
  };
  vllm::vectorize_with_alignment<kVecWidth>(staged_group,   // in (shared)
                                            dst_group,      // out (global)
                                            group_size,     // elements
                                            lane,           // thread id
                                            kGroupThreads,  // stride
                                            scale_and_clamp);
}
|
||||||
|
|
||||||
|
// Host launcher for per_token_group_quant_8bit_kernel.
//
// Splits `input` into consecutive groups of `group_size` elements, writes the
// quantized values to `output_q` and one scale per group to `output_s`.
//
// Parameters:
//   input       contiguous tensor; numel() must be a multiple of group_size
//   output_q    contiguous tensor of dtype Float8_e4m3fn or Char (int8)
//   output_s    2-D scale tensor; its buffer is handed to the kernel as
//               float*. If stride(0) < stride(1) the column-major kernel
//               variant is selected and receives size(1) / stride(1).
//   group_size  number of elements per quantization group, must be > 0
//   eps         lower bound used to seed each group's absolute maximum
//   min_8bit /
//   max_8bit    clamp range of the destination type
//   scale_ue8m0 round every scale up to a power of two
//
// Raises (via TORCH_CHECK) when a tensor is not contiguous, group_size is not
// positive or does not divide input.numel(), output_s is not 2-D, or the dtype
// of output_q is unsupported. Empty inputs return without launching a kernel.
void per_token_group_quant_8bit(const torch::Tensor& input,
                                torch::Tensor& output_q,
                                torch::Tensor& output_s, int64_t group_size,
                                double eps, double min_8bit, double max_8bit,
                                bool scale_ue8m0) {
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(output_q.is_contiguous());

  // Validate group_size before it is used as a divisor.
  TORCH_CHECK(group_size > 0, "group_size must be positive, got ", group_size);
  TORCH_CHECK(input.numel() % group_size == 0);
  TORCH_CHECK(output_s.dim() == 2);

  // Reject unsupported destination types up front; otherwise the dispatch
  // below would launch nothing and leave output_q / output_s unwritten.
  auto dst_type = output_q.scalar_type();
  TORCH_CHECK(dst_type == at::ScalarType::Float8_e4m3fn ||
                  dst_type == at::ScalarType::Char,
              "per_token_group_quant_8bit: output_q must be Float8_e4m3fn or "
              "Char (int8), got ",
              dst_type);

  const int num_groups = input.numel() / group_size;

  // Nothing to quantize; a grid with zero blocks is not a valid launch.
  if (num_groups == 0) {
    return;
  }

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  constexpr int THREADS_PER_GROUP = 16;

  // Pack as many groups per block as evenly divide num_groups (at most 16),
  // so every block handles the same number of groups.
  int groups_per_block = 1;

  if (num_groups % 16 == 0) {
    groups_per_block = 16;
  } else if (num_groups % 8 == 0) {
    groups_per_block = 8;
  } else if (num_groups % 4 == 0) {
    groups_per_block = 4;
  } else if (num_groups % 2 == 0) {
    groups_per_block = 2;
  }

  const int num_blocks = num_groups / groups_per_block;
  const int num_threads = groups_per_block * THREADS_PER_GROUP;

  const bool is_column_major = output_s.stride(0) < output_s.stride(1);
  const int scale_num_rows = output_s.size(1);
  const int scale_stride = output_s.stride(1);

// Launches the kernel instantiation selected by (is_column_major,
// scale_ue8m0). Dynamic shared memory is sized to hold one staged copy of
// every group processed by a block.
#define LAUNCH_KERNEL(T, DST_DTYPE)                                        \
  do {                                                                     \
    dim3 grid(num_blocks);                                                 \
    dim3 block(num_threads);                                               \
    size_t smem_bytes =                                                    \
        static_cast<size_t>(groups_per_block) * group_size * sizeof(T);    \
    if (is_column_major) {                                                 \
      if (scale_ue8m0) {                                                   \
        per_token_group_quant_8bit_kernel<T, DST_DTYPE, true, true>        \
            <<<grid, block, smem_bytes, stream>>>(                         \
                static_cast<T*>(input.data_ptr()), output_q.data_ptr(),    \
                static_cast<float*>(output_s.data_ptr()), group_size,      \
                num_groups, groups_per_block, (float)eps, (float)min_8bit, \
                (float)max_8bit, scale_num_rows, scale_stride);            \
      } else {                                                             \
        per_token_group_quant_8bit_kernel<T, DST_DTYPE, true, false>       \
            <<<grid, block, smem_bytes, stream>>>(                         \
                static_cast<T*>(input.data_ptr()), output_q.data_ptr(),    \
                static_cast<float*>(output_s.data_ptr()), group_size,      \
                num_groups, groups_per_block, (float)eps, (float)min_8bit, \
                (float)max_8bit, scale_num_rows, scale_stride);            \
      }                                                                    \
    } else {                                                               \
      if (scale_ue8m0) {                                                   \
        per_token_group_quant_8bit_kernel<T, DST_DTYPE, false, true>       \
            <<<grid, block, smem_bytes, stream>>>(                         \
                static_cast<T*>(input.data_ptr()), output_q.data_ptr(),    \
                static_cast<float*>(output_s.data_ptr()), group_size,      \
                num_groups, groups_per_block, (float)eps, (float)min_8bit, \
                (float)max_8bit);                                          \
      } else {                                                             \
        per_token_group_quant_8bit_kernel<T, DST_DTYPE, false, false>      \
            <<<grid, block, smem_bytes, stream>>>(                         \
                static_cast<T*>(input.data_ptr()), output_q.data_ptr(),    \
                static_cast<float*>(output_s.data_ptr()), group_size,      \
                num_groups, groups_per_block, (float)eps, (float)min_8bit, \
                (float)max_8bit);                                          \
      }                                                                    \
    }                                                                      \
  } while (0)

  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "per_token_group_quant_8bit", ([&] {
        if (dst_type == at::ScalarType::Float8_e4m3fn) {
          LAUNCH_KERNEL(scalar_t, __nv_fp8_e4m3);
        } else if (dst_type == at::ScalarType::Char) {
          LAUNCH_KERNEL(scalar_t, int8_t);
        }
      }));

#undef LAUNCH_KERNEL
}
|
||||||
|
|
||||||
|
// FP8 entry point for per-token-group quantization.
//
// Forwards every argument unchanged to per_token_group_quant_8bit, passing
// `fp8_min` / `fp8_max` as the clamp range of the destination type.
void per_token_group_quant_fp8(const torch::Tensor& input,
                               torch::Tensor& output_q, torch::Tensor& output_s,
                               int64_t group_size, double eps, double fp8_min,
                               double fp8_max, bool scale_ue8m0) {
  return per_token_group_quant_8bit(input,
                                    output_q,
                                    output_s,
                                    group_size,
                                    eps,
                                    /*min_8bit=*/fp8_min,
                                    /*max_8bit=*/fp8_max,
                                    scale_ue8m0);
}
|
||||||
@ -4,7 +4,7 @@
|
|||||||
#include <torch/all.h>
|
#include <torch/all.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
|
||||||
#include "cuda_compat.h"
|
#include "../../cuda_compat.h"
|
||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
|
|
||||||
#include "ggml-common.h"
|
#include "ggml-common.h"
|
||||||
|
|||||||
@ -187,8 +187,12 @@ struct PrepackedLayoutBTemplate {
|
|||||||
CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy(
|
CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy(
|
||||||
Shape_NKL shape_mkl) {
|
Shape_NKL shape_mkl) {
|
||||||
auto layout = TVbNbKL_to_offset(shape_mkl);
|
auto layout = TVbNbKL_to_offset(shape_mkl);
|
||||||
return make_layout(coalesce(get<0>(layout)), get<1>(layout),
|
// for 4-bit elements, having >= 64 values per column
|
||||||
get<2>(layout));
|
// allows TMA to load full 32-byte sectors
|
||||||
|
auto inner_layout =
|
||||||
|
make_layout(make_shape(_256{}, size<0>(layout) / _256{}));
|
||||||
|
|
||||||
|
return make_layout(inner_layout, get<1>(layout), get<2>(layout));
|
||||||
}
|
}
|
||||||
|
|
||||||
// ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx)
|
// ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx)
|
||||||
|
|||||||
10
csrc/quantization/per_token_group_quant_8bit.h
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
#pragma once
|
||||||
|
#include <torch/all.h>
|
||||||
|
|
||||||
|
// TODO(wentao): refactor the folder to 8bit, then includes fp8 and int8 folders
|
||||||
|
// 8-bit per-token-group quantization helper used by both FP8 and INT8
|
||||||
|
void per_token_group_quant_8bit(const torch::Tensor& input,
|
||||||
|
torch::Tensor& output_q,
|
||||||
|
torch::Tensor& output_s, int64_t group_size,
|
||||||
|
double eps, double min_8bit, double max_8bit,
|
||||||
|
bool scale_ue8m0 = false);
|
||||||
@ -19,7 +19,7 @@
|
|||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
#include <hip/hip_fp8.h>
|
#include <hip/hip_fp8.h>
|
||||||
#include <hip/hip_bf16.h>
|
#include <hip/hip_bf16.h>
|
||||||
#include "cuda_compat.h"
|
#include "../cuda_compat.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include "../attention/dtype_fp8.cuh"
|
#include "../attention/dtype_fp8.cuh"
|
||||||
|
|||||||
@ -9,7 +9,7 @@
|
|||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
|
||||||
#include "cuda_compat.h"
|
#include "../cuda_compat.h"
|
||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
#include "quantization/fp8/common.cuh"
|
#include "quantization/fp8/common.cuh"
|
||||||
|
|
||||||
|
|||||||
@ -615,6 +615,23 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
|
ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
|
// Compute per-token-group FP8 quantized tensor and scaling factor.
|
||||||
|
ops.def(
|
||||||
|
"per_token_group_fp8_quant(Tensor input, Tensor! output_q, Tensor! "
|
||||||
|
"output_s, "
|
||||||
|
"int group_size, float eps, float fp8_min, float fp8_max, bool "
|
||||||
|
"scale_ue8m0) -> ()");
|
||||||
|
ops.impl("per_token_group_fp8_quant", torch::kCUDA,
|
||||||
|
&per_token_group_quant_fp8);
|
||||||
|
|
||||||
|
// Compute per-token-group INT8 quantized tensor and scaling factor.
|
||||||
|
ops.def(
|
||||||
|
"per_token_group_quant_int8(Tensor input, Tensor! output_q, Tensor! "
|
||||||
|
"output_s, int group_size, float eps, float int8_min, float int8_max) -> "
|
||||||
|
"()");
|
||||||
|
ops.impl("per_token_group_quant_int8", torch::kCUDA,
|
||||||
|
&per_token_group_quant_int8);
|
||||||
|
|
||||||
// reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel
|
// reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel
|
||||||
ops.def(
|
ops.def(
|
||||||
"rearrange_kn_weight_as_n32k16_order(Tensor b_qweight, Tensor b_scales, "
|
"rearrange_kn_weight_as_n32k16_order(Tensor b_qweight, Tensor b_scales, "
|
||||||
|
|||||||
@ -164,9 +164,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
# see https://github.com/pytorch/pytorch/pull/123243
|
# see https://github.com/pytorch/pytorch/pull/123243
|
||||||
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
|
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
|
||||||
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
||||||
# Override the arch list for flash-attn to reduce the binary size
|
|
||||||
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
|
|
||||||
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
|
|
||||||
#################### BASE BUILD IMAGE ####################
|
#################### BASE BUILD IMAGE ####################
|
||||||
|
|
||||||
#################### WHEEL BUILD IMAGE ####################
|
#################### WHEEL BUILD IMAGE ####################
|
||||||
@ -209,16 +206,7 @@ ARG SCCACHE_REGION_NAME=us-west-2
|
|||||||
ARG SCCACHE_S3_NO_CREDENTIALS=0
|
ARG SCCACHE_S3_NO_CREDENTIALS=0
|
||||||
|
|
||||||
# Flag to control whether to use pre-built vLLM wheels
|
# Flag to control whether to use pre-built vLLM wheels
|
||||||
ARG VLLM_USE_PRECOMPILED
|
ARG VLLM_USE_PRECOMPILED=""
|
||||||
# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed
|
|
||||||
ENV VLLM_USE_PRECOMPILED=""
|
|
||||||
RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \
|
|
||||||
export VLLM_USE_PRECOMPILED=1 && \
|
|
||||||
echo "Using precompiled wheels"; \
|
|
||||||
else \
|
|
||||||
unset VLLM_USE_PRECOMPILED && \
|
|
||||||
echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
# if USE_SCCACHE is set, use sccache to speed up compilation
|
# if USE_SCCACHE is set, use sccache to speed up compilation
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
@ -235,6 +223,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
|
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
|
||||||
&& export SCCACHE_IDLE_TIMEOUT=0 \
|
&& export SCCACHE_IDLE_TIMEOUT=0 \
|
||||||
&& export CMAKE_BUILD_TYPE=Release \
|
&& export CMAKE_BUILD_TYPE=Release \
|
||||||
|
&& export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
|
||||||
|
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
|
||||||
&& sccache --show-stats \
|
&& sccache --show-stats \
|
||||||
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
|
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
|
||||||
&& sccache --show-stats; \
|
&& sccache --show-stats; \
|
||||||
@ -248,9 +238,22 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
|
|||||||
# Clean any existing CMake artifacts
|
# Clean any existing CMake artifacts
|
||||||
rm -rf .deps && \
|
rm -rf .deps && \
|
||||||
mkdir -p .deps && \
|
mkdir -p .deps && \
|
||||||
|
export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
|
||||||
|
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
|
||||||
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
|
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# When using precompiled wheels, keep only the newest manylinux1 wheel and delete others
|
||||||
|
RUN if [ "$VLLM_USE_PRECOMPILED" = "1" ]; then \
|
||||||
|
echo "Cleaning up extra wheels in dist/..." && \
|
||||||
|
# Identify the most recent manylinux1_x86_64 wheel
|
||||||
|
KEEP_WHEEL=$(ls -t dist/*manylinux1_x86_64.whl 2>/dev/null | head -n1) && \
|
||||||
|
if [ -n "$KEEP_WHEEL" ]; then \
|
||||||
|
echo "Keeping wheel: $KEEP_WHEEL"; \
|
||||||
|
find dist/ -type f -name "*.whl" ! -path "${KEEP_WHEEL}" -delete; \
|
||||||
|
fi; \
|
||||||
|
fi
|
||||||
|
|
||||||
# Check the size of the wheel if RUN_WHEEL_CHECK is true
|
# Check the size of the wheel if RUN_WHEEL_CHECK is true
|
||||||
COPY .buildkite/check-wheel-size.py check-wheel-size.py
|
COPY .buildkite/check-wheel-size.py check-wheel-size.py
|
||||||
# sync the default value with .buildkite/check-wheel-size.py
|
# sync the default value with .buildkite/check-wheel-size.py
|
||||||
@ -265,7 +268,7 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
|
|||||||
#################### EXTENSION Build IMAGE ####################
|
#################### EXTENSION Build IMAGE ####################
|
||||||
|
|
||||||
#################### DEV IMAGE ####################
|
#################### DEV IMAGE ####################
|
||||||
FROM base as dev
|
FROM base AS dev
|
||||||
|
|
||||||
ARG PIP_INDEX_URL UV_INDEX_URL
|
ARG PIP_INDEX_URL UV_INDEX_URL
|
||||||
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
||||||
@ -276,10 +279,6 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
|
|||||||
ENV UV_HTTP_TIMEOUT=500
|
ENV UV_HTTP_TIMEOUT=500
|
||||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||||
|
|
||||||
# Workaround for #17068
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
|
||||||
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
|
|
||||||
|
|
||||||
COPY requirements/lint.txt requirements/lint.txt
|
COPY requirements/lint.txt requirements/lint.txt
|
||||||
COPY requirements/test.txt requirements/test.txt
|
COPY requirements/test.txt requirements/test.txt
|
||||||
COPY requirements/dev.txt requirements/dev.txt
|
COPY requirements/dev.txt requirements/dev.txt
|
||||||
@ -371,6 +370,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# Install vllm wheel first, so that torch etc will be installed.
|
# Install vllm wheel first, so that torch etc will be installed.
|
||||||
|
# !bang
|
||||||
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
||||||
--mount=type=cache,target=/root/.cache/uv \
|
--mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --system dist/*.whl --verbose \
|
uv pip install --system dist/*.whl --verbose \
|
||||||
@ -390,7 +390,9 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
|
|||||||
|
|
||||||
# Install FlashInfer from source
|
# Install FlashInfer from source
|
||||||
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
|
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
|
||||||
ARG FLASHINFER_GIT_REF="v0.2.8rc1"
|
# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
|
||||||
|
# We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel.
|
||||||
|
ARG FLASHINFER_GIT_REF="v0.2.9rc2"
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
|
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
|
||||||
. /etc/environment
|
. /etc/environment
|
||||||
git clone --depth 1 --recursive --shallow-submodules \
|
git clone --depth 1 --recursive --shallow-submodules \
|
||||||
@ -412,7 +414,7 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
|
|||||||
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
||||||
python3 -m flashinfer.aot
|
python3 -m flashinfer.aot
|
||||||
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
||||||
uv pip install --system --no-build-isolation .
|
uv pip install --system --no-build-isolation --force-reinstall --no-deps .
|
||||||
popd
|
popd
|
||||||
rm -rf flashinfer
|
rm -rf flashinfer
|
||||||
BASH
|
BASH
|
||||||
@ -452,10 +454,6 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
|||||||
ENV UV_HTTP_TIMEOUT=500
|
ENV UV_HTTP_TIMEOUT=500
|
||||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||||
|
|
||||||
# Workaround for #17068
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
|
||||||
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
|
|
||||||
|
|
||||||
# install development dependencies (for testing)
|
# install development dependencies (for testing)
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
|
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
|
||||||
@ -510,7 +508,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
else \
|
else \
|
||||||
BITSANDBYTES_VERSION="0.46.1"; \
|
BITSANDBYTES_VERSION="0.46.1"; \
|
||||||
fi; \
|
fi; \
|
||||||
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]
|
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]
|
||||||
|
|
||||||
ENV VLLM_USAGE_SOURCE production-docker-image
|
ENV VLLM_USAGE_SOURCE production-docker-image
|
||||||
|
|
||||||
|
|||||||
@ -1,62 +0,0 @@
|
|||||||
# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform.
|
|
||||||
|
|
||||||
FROM ubuntu:22.04 AS cpu-test-arm
|
|
||||||
|
|
||||||
ENV CCACHE_DIR=/root/.cache/ccache
|
|
||||||
|
|
||||||
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/var/cache/apt \
|
|
||||||
apt-get update -y \
|
|
||||||
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
|
|
||||||
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
|
||||||
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
|
||||||
|
|
||||||
# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install py-cpuinfo # Use this to gather CPU info and optimize based on ARM Neoverse cores
|
|
||||||
|
|
||||||
# Set LD_PRELOAD for tcmalloc on ARM
|
|
||||||
ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"
|
|
||||||
|
|
||||||
RUN echo 'ulimit -c 0' >> ~/.bashrc
|
|
||||||
|
|
||||||
WORKDIR /workspace
|
|
||||||
|
|
||||||
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
|
|
||||||
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
|
|
||||||
pip install --upgrade pip && \
|
|
||||||
pip install -r requirements/build.txt
|
|
||||||
|
|
||||||
FROM cpu-test-arm AS build
|
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
|
|
||||||
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
|
|
||||||
pip install -v -r requirements/cpu.txt
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
ARG GIT_REPO_CHECK=0
|
|
||||||
RUN --mount=type=bind,source=.git,target=.git \
|
|
||||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
|
||||||
|
|
||||||
# Disabling AVX512 specific optimizations for ARM
|
|
||||||
ARG VLLM_CPU_DISABLE_AVX512="true"
|
|
||||||
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
--mount=type=cache,target=/root/.cache/ccache \
|
|
||||||
--mount=type=bind,source=.git,target=.git \
|
|
||||||
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
|
|
||||||
pip install dist/*.whl && \
|
|
||||||
rm -rf dist
|
|
||||||
|
|
||||||
WORKDIR /workspace/
|
|
||||||
|
|
||||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
|
||||||
|
|
||||||
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
|
||||||
@ -1,4 +1,11 @@
|
|||||||
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
|
# This vLLM Dockerfile is used to build images that can run vLLM on both x86_64 and arm64 CPU platforms.
|
||||||
|
#
|
||||||
|
# Supported platforms:
|
||||||
|
# - linux/amd64 (x86_64)
|
||||||
|
# - linux/arm64 (aarch64)
|
||||||
|
#
|
||||||
|
# Use the `--platform` option with `docker buildx build` to specify the target architecture, e.g.:
|
||||||
|
# docker buildx build --platform=linux/arm64 -f docker/Dockerfile.cpu .
|
||||||
#
|
#
|
||||||
# Build targets:
|
# Build targets:
|
||||||
# vllm-openai (default): used for serving deployment
|
# vllm-openai (default): used for serving deployment
|
||||||
@ -12,16 +19,14 @@
|
|||||||
# VLLM_CPU_AVX512VNNI=false (default)|true
|
# VLLM_CPU_AVX512VNNI=false (default)|true
|
||||||
#
|
#
|
||||||
|
|
||||||
######################### BASE IMAGE #########################
|
######################### COMMON BASE IMAGE #########################
|
||||||
FROM ubuntu:22.04 AS base
|
FROM ubuntu:22.04 AS base-common
|
||||||
|
|
||||||
WORKDIR /workspace/
|
WORKDIR /workspace/
|
||||||
|
|
||||||
ARG PYTHON_VERSION=3.12
|
ARG PYTHON_VERSION=3.12
|
||||||
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
|
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
|
||||||
|
|
||||||
ENV LD_PRELOAD=""
|
|
||||||
|
|
||||||
# Install minimal dependencies and uv
|
# Install minimal dependencies and uv
|
||||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||||
--mount=type=cache,target=/var/lib/apt,sharing=locked \
|
--mount=type=cache,target=/var/lib/apt,sharing=locked \
|
||||||
@ -53,7 +58,21 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
uv pip install --upgrade pip && \
|
uv pip install --upgrade pip && \
|
||||||
uv pip install -r requirements/cpu.txt
|
uv pip install -r requirements/cpu.txt
|
||||||
|
|
||||||
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD"
|
ARG TARGETARCH
|
||||||
|
ENV TARGETARCH=${TARGETARCH}
|
||||||
|
|
||||||
|
######################### x86_64 BASE IMAGE #########################
|
||||||
|
FROM base-common AS base-amd64
|
||||||
|
|
||||||
|
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so"
|
||||||
|
|
||||||
|
######################### arm64 BASE IMAGE #########################
|
||||||
|
FROM base-common AS base-arm64
|
||||||
|
|
||||||
|
ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"
|
||||||
|
|
||||||
|
######################### BASE IMAGE #########################
|
||||||
|
FROM base-${TARGETARCH} AS base
|
||||||
|
|
||||||
RUN echo 'ulimit -c 0' >> ~/.bashrc
|
RUN echo 'ulimit -c 0' >> ~/.bashrc
|
||||||
|
|
||||||
|
|||||||
@ -114,9 +114,6 @@ RUN cat torch_build_versions.txt
|
|||||||
# explicitly set the list to avoid issues with torch 2.2
|
# explicitly set the list to avoid issues with torch 2.2
|
||||||
# see https://github.com/pytorch/pytorch/pull/123243
|
# see https://github.com/pytorch/pytorch/pull/123243
|
||||||
|
|
||||||
# Override the arch list for flash-attn to reduce the binary size
|
|
||||||
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
|
|
||||||
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
|
|
||||||
#################### BASE BUILD IMAGE ####################
|
#################### BASE BUILD IMAGE ####################
|
||||||
|
|
||||||
#################### WHEEL BUILD IMAGE ####################
|
#################### WHEEL BUILD IMAGE ####################
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
ARG NIGHTLY_DATE="20250714"
|
ARG NIGHTLY_DATE="20250730"
|
||||||
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE"
|
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE"
|
||||||
|
|
||||||
FROM $BASE_IMAGE
|
FROM $BASE_IMAGE
|
||||||
|
|||||||
@ -47,7 +47,7 @@ FROM vllm-base AS vllm-openai
|
|||||||
|
|
||||||
# install additional dependencies for openai api server
|
# install additional dependencies for openai api server
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
pip install accelerate hf_transfer pytest 'modelscope!=1.15.0'
|
pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope
|
||||||
|
|
||||||
ENV VLLM_USAGE_SOURCE production-docker-image \
|
ENV VLLM_USAGE_SOURCE production-docker-image \
|
||||||
TRITON_XPU_PROFILE 1
|
TRITON_XPU_PROFILE 1
|
||||||
|
|||||||
@ -56,9 +56,7 @@ nav:
|
|||||||
- contributing/model/tests.md
|
- contributing/model/tests.md
|
||||||
- contributing/model/multimodal.md
|
- contributing/model/multimodal.md
|
||||||
- CI: contributing/ci
|
- CI: contributing/ci
|
||||||
- Design Documents:
|
- Design Documents: design
|
||||||
- V0: design
|
|
||||||
- V1: design/v1
|
|
||||||
- API Reference:
|
- API Reference:
|
||||||
- Summary: api/README.md
|
- Summary: api/README.md
|
||||||
- Contents:
|
- Contents:
|
||||||
|
|||||||
@ -14,7 +14,6 @@ API documentation for vLLM's configuration classes.
|
|||||||
- [vllm.config.DeviceConfig][]
|
- [vllm.config.DeviceConfig][]
|
||||||
- [vllm.config.SpeculativeConfig][]
|
- [vllm.config.SpeculativeConfig][]
|
||||||
- [vllm.config.LoRAConfig][]
|
- [vllm.config.LoRAConfig][]
|
||||||
- [vllm.config.PromptAdapterConfig][]
|
|
||||||
- [vllm.config.MultiModalConfig][]
|
- [vllm.config.MultiModalConfig][]
|
||||||
- [vllm.config.PoolerConfig][]
|
- [vllm.config.PoolerConfig][]
|
||||||
- [vllm.config.DecodingConfig][]
|
- [vllm.config.DecodingConfig][]
|
||||||
|
|||||||
|
After Width: | Height: | Size: 187 KiB |
|
After Width: | Height: | Size: 189 KiB |
|
After Width: | Height: | Size: 227 KiB |
|
After Width: | Height: | Size: 128 KiB |
|
Before Width: | Height: | Size: 185 KiB After Width: | Height: | Size: 185 KiB |
|
Before Width: | Height: | Size: 162 KiB After Width: | Height: | Size: 162 KiB |